Preface
Since I'm afraid the blog will blow up again, and I wanted to pick up some Python along the way, I decided to try writing a blog crawler. From now on I'll crawl the blog on a schedule to back it up.
Code
There really isn't much to write about... Python is very simple; a little tinkering gets it done.
The main idea is to download the HTML to local disk, then DFS along the links, checking whether each URL is on-site and handling decoding along the way.
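Boiled down, the idea looks roughly like this (a minimal sketch with made-up names, assuming the pages are UTF-8; the full script is below):

import re
import urllib.request

def crawl(url, site_prefix, seen):
    # Stay inside the site and skip pages we have already visited.
    if not url.startswith(site_prefix) or url in seen:
        return
    seen.add(url)
    html = urllib.request.urlopen(url, timeout=10).read().decode('utf-8', 'ignore')
    # Save the page under a flattened local file name.
    fname = url.replace('http://', '').replace('/', '.') + '.html'
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(html)
    # DFS: recurse into every href found on the page.
    for nexturl in re.findall(r'href="(.*?)"', html):
        crawl(nexturl, site_prefix, seen)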
Code:
import urllib.request
import urllib.error
import re

# File types that are saved with their own extension instead of '.html'.
suffix_allowed = ('.jpg', '.jpeg', '.png', '.ppt', '.doc', '.docx', '.pdf',
                  '.gif', '.wav', '.ico', '.mp3', '.mp4', '.txt', '.pptx',
                  '.bmp', '.tiff', '.ini', '.inf', '.log', '.html')

def checkUrl(url):
    # Skip URLs that carry a query string or fragment.
    return url.count('?') == 0 and url.count('#') == 0

def translateUrl(url):
    # Map a URL to a flat local file name.
    if url[0:7] == 'http://':
        url = url[7:]
    url = url.replace('/', '.')
    flag = True
    for suf in suffix_allowed:
        if url.endswith(suf):
            flag = False
            break
    if url.endswith('feed'):
        url += '.xml'
    elif flag:
        url += '.html'
    return url

def hashString(s):
    # Simple rolling hash used to deduplicate visited URLs.
    hashval = 0
    for ch in s:
        hashval = (hashval * 97 + ord(ch)) % 998442353
    return hashval

urlhead = str()
readed_urls = set()
cnt_pages = 0
cnt_saved_pages = 0

def getPage(url):
    global cnt_pages, cnt_saved_pages, readed_urls, urlhead
    cnt_pages += 1
    if not checkUrl(url):
        return
    if hashString(url) in readed_urls:
        return
    readed_urls.add(hashString(url))
    print('at url: "' + url + '"')
    if url[0:len(urlhead)] != urlhead:
        print('out of range, crawling cancelled')
        return
    try:
        connect = urllib.request.urlopen(url, None, 10000)
        htmldoc = connect.read().decode('utf-8', 'ignore')
        tosave = open(translateUrl(url), 'w', -1, 'utf-8')
        tosave.write(htmldoc)
        tosave.close()
        cnt_saved_pages += 1
        print('url: "' + url + '" saved.')
        # DFS: follow every href found in the page.
        matchobj = re.compile('href="(.*?)"')
        urllist = re.findall(matchobj, htmldoc)
        for nexturl in urllist:
            getPage(nexturl)
    except urllib.error.URLError:
        print('failed at url: "' + url + '"')
        return

urlhead = input("Please input wpsite url:")
cnt_pages = 0
readed_urls.clear()
getPage(urlhead)
print('Crawling finished.')
print(str(cnt_pages) + ' visits. ' + str(cnt_saved_pages) + ' pages saved.')
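One caveat with the recursive getPage: Python's default recursion limit is about 1000 frames, so a long enough chain of links can raise RecursionError. Below is a hedged sketch of the same DFS with an explicit stack instead of recursion, reusing checkUrl / translateUrl / hashString from above (the function name crawlSite is made up):

def crawlSite(urlhead):
    seen = set()
    stack = [urlhead]
    saved = 0
    while stack:
        url = stack.pop()
        # Same filters as getPage: no query/fragment, on-site only.
        if not checkUrl(url) or not url.startswith(urlhead):
            continue
        if hashString(url) in seen:
            continue
        seen.add(hashString(url))
        try:
            html = urllib.request.urlopen(url, None, 10000).read().decode('utf-8', 'ignore')
        except urllib.error.URLError:
            continue
        with open(translateUrl(url), 'w', encoding='utf-8') as f:
            f.write(html)
        saved += 1
        # Push outgoing links instead of recursing into them.
        stack.extend(re.findall(r'href="(.*?)"', html))
    return saved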