Preface
Fearing that my blog might blow up again, and wanting to pick up some Python along the way, I decided to try writing a blog crawler. From now on I can crawl the blog on a schedule as a backup.
Code
There is honestly not much to say about it… Python makes this easy; you just put the pieces together.
The basic idea is to download each HTML page to disk, then DFS along the links it contains, checking for every link whether it is on-site and decoding the content properly.
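Roughly, the core of that idea fits in a sketch like the following (purely illustrative: the function and parameter names here are mine, and the full script below additionally deduplicates visited URLs, maps each URL to a local file name, and prints progress):

import re
import urllib.error
import urllib.request

def crawl(url, site_prefix, seen):
    # Depth-first crawl: fetch a page, then recurse into every in-site link it contains.
    if url in seen or not url.startswith(site_prefix):
        return
    seen.add(url)
    try:
        html = urllib.request.urlopen(url, timeout=10).read().decode('utf-8', 'ignore')
    except urllib.error.URLError:
        return
    for next_url in re.findall(r'href="(.*?)"', html):
        crawl(next_url, site_prefix, seen)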
The full script:
import urllib.request
import urllib.error
import re

# File suffixes that are kept as-is when building the local file name;
# anything else gets a '.html' appended.
suffix_allowed = ('.jpg', '.jpeg', '.png', '.ppt', '.doc', '.docx', '.pdf', '.gif', '.wav', '.ico', '.mp3', '.mp4', '.txt', '.pptx', '.bmp', '.tiff', '.ini', '.inf', '.log', '.html')

def checkUrl(url):
    # Skip URLs that contain a query string or a fragment.
    return url.count('?') == 0 and url.count('#') == 0

def translateUrl(url):
    # Turn a URL into a flat local file name.
    global suffix_allowed
    if url[0:7] == 'http://':
        url = url[7:]
    url = url.replace('/', '.')
    flag = True
    for suf in suffix_allowed:
        if url.endswith(suf):
            flag = False
            break
    if url.endswith('feed'):
        url += '.xml'
    elif flag:
        url += '.html'
    return url

def hashString(s):
    # Simple polynomial rolling hash, used to deduplicate visited URLs.
    hashval = 0
    for ch in s:
        hashval = (hashval * 97 + ord(ch)) % 998442353
    return hashval

urlhead = str()
readed_urls = set()
cnt_pages = 0
cnt_saved_pages = 0

def getPage(url):
    global cnt_pages
    global cnt_saved_pages
    global readed_urls
    global urlhead
    cnt_pages += 1
    if not checkUrl(url):
        return
    if hashString(url) in readed_urls:
        return
    readed_urls.add(hashString(url))
    print('at url: "' + url + '"')
    if not url.startswith(urlhead):
        print('out of range, crawling cancelled')
        return
    try:
        # Timeout is given in seconds.
        connect = urllib.request.urlopen(url, None, 10000)
        htmldoc = connect.read().decode('utf-8', 'ignore')
        tosave = open(translateUrl(url), 'w', -1, 'utf-8')
        tosave.write(htmldoc)
        tosave.close()
        cnt_saved_pages += 1
        print('url: "' + url + '" saved.')
        # DFS: follow every href found in the saved page.
        matchobj = re.compile('href="(.*?)"')
        urllist = re.findall(matchobj, htmldoc)
        for nexturl in urllist:
            getPage(nexturl)
    except urllib.error.URLError:
        print('failed at url: "' + url + '"')
        return
    return

urlhead = input("Please input wpsite url:")
cnt_pages = 0
readed_urls.clear()
getPage(urlhead)
print('Crawling finished.')
print(str(cnt_pages) + ' visits. ' + str(cnt_saved_pages) + ' pages saved.')