
[Tech] A Simple Python Blog Crawler

Preface

Since I'm worried the blog might blow up again, and I wanted to pick up some Python anyway, I decided to try writing a blog crawler.
The plan is to crawl the blog on a schedule from now on and keep a backup.

Code

There really isn't much to say about it... Python is simple enough that you can just throw something together.
The basic idea is to download each HTML page to a local file, then DFS along the links it contains, checking whether each link stays within the site and decoding the text along the way.
The code:

import urllib.request
import urllib.error
import re

suffix_allowed = ('.jpg', '.jpeg', '.png', '.ppt', '.doc', '.docx', '.pdf', '.gif', '.wav', '.ico', '.mp3', '.mp4', '.txt', '.pptx', '.bmp', '.tiff', '.ini', '.inf', '.log', '.html')

def checkUrl(url):
    # skip URLs with query strings or fragments (e.g. search or reply links)
    return '?' not in url and '#' not in url
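# For example (hypothetical URLs): checkUrl('http://example.com/about/') returns True,
# while checkUrl('http://example.com/?p=1') returns False, so query/anchor links are skipped.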

def translateUrl(url):
    # map a URL to a local file name: strip the scheme, turn '/' into '.',
    # and append an extension when the URL does not already carry one
    if url.startswith('http://'):
        url = url[7:]
    url = url.replace('/', '.')
    has_suffix = any(url.endswith(suf) for suf in suffix_allowed)
    if url.endswith('feed'):
        url += '.xml'
    elif not has_suffix:
        url += '.html'
    return url
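# For example (hypothetical URL): translateUrl('http://example.com/archives/123')
# returns 'example.com.archives.123.html', which becomes the local file name.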

def hashString(s):
    # simple rolling hash, used to remember visited URLs compactly
    hashval = 0
    for ch in s:
        hashval = (hashval * 97 + ord(ch)) % 998442353
    return hashval

urlhead = str()      # site root; only URLs under this prefix are crawled
readed_urls = set()  # hashes of URLs that have already been visited
cnt_pages = 0
cnt_saved_pages = 0

def getPage(url):
    global cnt_pages
    global cnt_saved_pages
    cnt_pages += 1
    if not checkUrl(url):
        return
    if hashString(url) in readed_urls:
        return
    readed_urls.add(hashString(url))
    print('at url: "' + url + '"')
    if not url.startswith(urlhead):
        print('out of range, crawling cancelled')
        return
    try:
        connect = urllib.request.urlopen(url, timeout=10000)
        htmldoc = connect.read().decode('utf-8', 'ignore')
        # save the page under a flattened local file name
        tosave = open(translateUrl(url), 'w', -1, 'utf-8')
        tosave.write(htmldoc)
        tosave.close()
        cnt_saved_pages += 1
        print('url: "' + url + '" saved.')
        # follow every href on the page depth-first
        for nexturl in re.findall('href="(.*?)"', htmldoc):
            getPage(nexturl)
    except urllib.error.URLError:
        print('failed at url: "' + url + '"')


urlhead = input("Please input wpsite url:")
cnt_pages = 0
readed_urls.clear()
getPage(urlhead)
print('Crawling finished.')
print(str(cnt_pages)+' visits. '+str(cnt_saved_pages)+' pages saved.')
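One limitation worth noting: the href values are taken literally, so relative links such as /about/ never match urlhead and get dropped by the out-of-range check. A minimal sketch of how they could be resolved first, using urllib.parse from the standard library (the URLs below are hypothetical):

import urllib.parse

base = 'http://example.com/archives/123.html'  # page the link was found on
print(urllib.parse.urljoin(base, '/about/'))   # http://example.com/about/
print(urllib.parse.urljoin(base, 'feed'))      # http://example.com/archives/feed

Calling urljoin on each nexturl before recursing would let the crawler follow relative links as well. And since getPage recurses once per link, a large site could also run into Python's default recursion limit of roughly 1000 frames.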
