# -*- coding: utf-8 -*-
"""Crawl Douban's book Top-250 and save title/author/score/review count."""
import re
import sys

import requests

# Python 2 needs the setdefaultencoding hack so utf-8 str/unicode mix freely;
# guarded by a version check so the module still imports under Python 3.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')


class Spider(object):
    """Scraper for http://book.douban.com/top250."""

    def __init__(self):
        # Progress banner (Chinese: "start crawling Douban book top250 ...").
        print('开始爬取豆瓣图书top250的内容。。。。。。')
defgetSourceCode(self, url):
html=requests.get(url)returnhtml.text#从源代码中提取出我们需要的内容块:{书名、作者出版社等、评分、评价人数}。
defgetEveryBookContent(self, sourceCode):
everyBookContent= re.findall('
', sourceCode, re.S)#everyBookContent = re.findall('
(.*?)
(.*?)
(.*?)
', sourceCode, re.S)
returneveryBookContent#从内容块中提取出数据
defgetBookInfo(self, eachBookContent):
bookInfo={}#bookInfo['title'] = re.subn('( |\n|
|?span.*?>)', "", re.search('(.*?)', eachBookContent, re.S).group(1))[0]
bookInfo['title'] = re.sub('( |\n|
|?span.*?>)', "", re.search('(.*?)', eachBookContent, re.S).group(1))
bookInfo['author'] = re.search('
(.*?)
', eachBookContent, re.S).group(1)
bookInfo['discussNum'] = re.sub('( |\n|
)', "", re.search('\((.*?)\)', eachBookContent, re.S).group(1))
bookInfo['score'] = re.search('(.*?)', eachBookContent, re.S).group(1)returnbookInfo#将结果保存到文件
defsaveBookInfo(self, bookList):
f= open("bookList.txt", "a")for each inbookList:
f.writelines('书 名:\t {}\n'.format(each['title']))
f.writelines('作 者:\t {}\n'.format(each['author']))
f.writelines('评论数:\t {}\n'.format(each['discussNum']))
f.writelines('评 分:\t {}\n\n'.format(each['score']))
f.close()defstart(self, url):
sourceCode=self.getSourceCode(url)
everyBookContent=self.getEveryBookContent(sourceCode)
bookList=[]for each ineveryBookContent:
bookList.append(self.getBookInfo(each))
self.saveBookInfo(bookList)if __name__ == '__main__':
if __name__ == '__main__':
    douban = Spider()
    # 25 books per page: start=0, 25, ..., 225 covers all 250 entries.
    # (The original's separate initial url assignment was dead code, and the
    # manual while/i += 25 counter is replaced by the equivalent range().)
    for offset in range(0, 250, 25):
        douban.start('http://book.douban.com/top250?start={}'.format(offset))