这是一段多线程爬虫练习代码,可以采集到大部分数据
但是总是执行不到“All done”这里
因为采集到最后,可能会碰到图片地址为空的时候,就会发生错误,
怎么改进一下,让他容错,然后执行完成?
#!/usr/bin/env python # -*- coding: utf-8 -*- # @File : download.py # @Software: PyCharm import os,json,sys,getopt import time import Queue,threading from bs4 import BeautifulSoup import requests try: opts, args = getopt.getopt(sys.argv[1:], 'd:') except getopt.GetoptError, err: print str(err) exit() BASE_DIR = 'xieemanhua' for k,v in opts: if k == '-d': BASE_DIR = v BASE_URL = 'http://m.wujiecao.cn' SAVE_DIR = 'datas/'+BASE_DIR THREAD_COUNT = 5 #获取HTML内容 def getHtml(url,timeout=20): try: headers = { 'Accept-Language': 'zh-cn', 'Content-Type': 'application/x-www-form-urlencoded', 'User-Agent': 'Mozilla/4.0 (compatible MSIE 6.00 Windows NT 5.1 SV1)', } r = requests.get(url,headers=headers,timeout=timeout) r.encoding='utf-8' html = r.text return html except Exception,ex: return False #采集列表页 def getAllPageLists(): print "=====start get all page lists====== %s" % time.ctime() soup = BeautifulSoup(getHtml(BASE_URL+'/'+BASE_DIR)) select = soup.find('select', class_='paging-select') option = select.find_all('option') lists = [] for o in option: lists.append(BASE_URL+'/'+BASE_DIR+'/'+o['value']) return lists #采集每一页的列表 def getSingePageUrlLists(url): print "=====start getSingePageUrlLists("+os.path.basename(url)+") ====== %s" % time.ctime() soup = BeautifulSoup(getHtml(url)) ul=soup.find("ul", class_="pic") all_a=ul.find_all('a') datas = [] for a in all_a: img = a.find('img') span=a.find('span',class_="bt") title = span.contents[0] data = {'url':BASE_URL+a['href'],'title':title,'pic':BASE_URL+img['lazysrc']} datas.append(data) return datas #采集详细页 def getDetailPage(url,title): print "=====start getDetailPage("+os.path.basename(url)+") ====== %s" % time.ctime() soup = BeautifulSoup(getHtml(url)) p = soup.find('p',id="imgString") img = p.find('img') imgUrl = img['src'] if imgUrl == '': return False response = requests.get(imgUrl, stream=True) if response.status_code != 200: return False image = response.content dir = SAVE_DIR if not os.path.exists(dir): os.mkdir(dir) baseName = 
os.path.basename(img['src']); fileName = dir+'/'+title+'.'+baseName.split('.')[1] try: open(fileName ,"wb").write(image) print "=====write end====== %s" % time.ctime() except IOError: print("IO Error\n") return class getLists(threading.Thread): def __init__(self ,que,detailQue): threading.Thread.__init__(self) self.pageQue = que self.detailQue = detailQue def run(self): while True: url = self.pageQue.get() singePageLists = getSingePageUrlLists(url) for sl in singePageLists: self.detailQue.put(json.dumps(sl)) self.pageQue.task_done() class getDetailLists(threading.Thread): def __init__(self ,detailQue): threading.Thread.__init__(self) self.detailQue = detailQue def run(self): while True: data = self.detailQue.get() decodeData = json.loads(data) getDetailPage(decodeData['url'],decodeData['title']) self.detailQue.task_done() pageQue = Queue.Queue() detailQue = Queue.Queue() if __name__ == '__main__': print "====start request====%s" % time.ctime() allLists = getAllPageLists() for u in allLists: pageQue.put(u) for t in range(THREAD_COUNT): t = getLists(pageQue,detailQue) t.setDaemon(True) t.start() for t in range(THREAD_COUNT): t = getDetailLists(detailQue) t.setDaemon(True) t.start() pageQue.join() detailQue.join() print 'all DONE at:', time.ctime()