基于pyspider爬取1药网(转载)

作者：赢在青春创业团队 | 来源：互联网 | 2023-06-03 12:09

1.商品爬取 #!/usr/bin/env python # -*- encoding: utf-8 -*- # Created on 2019-02-02 08:59:40 # Project: oneDru…

1.商品爬取

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-02-02 08:59:40
# Project: oneDrug
"""pyspider project that crawls 111.com.cn product pages into MongoDB."""

import re

from pymongo import MongoClient
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """Walk the 111.com.cn category tree and store every product found.

    Callback chain: ``on_start`` -> ``categories_page`` ->
    ``cagetory_list_page`` (follows the "next page" link) -> ``detail_page``,
    which upserts one document per product into the ``goods`` collection of
    the ``drug`` database.
    """

    crawl_config = {}

    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug

    def insert_goods(self, data):
        """Insert the product document, replacing any with the same ``goods_id``."""
        collection = self.drug['goods']
        # Collection.update() was removed in PyMongo 4; replace_one with
        # upsert=True performs the same whole-document upsert.
        collection.replace_one({'goods_id': data['goods_id']}, data, upsert=True)

    def insert_comments(self, data):
        """Insert one comment document into the ``comments`` collection."""
        collection = self.drug['comments']
        collection.insert_one(data)

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: schedule the category index page."""
        self.crawl('https://www.111.com.cn/categories/',
                   callback=self.categories_page,
                   validate_cert=False, fetch_type='js')

    @config(age=10 * 24 * 60 * 60)
    def categories_page(self, response):
        """Schedule the listing page of every category link on the index."""
        for each in response.doc('.allsort em > a').items():
            self.crawl(each.attr.href, callback=self.cagetory_list_page,
                       validate_cert=False, fetch_type='js')

    @config(priority=1)
    def cagetory_list_page(self, response):
        """Schedule every product on a listing page, then the next listing page."""
        product_links = response.doc(
            '#itemSearchList a[target="_blank"][class="product_pic pro_img"]')
        for each in product_links.items():
            self.crawl(each.attr.href, callback=self.detail_page,
                       validate_cert=False, fetch_type='js')

        next_url = response.doc(
            '#search_table > div.turnPageBottom > a.page_next').attr.href
        # The selector yields None when no "next page" link is present;
        # only follow the link when it exists.
        if next_url:
            self.crawl(next_url, callback=self.cagetory_list_page,
                       validate_cert=False, fetch_type='js')

    @config(priority=2)
    def detail_page(self, response):
        """Extract the product fields from a detail page and upsert them."""

        def text(selector):
            return response.doc(selector).text()

        breadcrumb = 'body > div.wrap.clearfix > div > span:nth-child({}) > a'
        intro = ('#tabCon > div:nth-child(1) > div.goods_intro > table > tbody'
                 ' > tr:nth-child({}) > td:nth-child({})')
        manual = '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > {}'
        experience = '#productExperience > div > ul > li:nth-child({}) > a > span'

        goods_id = text('#gallery_view > ul > li.item_number')
        cagetory_one = text(breadcrumb.format(3))
        cagetory_two = text(breadcrumb.format(5))
        cagetory_three = text(breadcrumb.format(7))
        merchants = text('div.middle_property > span:nth-child(1)')
        goods_name = text('div.middle_property > h1')
        goods_desc = text('div.middle_property > span.red.giftRed')
        goods_price = text('div.middle_property > div.shangpin_info'
                           ' > dl:nth-child(2) > dd > span.good_price')

        brand = text(intro.format(2, 2))
        spec = text(intro.format(2, 4))
        weight = text(intro.format(3, 2))
        manufacturers = text(intro.format(3, 4))
        approval_number = text(intro.format(4, 2))
        drug_type = text(intro.format(4, 4))

        # The instruction sheet is only read when the table has a header row.
        instructions = {}
        if text(manual.format(1, 'th')):
            for i in range(3, 22):
                instructions_key = text(manual.format(i, 'th')).split(" ")[0]
                if not instructions_key:
                    # Row absent on this page; do not store an empty key.
                    continue
                instructions[instructions_key] = text(manual.format(i, 'td'))

        total_comments = text('#itemComments > span')
        good_comments = text(experience.format(2))
        mid_comments = text(experience.format(3))
        bad_comments = text(experience.format(4))

        # Second run of digits in the URL (the first is the "111" of the host).
        url_id = re.findall(r'\d+', response.url)[1]

        goods_data = {
            'url_id': url_id,
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_desc': goods_desc,
            'goods_price': goods_price,
            'merchants': merchants,
            'cagetory': {
                '1': cagetory_one,
                '2': cagetory_two,
                '3': cagetory_three,
            },
            'drug_detail': {
                'brand': brand,
                'spec': spec,
                'weight': weight,
                'manufacturers': manufacturers,
                'approval_number': approval_number,
                'drug_type': drug_type,
            },
            'instructions': instructions,
            'comments': {
                'total_comments': total_comments,
                'good_comments': good_comments,
                'mid_comments': mid_comments,
                'bad_comments': bad_comments,
            },
        }
        self.insert_goods(goods_data)

2.评论爬取

"""Fetch the review pages of every crawled product and store the comments."""

import re
import socket

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import PyMongoError
from requests.adapters import HTTPAdapter

REVIEW_URL = 'https://www.111.com.cn/interfaces/review/list/html.action'


class Drug:
    """Post-process the ``goods`` collection and crawl product comments."""

    def __init__(self):
        # Attribute name "clint" (sic) is kept so existing users of it still work.
        self.clint = MongoClient('mongodb://localhost:27017')
        self.drug = self.clint.drug
        self.collection = self.drug['goods']
        self.comm_collection = self.drug['comments']

    def dbmodify(self):
        """Strip the label prefix from ``goods_id`` and ``goods_price``.

        Keeps the text after the full-width colon in ``goods_id`` and after the
        full-width yen sign in ``goods_price``. Documents where either
        separator is missing are left untouched.
        """
        for data in self.collection.find({}, {"goods_id": 1, "goods_price": 1}):
            try:
                _id = data['_id']
                goods_id = data['goods_id'].split("：")[1]
                price = data['goods_price'].split("￥")[1]
            except IndexError:
                continue
            # Collection.update() was removed in PyMongo 4; update_one is the
            # equivalent for a single-document $set.
            self.collection.update_one(
                {'_id': _id},
                {'$set': {'goods_id': goods_id, 'goods_price': price}})
            print(_id, goods_id, price)

    def _fetch_review_page(self, goods_id, page_index, retries):
        """GET one review-list page and return it parsed with BeautifulSoup.

        Raises ``requests.exceptions.RequestException`` on network failure.
        """
        params = {'goodsId': goods_id, 'pageIndex': page_index,
                  'score': '1&_19020301'}
        # verify=False would otherwise emit an InsecureRequestWarning per call.
        requests.packages.urllib3.disable_warnings()
        # Assigning requests.adapters.DEFAULT_RETRIES does not change the retry
        # count of new adapters; mounting an adapter with max_retries does.
        # The with-block closes the session's connections after the request.
        with requests.Session() as session:
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.get(REVIEW_URL, params=params, timeout=5,
                                   verify=False)
        return BeautifulSoup(response.text, 'html.parser')

    def getBaseArgument(self, goods_id):
        """Mark the product as visited and return its number of review pages.

        Returns the page count as a string, ``"No Comments!"`` when the page
        contains a ``view_no_result`` block, or ``None`` when the request
        fails or no page-count text is found.
        """
        try:
            self.collection.update_one({'url_id': goods_id},
                                       {'$set': {'commspider': True}})
            soup = self._fetch_review_page(goods_id, 1, retries=5)
        except requests.exceptions.RequestException as e:
            print(e)
            return None

        if soup.find_all("div", class_="view_no_result"):
            return "No Comments!"

        total_page_text = soup.find(text=re.compile(r'共\d+页'))
        if total_page_text is None:
            return None
        return re.findall(r'\d+', total_page_text)[0]

    def getCommlist(self, goods_id, total_page):
        """Store every comment on pages 1..``total_page`` of a product.

        Returns ``"No Comments! Try next!"`` when ``total_page`` is not a
        number (for example ``"No Comments!"`` or ``None``), otherwise ``None``.
        """
        try:
            page_count = int(total_page)
        except (TypeError, ValueError):
            return "No Comments! Try next!"

        # Pages are 1-based and the last page is included.
        for page_index in range(1, page_count + 1):
            try:
                soup = self._fetch_review_page(goods_id, page_index, retries=15)
            except requests.exceptions.RequestException as e:
                print(e)
                continue

            for tr in soup.find_all("tr"):
                comments = {}
                try:
                    comments['goodsId'] = goods_id
                    comments['content'] = tr.find('p').text.strip()
                    comments['date'] = tr.find(
                        'p', attrs={'class': 'eval_date'}).text.strip()
                    self.comm_collection.insert_one(comments)
                except (AttributeError, PyMongoError):
                    # Best effort per row: a row without the expected <p>
                    # tags, or a failed insert, is reported and skipped.
                    print("{}Have some problem!\n".format(goods_id))
                    print(comments)
        return None

    def getComments(self):
        """Crawl comments for every product not yet marked as visited."""
        # $ne also matches documents that have no "commspider" field at all.
        pending = {'commspider': {'$ne': True}}
        goods_list = [data['url_id']
                      for data in self.collection.find(pending, {"url_id": 1})]
        length = len(goods_list)
        print("总共 {} 条商品".format(length))
        for i, good in enumerate(goods_list, start=1):
            total_page = self.getBaseArgument(good)
            comments = self.getCommlist(good, total_page)
            print("总共 {} 条商品\n目前第 {} 条\n商品编号 {} \n".format(length, i, good))
            print(comments)


test = Drug().getComments()

转:https://www.cnblogs.com/tjp40922/p/10611624.html

推荐阅读

less
微软头条实习生分享深度学习自学指南

本文介绍了一位微软头条实习生自学深度学习的经验分享，包括学习资源推荐、重要基础知识的学习要点等。作者强调了学好Python和数学基础的重要性，并提供了一些建议。 ... [详细]

蜡笔小新 2023-12-14 20:58:32
include
vue使用

关键词： ... [详细]

蜡笔小新 2023-12-14 19:14:56
config
安装mysqlclient失败解决办法

本文介绍了在MAC系统中，使用django使用mysql数据库报错的解决办法。通过源码安装mysqlclient或将mysql_config添加到系统环境变量中，可以解决安装mysqlclient失败的问题。同时，还介绍了查看mysql安装路径和使配置文件生效的方法。 ... [详细]

蜡笔小新 2023-12-14 18:24:10
config
单击后为什么远程通知操作无效？ - Why remote notification action is doing nothing after clicking?

I have configured an action for a remote notification when it arrives to my iOS app. I want two diff ... [详细]

蜡笔小新 2023-12-14 15:57:44
schema
的错误消息：

ZSI.generate.Wsdl2PythonError: unsupported local simpleType restriction ... [详细]

蜡笔小新 2023-12-13 20:28:08
schema
推荐系统遇上深度学习(十七）详解推荐系统中的常用评测指标

原创：石晓文小小挖掘机2018-06-18笔者是一个痴迷于挖掘数据中的价值的学习人，希望在平日的工作学习中，挖掘数据的价值， ... [详细]

蜡笔小新 2023-12-13 19:35:25
config
页面请求方法参数最长_关于 HTTP GET/POST 请求参数长度最大值的一个理解误区

http://my.oschina.net/leejun2005/blog/136820 刚看到群里又有同学在说HTTP协议下的Get请求参数长度是有大小限制的，最大不能超过XX ... [详细]

蜡笔小新 2023-12-13 19:20:03
get
sklearn数据集库中的常用数据集类型介绍

本文介绍了sklearn数据集库中常用的数据集类型，包括玩具数据集和样本生成器。其中详细介绍了波士顿房价数据集，包含了波士顿506处房屋的13种不同特征以及房屋价格，适用于回归任务。 ... [详细]

蜡笔小新 2023-12-13 17:45:15
get
Golang如何使用Cookie跟踪位置

关键词：Golang, Cookie, 跟踪位置, net/http/cookiejar, package main, golang.org/x/net/publicsuffix, io/ioutil, log, net/http, net/http/cookiejar ... [详细]

蜡笔小新 2023-12-13 15:47:22
config
在重复造轮子的情况下用ProxyServlet反向代理来减少工作量

像不少公司内部不同团队都会自己研发自己工具产品，当各个产品逐渐成熟，到达了一定的发展瓶颈，同时每个产品都有着自己的入口，用户 ... [详细]

蜡笔小新 2023-12-13 15:19:01
config
r2dbc配置多数据源

R2dbc配置多数据源问题根据官网配置r2dbc连接mysql多数据源所遇到的问题pom配置可以参考官网,不过我这样配置会报错我并没有这样配置将以下内容添加到pom.xml文件d ... [详细]

蜡笔小新 2023-12-12 16:38:53
get
解决nginx启动报错epoll_wait() reported that client prematurely closed connection的方法

本文介绍了解决nginx启动报错epoll_wait() reported that client prematurely closed connection的方法，包括检查location配置是否正确、pass_proxy是否需要加“/”等。同时，还介绍了修改nginx的error.log日志级别为debug，以便查看详细日志信息。 ... [详细]

蜡笔小新 2023-12-12 13:19:04
command
利用Visual Basic开发SAP接口程序初探的方法与原理

本文介绍了利用Visual Basic开发SAP接口程序的方法与原理，以及SAP R/3系统的特点和二次开发平台ABAP的使用。通过程序接口自动读取SAP R/3的数据表或视图，在外部进行处理和利用水晶报表等工具生成符合中国人习惯的报表样式。具体介绍了RFC调用的原理和模型，并强调本文主要不讨论SAP R/3函数的开发，而是针对使用SAP的公司的非ABAP开发人员提供了初步的接口程序开发指导。 ... [详细]

蜡笔小新 2023-12-13 10:56:31
config
如何清除Eclipse中SVN用户的设置

本文介绍了如何清除Eclipse中SVN用户的设置。首先需要查看使用的SVN接口，然后根据接口类型找到相应的目录并删除相关文件。最后使用SVN更新或提交来应用更改。 ... [详细]

蜡笔小新 2023-12-12 14:42:31
config
Windows7 64位系统安装PLSQL Developer的步骤和注意事项

本文介绍了在Windows7 64位系统上安装PLSQL Developer的步骤和注意事项。首先下载并安装PLSQL Developer，注意不要安装在默认目录下。然后下载Windows 32位的oracle instant client，并解压到指定路径。最后，按照自己的喜好对解压后的文件进行命名和压缩。 ... [详细]

蜡笔小新 2023-12-12 13:32:08

赢在青春创业团队

这个家伙很懒，什么也没留下！

Tags | 热门标签

RankList | 热门文章