热门标签 | HotTags
当前位置:  开发笔记 > 编程语言 > 正文

爬取爱笔智能招聘职位

爬取爱笔智能招聘职位:http://aibee.com/cn/joinus.aspx 1 import requests 2 from urllib.parse import urlencode 3 from p…

爬取爱笔智能招聘职位

http://aibee.com/cn/joinus.aspx 

1 import requests
2 from urllib.parse import urlencode
3 from pyquery import PyQuery as pq
4 from pymongo import MongoClient
5 import json
6
7
# Job-info endpoint. The trailing '&' lets callers append an already
# url-encoded query string directly.
base_url = 'http://aibee.com/cn/joinus.aspx?action=jobinfo&'

# Headers sent with every request; X-Requested-With marks it as an AJAX call.
headers = {
    'Host': 'aibee.com',
    'Referer': 'http://aibee.com/cn/joinus.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# Highest job id the driver loop will request (ids 1..max_id inclusive).
max_id = 50

# MongoDB handles: default client, database 'aibee', collection 'aibee'.
client = MongoClient()
db = client['aibee']
collection = db['aibee']
21
22
23
def get_page(id):
    """Fetch the job-info JSON for a single job id via GET.

    The request URL is the module-level ``base_url`` followed by the
    url-encoded ``id`` parameter; the module-level ``headers`` are sent along.

    Args:
        id: Job id to place in the ``id`` query parameter. The name shadows
            the builtin but is kept so existing callers keep working.

    Returns:
        The decoded JSON body when the server answers 200, otherwise ``None``
        (non-200 status, any request failure, or a body that is not JSON).
    """
    query = urlencode({'id': id})
    url = base_url + query
    try:
        # NOTE(review): the encoded query is also sent as the request body,
        # as before; a body on GET is unusual -- confirm the server needs it.
        # The timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(url, data=query, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException covers ConnectionError as well as timeouts etc.
        print('Error', e.args)
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError as e:
        # Body was not valid JSON (e.g. an HTML error page served with 200).
        print('Error', e.args)
        return None
40
41
def parse_page(json_1):
    """Yield one flat job record for each entry under ``json_1['shuzu']``.

    Args:
        json_1: Decoded response as returned by ``get_page`` (a dict), or a
            falsy value when the fetch failed.

    Yields:
        dict with keys ``id``, ``title``, ``zhize``, ``yaoqiu``, ``dtt`` and
        ``emailaddr``. ``zhize`` and ``yaoqiu`` are run through PyQuery's
        ``.text()``; the other fields are copied as-is.
    """
    if not json_1:
        return

    # A response without 'shuzu' (or with a null value) simply has no items;
    # previously this raised TypeError when iterating None.
    for item in json_1.get('shuzu') or ():
        # NOTE(review): `id` is not a local here. It resolves to the
        # module-level loop variable of the __main__ driver (so everything
        # fetched for id 1 is skipped), or to the builtin `id` function when
        # this module is imported, in which case nothing is skipped.
        if id == 1:
            continue

        yield {
            'id': item.get('id'),
            'title': item.get('title'),
            'zhize': pq(item.get('zhize')).text(),
            'yaoqiu': pq(item.get('yaoqiu')).text(),
            'dtt': item.get('dtt'),
            'emailaddr': item.get('emailaddr'),
        }
58
59
def write_to_file(content):
    """Append ``content`` as one JSON line to ``aibee.json``.

    Args:
        content: JSON-serializable object (here, one job record dict).

    The file is opened in append mode as UTF-8, and non-ASCII characters are
    written verbatim rather than as ``\\uXXXX`` escapes.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open('aibee.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
64
def save_to_mongo(result):
    """Insert one job record into the module-level Mongo ``collection``.

    Args:
        result: The record dict to store. The driver adds an ``_id`` key to
            it in place on insert.

    Prints 'Saved to Mongo' once the server acknowledges the write.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in 4;
    # insert_one() is the supported single-document call.
    if collection.insert_one(result).acknowledged:
        print('Saved to Mongo')
68
69
if __name__ == '__main__':
    # Walk job ids 1..max_id; echo, file and store every parsed record.
    # The loop variable must stay named `id`: parse_page reads this
    # module-level name.
    for id in range(1, max_id + 1):
        for result in parse_page(get_page(id)):
            print(result)
            write_to_file(result)
            save_to_mongo(result)

 或者(改用 POST 请求提交表单数据):

1 import requests
2 from urllib.parse import urlencode
3 from pyquery import PyQuery as pq
4 from pymongo import MongoClient
5 import json
6
7
# Job-info endpoint; the job id is sent separately as POST form data.
url = 'http://aibee.com/cn/joinus.aspx?action=jobinfo'

# Headers sent with every request; X-Requested-With marks it as an AJAX call.
headers = {
    'Host': 'aibee.com',
    'Referer': 'http://aibee.com/cn/joinus.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

# Highest job id the driver loop will request (ids 1..max_id inclusive).
max_id = 50

# MongoDB handles: default client, database 'aibee', collection 'aibee'.
client = MongoClient()
db = client['aibee']
collection = db['aibee']
21
22
23
def get_page(id):
    """Fetch the job-info JSON for a single job id via POST.

    Posts ``{'id': id}`` as form data to the module-level ``url`` with the
    module-level ``headers``.

    Args:
        id: Job id to send in the ``id`` form field. The name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        The decoded JSON body when the server answers 200, otherwise ``None``
        (non-200 status, any request failure, or a body that is not JSON).
    """
    form_data = {'id': id}
    try:
        # The timeout keeps a stalled server from hanging the crawl forever.
        r = requests.post(url, data=form_data, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException covers ConnectionError as well as timeouts etc.
        print('Error', e.args)
        return None

    if r.status_code != 200:
        return None

    try:
        return r.json()
    except ValueError as e:
        # Body was not valid JSON (e.g. an HTML error page served with 200).
        print('Error', e.args)
        return None
35
36
def parse_page(json_1):
    """Yield one flat job record for each entry under ``json_1['shuzu']``.

    Args:
        json_1: Decoded response as returned by ``get_page`` (a dict), or a
            falsy value when the fetch failed.

    Yields:
        dict with keys ``id``, ``title``, ``zhize``, ``yaoqiu``, ``dtt`` and
        ``emailaddr``. ``zhize`` and ``yaoqiu`` are run through PyQuery's
        ``.text()``; the other fields are copied as-is.
    """
    if not json_1:
        return

    # A response without 'shuzu' (or with a null value) simply has no items;
    # previously this raised TypeError when iterating None.
    for item in json_1.get('shuzu') or ():
        # NOTE(review): `id` is not a local here. It resolves to the
        # module-level loop variable of the __main__ driver (so everything
        # fetched for id 1 is skipped), or to the builtin `id` function when
        # this module is imported, in which case nothing is skipped.
        if id == 1:
            continue

        yield {
            'id': item.get('id'),
            'title': item.get('title'),
            'zhize': pq(item.get('zhize')).text(),
            'yaoqiu': pq(item.get('yaoqiu')).text(),
            'dtt': item.get('dtt'),
            'emailaddr': item.get('emailaddr'),
        }
53
54
def write_to_file(content):
    """Append ``content`` as one JSON line to ``aibee.json``.

    Args:
        content: JSON-serializable object (here, one job record dict).

    The file is opened in append mode as UTF-8, and non-ASCII characters are
    written verbatim rather than as ``\\uXXXX`` escapes.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open('aibee.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
59
def save_to_mongo(result):
    """Insert one job record into the module-level Mongo ``collection``.

    Args:
        result: The record dict to store. The driver adds an ``_id`` key to
            it in place on insert.

    Prints 'Saved to Mongo' once the server acknowledges the write.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in 4;
    # insert_one() is the supported single-document call.
    if collection.insert_one(result).acknowledged:
        print('Saved to Mongo')
63
64
if __name__ == '__main__':
    # Walk job ids 1..max_id; echo, file and store every parsed record.
    # The loop variable must stay named `id`: parse_page reads this
    # module-level name.
    for id in range(1, max_id + 1):
        for result in parse_page(get_page(id)):
            print(result)
            write_to_file(result)
            save_to_mongo(result)

 


转载于:https://www.cnblogs.com/wanglinjie/p/9226880.html


推荐阅读
author-avatar
dmcm0001
这个家伙很懒,什么也没留下!
PHP1.CN | 中国最专业的PHP中文社区 | DevBox开发工具箱 | json解析格式化 |PHP资讯 | PHP教程 | 数据库技术 | 服务器技术 | 前端开发技术 | PHP框架 | 开发工具 | 在线工具
Copyright © 1998 - 2020 PHP1.CN. All Rights Reserved | 京公网安备 11010802041100号 | 京ICP备19059560号-4 | PHP1.CN 第一PHP社区 版权所有