我在试着爬取CNKI的这个表格的部分字段,然后这个表格的作者列中,有些是含有多个名字,所以就是多条标签。然后我抓取下来的名称和作者就对不上,多个作者的标签会换行,那么我应该如何做才能写入csv时完成这种一条文献对应多个作者名字。
Python2
# coding:utf-8 import re import csv import codecs from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.common.keys import Keys import time import warnings import sys reload(sys) sys.setdefaultencoding('utf-8') warnings.filterwarnings("ignore") driver=webdriver.Firefox() driver.get("http://epub.cnki.net/kns/brief/result.aspx?dbprefix=scdb&action=scdbsearch&db_opt=SCDB") driver.find_element_by_link_text(u'专业检索').click() time.sleep(3) driver.find_element_by_css_selector("textarea[id=\"expertvalue\"]").clear() driver.find_element_by_css_selector("textarea[id=\"expertvalue\"]").send_keys(u"TI='生态'") driver.find_element_by_id("btnSearch").click() driver.switch_to_frame("iframeResult") page=driver.page_source note=BeautifulSoup(page) f = open('tabletable.csv', 'wb') f.write(codecs.BOM_UTF8) urlitems=note.findAll(name="a", attrs={"class":"fz14"}) nameitems=note.findAll(name="a", attrs={"target":"knet"}) textitems=note.findAll(name="a", attrs={"class":"KnowledgeNetLink","target":"_blank"}) header=['url','name','writer','choose'] writer = csv.writer(f,delimiter=',') writer.writerow(header) csvrow1=[] csvrow2=[] csvrow3=[] csvrow4=[] for urlinks in urlitems: csvrow1.append('http://www.cnki.net' + urlinks.get('href')) for names in urlitems: csvrow2.append(names.get_text()) for writers in nameitems: csvrow3.append(writers.get_text()) for chooses in textitems: csvrow4.append(chooses.get_text()) for i in zip(csvrow1, csvrow2, csvrow3, csvrow4): f.write(i[0] + ',' + i[1] + ',' + i[2] + ',' + i[3] + '\n') f.close()
感觉你是在瞎整,你要把取到的数据弄成这样:
[ {'url': 'x', 'name': 'a'}, {'url': 'x', 'name': '李海舰; 田跃新; 李文杰'} ]