在网上看到的教程是用正则表达式写的,并不能运行。后来我改用 XPath 重写了解析逻辑,并使用了多线程(4 个线程的线程池),也算是原创了吧。
#!/usr/bin/python
# -*- encoding:utf-8 -*-


import io
import sys
import threading
from multiprocessing.dummy import Pool as ThreadPool

import requests
from lxml import etree
# Encoding: on Python 2 only, make UTF-8 the implicit str <-> unicode codec.
# Python 3 has no reload() builtin and no sys.setdefaultencoding(), so running
# these two calls unconditionally would raise there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- re-exposes sys.setdefaultencoding on Python 2
    sys.setdefaultencoding('utf-8')

# Output helper.
# spider() runs in several pool threads that all share one output handle, so
# every record is written while holding this lock.
_WRITE_LOCK = threading.Lock()


def towrite(contentdict, out=None):
    """Write one scraped joke to the output file as a four-line record.

    The record consists of the author, content, vote count and comment count
    lines, followed by one blank line.

    Args:
        contentdict: mapping with the text values 'author', 'content',
            'vote' and 'span' (the comment count).
        out: optional file-like object providing ``write``. When omitted,
            the module-level handle ``f`` opened by the script entry point
            is used.
    """
    if out is None:
        out = f  # module-level handle, looked up at call time

    # Assemble the whole record first and emit it with a single write():
    # writelines() on a plain string writes it one character at a time, and
    # four separate calls let records from different threads interleave.
    record = u''.join([
        u'作者:', contentdict['author'], u'\n',
        u'内容:', contentdict['content'], u'\n',
        u'好笑:', contentdict['vote'], u'\n',
        u'评论:', contentdict['span'], u'\n\n',
    ])
    with _WRITE_LOCK:
        out.write(record)

def _strip_chars(text, chars):
    """Return *text* with every character listed in *chars* removed."""
    for ch in chars:
        text = text.replace(ch, u'')
    return text


def _extract_item(node):
    """Build the author/content/vote/span record for one joke ``<div>``."""
    # Author: flatten the author box to plain text. A missing box yields an
    # empty author instead of an IndexError.
    author = u''
    author_boxes = node.xpath('div[@class="author clearfix"]')
    if author_boxes:
        author = _strip_chars(author_boxes[0].xpath('string(.)'), u'\n ')

    # Content: concatenate the direct text nodes, dropping layout whitespace.
    pieces = node.xpath('div[@class="content"]/text()')
    content = u''.join(_strip_chars(piece, u'\n \t') for piece in pieces)

    # Votes ("funny" count); fall back to '0' when the node is absent, the
    # same way the comment count is handled.
    votes = node.xpath(
        'div[@class="stats"]/span[@class="stats-vote"]/i/text()')

    # Comments: the <i> node is not rendered when there are no comments.
    comments = node.xpath(
        'div[@class="stats"]/span[@class="stats-comments"]/a/i/text()')

    # A fresh dict per joke, so records never share state.
    return {
        'author': author,
        'content': content,
        'vote': votes[0] if votes else u'0',
        'span': comments[0] if comments else u'0',
    }


def spider(url, timeout=10):
    """Download one listing page and pass every joke on it to towrite().

    Args:
        url: address of the listing page to scrape.
        timeout: seconds to wait for the server before giving up on the
            page. Without a timeout a stalled connection would block its
            worker thread indefinitely.

    A page that cannot be downloaded is reported on stderr and skipped, so
    one bad page does not abort the remaining pages.
    """
    # Fetch the page source
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        sys.stderr.write('skipping %s: %s\n' % (url, exc))
        return

    if not response.text.strip():
        # An empty body has nothing to parse.
        return

    # Extract the jokes
    selector = etree.HTML(response.text)
    content_field = selector.xpath(
        '//*[@id="content-left"]/div[@class="article block untagged mb15"]')
    for node in content_field:
        towrite(_extract_item(node))


if __name__ == '__main__':
    # Pages 1..35 of the "hot" listing.
    url = ['http://www.qiushibaike.com/hot/page/' + str(i)
           for i in range(1, 36)]

    # `f` has to stay a module-level name because towrite() reads it as a
    # global. The explicit UTF-8 encoding keeps the Chinese text independent
    # of the platform's default codec, and `with` closes the file even when
    # scraping fails part-way through.
    with io.open('content.txt', 'a', encoding='utf-8') as f:
        pool = ThreadPool(4)
        try:
            pool.map(spider, url)
        finally:
            # Always shut the worker threads down before the file is closed.
            pool.close()
            pool.join()

版权声明:本文为miranda-tang原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/miranda-tang/p/5508368.html