爬虫多线程模板,xpath,etree
class QuiShi:
    """Multi-threaded scraper template built on three chained queues.

    Pipeline: page URLs -> downloaded HTML -> extracted items -> output.
    Every stage runs in its own worker thread(s) and hands its result to the
    next stage through a ``Queue``; ``run`` wires the stages together and
    waits until every queue has been drained.
    """

    # Seconds to wait for the server before a request is abandoned, so a
    # stalled connection cannot block a download worker forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Create the URL template, the request headers and the three queues."""
        self.temp_url = "http://www.lovehhy.net/Joke/Detail/QSBK/{0}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}
        # Queue of page URLs waiting to be downloaded.
        self.url_query = Queue()
        # Queue of downloaded HTML documents waiting to be parsed.
        self.html_query = Queue()
        # Queue of extracted item lists waiting to be saved.
        self.content_query = Queue()

    def get_url_list(self):
        """Put the URLs of pages 1 through 4 on the URL queue."""
        for page in range(1, 5):
            self.url_query.put(self.temp_url.format(page))

    def parse_url(self):
        """Worker loop: download each queued URL and enqueue its decoded HTML.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            url = self.url_query.get()
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                self.html_query.put(response.content.decode("gbk"))
            except (requests.RequestException, UnicodeDecodeError) as exc:
                # A failed page is reported and skipped so the worker survives.
                print("failed to fetch {0}: {1}".format(url, exc))
            finally:
                # Always acknowledge the item, otherwise url_query.join()
                # would block forever after a single failure.
                self.url_query.task_done()

    def _extract_items(self, html_str):
        """Return the list of item dicts found in one HTML document.

        Each dict has the keys ``title`` and ``title_href`` (raw XPath result
        lists) and ``content`` (list of text fragments).
        """
        cleaned = html_str.replace("<br />", "").strip()
        # etree.HTML turns the markup into an element tree.
        html = etree.HTML(cleaned)
        if html is None:
            # Nothing parseable in the document (e.g. an empty page).
            return []
        content_list = []
        for h3 in html.xpath('//div[@id="footzoon"]/h3'):
            # NOTE(review): this XPath collects the text of *every* following
            # sibling div, not only the one right after the heading - confirm
            # against the page layout whether "div[1]" was intended.
            fragments = h3.xpath('./following-sibling::div/text()')
            content_list.append({
                "title": h3.xpath("./a/text()"),
                "title_href": h3.xpath("./a/@href"),
                # Drop the ideographic (full-width) spaces from the text.
                "content": [text.replace("\u3000", "") for text in fragments],
            })
        return content_list

    def get_content_list(self):
        """Worker loop: parse each queued HTML document and enqueue its items.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            html_str = self.html_query.get()
            try:
                self.content_query.put(self._extract_items(html_str))
            except (etree.LxmlError, ValueError) as exc:
                print("failed to parse page: {0}".format(exc))
            finally:
                self.html_query.task_done()

    def save_content_list(self):
        """Worker loop: print each queued item list.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            cons = self.content_query.get()
            try:
                print(cons)
            finally:
                self.content_query.task_done()

    def run(self):
        """Start every pipeline stage and block until all pages are processed."""
        producer = threading.Thread(target=self.get_url_list)
        workers = [threading.Thread(target=self.parse_url) for _ in range(3)]
        workers.append(threading.Thread(target=self.get_content_list))
        workers.append(threading.Thread(target=self.save_content_list))
        for worker in workers:
            # The worker loops never return; daemon threads let the
            # interpreter exit once the queues below have been drained.
            worker.daemon = True
            worker.start()
        producer.start()
        # Make sure every URL is queued before waiting on the queues;
        # joining an still-empty queue would return immediately.
        producer.join()
        # Each stage acknowledges an item only after handing its result to
        # the next queue, so joining in pipeline order waits for everything.
        for stage in (self.url_query, self.html_query, self.content_query):
            stage.join()
# Script entry point: run the scraper once and print the elapsed seconds.
if __name__ == '__main__':
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments while the scraper is running.
    started = time.perf_counter()
    quishi = QuiShi()
    quishi.run()
    print(time.perf_counter() - started)
版权声明:本文为dreamer-zhang原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。