Web Crawler Learning (6): Scraping 58同城 (58.com)
1. Collect the sub-URLs of each category from the main listing page
```python
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'


def get_index_url(url):
    # url = start_url
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)


get_index_url(start_url)

channel_list = '''
    http://bj.58.com/shouji/
    http://bj.58.com/shoujihao/
    http://bj.58.com/tongxunyw/
    http://bj.58.com/diannao/
    http://bj.58.com/bijiben/
    http://bj.58.com/pbdn/
    http://bj.58.com/diannaopeijian/
    http://bj.58.com/zhoubianshebei/
    http://bj.58.com/shuma/
    http://bj.58.com/shumaxiangji/
    http://bj.58.com/mpsanmpsi/
    http://bj.58.com/youxiji/
    http://bj.58.com/jiadian/
    http://bj.58.com/dianshiji/
    http://bj.58.com/ershoukongtiao/
    http://bj.58.com/xiyiji/
    http://bj.58.com/bingxiang/
    http://bj.58.com/binggui/
    http://bj.58.com/chuang/
    http://bj.58.com/ershoujiaju/
    http://bj.58.com/yingyou/
    http://bj.58.com/yingeryongpin/
    http://bj.58.com/muyingweiyang/
    http://bj.58.com/muyingtongchuang/
    http://bj.58.com/yunfuyongpin/
    http://bj.58.com/fushi/
    http://bj.58.com/nanzhuang/
    http://bj.58.com/fsxiemao/
    http://bj.58.com/xiangbao/
    http://bj.58.com/meirong/
    http://bj.58.com/yishu/
    http://bj.58.com/shufahuihua/
    http://bj.58.com/zhubaoshipin/
    http://bj.58.com/yuqi/
    http://bj.58.com/tushu/
    http://bj.58.com/tushubook/
    http://bj.58.com/wenti/
    http://bj.58.com/yundongfushi/
    http://bj.58.com/jianshenqixie/
    http://bj.58.com/huju/
    http://bj.58.com/qiulei/
    http://bj.58.com/yueqi/
    http://bj.58.com/bangongshebei/
    http://bj.58.com/diannaohaocai/
    http://bj.58.com/bangongjiaju/
    http://bj.58.com/ershoushebei/
    http://bj.58.com/danche/
    http://bj.58.com/fzixingche/
    http://bj.58.com/diandongche/
    http://bj.58.com/sanlunche/
    http://bj.58.com/peijianzhuangbei/
    http://bj.58.com/tiaozao/
'''
```
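The script above only prints the channel URLs, and `channel_list` is then pasted in by hand. As a minimal sketch (assuming the same page structure and selector still apply), `get_index_url` could instead return the links as a Python list, which is the same shape of data that `channel_list.split()` produces for the main script later on:

```python
from bs4 import BeautifulSoup
import requests

url_host = 'http://bj.58.com'


def get_index_urls(url):
    # Collect the category links into a list instead of printing them
    # (sketch only; uses the same CSS selector as the script above).
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    return [url_host + link.get('href')
            for link in soup.select('ul.ym-submnu > li > b > a')]


# channels = get_index_urls('http://bj.58.com/sale.shtml')
```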
2. Extract item information from each sub-URL
```python
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list4']
item_info = ceshi['item_info4']
# The name on the left is the Python object; the string on the right is the
# collection name used inside the database.


# spider 1
def get_links_from(channel, pages, who_sells=0):
    # Stop when the page has no td.t element.
    # Example list page: https://bj.58.com/shouji/pn2/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # The if/else guards against pages that do not exist, e.g. https://bj.58.com/shouji/pn100/
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
            # Read the item's details and store them in the database
            get_item_info(item_link)
            time.sleep(1)
        # return urls
    else:
        # It's the last page!
        pass


# spider 2: parse each item URL
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # An item may still exist when its URL is collected but be sold by the time
    # get_item_info fetches it, which yields a 404 page. The source of that page contains:
    # <link rel="stylesheet" type="text/css" href="https://c.58cdn.com.cn/ui6/list/404news_v20161103135554.css">
    # so we check whether '404' appears in the stylesheet URL.
    no_longer_exist = '404' in soup.find('link', type="text/css", rel="stylesheet").get('href').split('/')
    if no_longer_exist:
        # The listing is gone (404), so skip it
        pass
    else:
        # title = soup.title.text.split('-')[0]
        # # print(title)
        # # The page source contains: <title>OPPOreno10倍变焦版 - 北京58同城</title>
        # price = soup.select('span.infocard__container__item__main__text--price')[0].text
        # # <span class="infocard__container__item__main__text--price"> 360元</span>
        # date = soup.select('span.detail-title__info__text')[0].text
        # # <div class="detail-title__info__text">2020-01-24 更新</div>
        # area = list(soup.select('.infocard__container__item__main a')[0].stripped_strings) if soup.find_all('span', 'infocard__container__item__main') else None
        # # <div class="infocard__container__item__main"><a href='/chaoyang/shouji/' target="_blank">朝阳</a></div>
        # # Store in the database
        # item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
        # print({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})

        # A CAPTCHA page or the city index page means the real title is unavailable
        if (soup.title.text.split('-')[0] == "请输入验证码 ws:36.161.10.181"
                or soup.title.text.split('-')[0] == '【58同城 58.com】六安分类信息 '):
            title = ""
        else:
            title = soup.title.text.split('-')[0]

        # Fields that are missing from the page are stored as None
        if soup.select('.infocard__container__item__main__text--price'):
            price = soup.select('.infocard__container__item__main__text--price')[0].get_text().strip()
        else:
            price = None
        # price = soup.select('.infocard__container__item__main__text--price')
        # print(price)

        if soup.select('.detail-title__info__text'):
            date = soup.select('.detail-title__info__text')[0].get_text().strip()
        else:
            date = None

        if soup.select('.infocard__container__item__main a'):
            area = soup.select('.infocard__container__item__main a')[0].get_text().strip()
        else:
            area = None

        item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
        print({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})


# get_links_from("http://bj.58.com/shouji/", 2)
```
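The 404 test above depends on the exact markup of 58同城's error page. As a lighter-weight alternative (a sketch, not the method used in this post), the HTTP status code returned by `requests` can be checked before parsing anything; this only works if 58 actually sets a 404/410 status for removed listings, which would need to be verified against a real sold-out item:

```python
import requests


def listing_is_gone(url):
    # Sketch: treat a 404/410 response as "listing no longer exists".
    # Assumes 58.com sets a proper HTTP status code for removed items.
    resp = requests.get(url, timeout=10)
    return resp.status_code in (404, 410)
```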
I have not figured out how to get past 58's CAPTCHA anti-scraping mechanism... According to experienced users on Zhihu, adding sleep() between requests seems to help.
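Building on that hint, a common (but not guaranteed) mitigation is to slow the crawl down with randomized delays and to send a browser-like User-Agent header. The sketch below wraps `requests.get` this way; the header string, the `polite_get` name, and the 1-3 second delay range are arbitrary choices for illustration, not values from the original post:

```python
import random
import time
import requests

HEADERS = {
    # A desktop browser User-Agent string; any realistic value can be used here.
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0 Safari/537.36')
}


def polite_get(url, min_delay=1.0, max_delay=3.0):
    # Wait a random 1-3 seconds before each request to look less like a bot.
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, headers=HEADERS, timeout=10)
```

Whether this is enough depends on how aggressively 58 rate-limits; once a CAPTCHA page is served, usually only waiting longer (or solving it manually) helps.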
3. Write the main entry point and count the scraped records
```python
from multiprocessing import Pool
from channel_extact import channel_list
from pages_parsing import get_links_from
from pages_parsing import get_item_info


def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)


if __name__ == '__main__':
    # Create a multiprocessing pool (one worker per CPU core by default)
    pool = Pool()
    # pool = Pool(processes=6)
    # map(function, iterable): apply the crawl function to every channel URL
    pool.map(get_all_links_from, channel_list.split())
```
```python
import time
from pages_parsing import url_list

while True:
    print(url_list.find().count())
    time.sleep(4)
    # Stop once 1000 URLs have been scraped (>= so an overshoot still terminates)
    if url_list.find().count() >= 1000:
        break
```
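Note that `Cursor.count()` was deprecated in pymongo 3.7 and removed in pymongo 4.x. On a newer driver, the same progress loop can be written with `count_documents` (a small sketch against the same `url_list` collection from section 2):

```python
import time
from pages_parsing import url_list  # the MongoDB collection defined in section 2

while True:
    scraped = url_list.count_documents({})  # available since pymongo 3.7
    print(scraped)
    if scraped >= 1000:
        break
    time.sleep(4)
```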
Copyright notice: this is an original article by Whgy, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.