import time
from multiprocessing import Pool
from urllib.parse import urljoin

import pymongo
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from requests.exceptions import RequestException

from config import *  # expected to provide MONGO_URL, MONGO_DB and MONGO_TABLE
  9 
 10 client = pymongo.MongoClient(MONGO_URL)    # 申明连接对象
 11 db = client[MONGO_DB]    # 申明数据库
 12 
 13 def get_one_page_html(url):    # 获取网站每一页的html
 14     headers = {
 15         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
 16                       "Chrome/85.0.4183.121 Safari/537.36"
 17     }
 18     try:
 19         response = requests.get(url, headers=headers)
 20         if response.status_code == 200:
 21             return response.text
 22         else:
 23             return None
 24     except RequestException:
 25         return None
 26 
 27 
 28 def get_room_url(html):    # 获取当前页面上所有room_info的url
 29     soup = BeautifulSoup(html, \'lxml\')
 30     addresses = soup.find_all(\'p\', {\'class\': \'content__list--item--des\'})
 31     doc = pq(html)
 32     room_urls = doc(\'.content__list--item--main .content__list--item--title a\').items()
 33     return room_urls, addresses
 34 
 35 
 36 def parser_room_page(room_html, address_queue1, address_queue2, address_queue3):    # 对租房详情页面进行解析,获取特定信息
 37     soup = BeautifulSoup(room_html, \'lxml\')
 38     pinpai = soup.find(\'p\', {\'class\': \'content__aside__list--subtitle oneline\'}).text.strip().split(\' \')[0]
 39     price = soup.find_all(\'li\', {\'class\': \'table_col\'})
 40     zujin = price[6].text    # 租金
 41     yajin = price[7].text    # 押金
 42     fuwufei = price[8].text    # 服务费
 43     zhongjiefei = price[9].text    # 中介费
 44     house_type = soup.find(\'ul\', {\'class\': \'content__aside__list\'}).find_all(\'li\')[1].text[5:11]    # 户型
 45     x = soup.find_all(\'li\', {\'class\': \'fl oneline\'})
 46     area = x[1].text[3:]  # 面积
 47     floor = x[7].text[3:]    # 楼层
 48     direction = x[2].text[3:]    # 朝向
 49     elevator = x[8].text[3:]    # 有无电梯
 50     carport = x[10].text[3:]    # 有无车位
 51     tenancy = x[18].text[3:]    # 租期
 52     maintenance = x[4].text[3:]    # 维护日期
 53     kanfang = x[21].text[3:]   # 看房是否要预约
 54     tags = soup.find(\'p\', {\'class\': \'content__aside--tags\'}).get_text().replace(\'\n\', \'\')    # 标签
 55 
 56     yield {
 57         \'pinpai\': pinpai,
 58         \'zujin\': zujin,
 59         \'yajin\': yajin,
 60         \'fuwufei\': fuwufei,
 61         \'zhongjiefei\': zhongjiefei,
 62         \'house_type\': house_type,
 63         \'area\': area,
 64         \'floor\': floor,
 65         \'direction\': direction,
 66         \'elevator\': elevator,
 67         \'carport\': carport,
 68         \'tenancy\': tenancy,
 69         \'maintenance\': maintenance,
 70         \'kanfang\': kanfang,
 71         \'location1\': address_queue1.pop(),
 72         \'location2\': address_queue2.pop(),
 73         \'location3\': address_queue3.pop(),
 74         \'tags\': tags,
 75     }
 76 
 77 
 78 def save_to_mongo(result):
 79     if db[MONGO_TABLE].insert_one(result):
 80         print(\'存储到mongodb成功\', result)
 81         return True
 82     return False
 83 
 84 
 85 def main(page):
 86     url = \'http://sz.xxxxx.com/zufang/pg\' + str(page) + \'rt200600000002/#contentList\'
 87     html = get_one_page_html(url)
 88     room_urls, addresses = get_room_url(html)
 89     address_queue1 = []    # 采用队列数据结构,先进先出,用来存放租房区域(南山区、福田区等)
 90     address_queue2 = []
 91     address_queue3 = []    # 采用队列数据结构,先进先出,用来存放租房具体小区
 92     for address in addresses:
 93         temp = address.find_all(\'a\')
 94         address_queue1.insert(0, temp[0].text)
 95         address_queue2.insert(0, temp[1].text)
 96         address_queue3.insert(0, temp[2].text)
 97     for room_url in room_urls:
 98         room_url_href = room_url.attr(\'href\')
 99         room_url_href = \'http://sz.xxxxx.com/\' + room_url_href
100         room_html = get_one_page_html(room_url_href)
101         if room_html is None:    # 非常重要,否则room_html为None时会报错
102             pass
103         else:
104             # parser_room_page(room_html, address_queue1, address_queue2, address_queue3)
105             results = parser_room_page(room_html, address_queue1, address_queue2, address_queue3)
106             for result in results:
107                 save_to_mongo(result)
108 
if __name__ == '__main__':
    started = time.time()
    # Multiple processes speed up crawling. The context manager shuts the
    # worker processes down once map() has returned all results.
    with Pool() as pool:
        pool.map(main, range(1, 101))
    print(time.time() - started)  # elapsed seconds

 

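# Copyright notice: this is an original article by chang2021, released under
# the CC 4.0 BY-SA license. When reposting, include a link to the original
# source and this notice.
# Article link: https://www.cnblogs.com/chang2021/p/14073855.html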