利用requests、pyquery、BeautifulSoup爬取深圳市某租房网站的租房信息
"""Crawl Shenzhen rental listings with requests + PyQuery + BeautifulSoup,
storing one document per listing in MongoDB."""

import time
from collections import deque
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import pymongo

from config import *  # expects MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)  # MongoDB connection
db = client[MONGO_DB]                    # target database


def get_one_page_html(url):
    """Fetch *url* and return its HTML text, or None on any failure.

    Returns None for non-200 responses and for any requests-level error,
    so callers can treat every failure mode uniformly.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/85.0.4183.121 Safari/537.36"
    }
    try:
        # Timeout added so a stalled connection cannot hang a pool worker.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    return response.text if response.status_code == 200 else None


def get_room_url(html):
    """Return (iterator over listing <a> elements, list of address <p> tags)
    extracted from one listing-index page."""
    soup = BeautifulSoup(html, 'lxml')
    addresses = soup.find_all('p', {'class': 'content__list--item--des'})
    doc = pq(html)
    room_urls = doc('.content__list--item--main .content__list--item--title a').items()
    return room_urls, addresses


def parser_room_page(room_html, address_queue1, address_queue2, address_queue3):
    """Parse one listing detail page and yield a dict of extracted fields.

    Consumes one entry from each address queue (via .pop(), FIFO given the
    appendleft fill order in main) so addresses stay aligned with listings.
    """
    soup = BeautifulSoup(room_html, 'lxml')
    pinpai = soup.find('p', {'class': 'content__aside__list--subtitle oneline'}).text.strip().split(' ')[0]
    price = soup.find_all('li', {'class': 'table_col'})
    zujin = price[6].text        # rent
    yajin = price[7].text        # deposit
    fuwufei = price[8].text      # service fee
    zhongjiefei = price[9].text  # agency fee
    house_type = soup.find('ul', {'class': 'content__aside__list'}).find_all('li')[1].text[5:11]  # layout
    x = soup.find_all('li', {'class': 'fl oneline'})
    # NOTE(review): the fixed positional indices below assume the site's
    # detail-page layout never changes -- verify against live pages.
    area = x[1].text[3:]          # floor area
    floor = x[7].text[3:]         # floor level
    direction = x[2].text[3:]     # orientation
    elevator = x[8].text[3:]      # elevator available
    carport = x[10].text[3:]      # parking available
    tenancy = x[18].text[3:]      # lease term
    maintenance = x[4].text[3:]   # last maintenance date
    kanfang = x[21].text[3:]      # whether a viewing appointment is required
    tags = soup.find('p', {'class': 'content__aside--tags'}).get_text().replace('\n', '')  # tag labels

    yield {
        'pinpai': pinpai,
        'zujin': zujin,
        'yajin': yajin,
        'fuwufei': fuwufei,
        'zhongjiefei': zhongjiefei,
        'house_type': house_type,
        'area': area,
        'floor': floor,
        'direction': direction,
        'elevator': elevator,
        'carport': carport,
        'tenancy': tenancy,
        'maintenance': maintenance,
        'kanfang': kanfang,
        'location1': address_queue1.pop(),
        'location2': address_queue2.pop(),
        'location3': address_queue3.pop(),
        'tags': tags,
    }


def save_to_mongo(result):
    """Insert one result dict into MongoDB; return True on success."""
    if db[MONGO_TABLE].insert_one(result):
        print('存储到mongodb成功', result)
        return True
    return False


def main(page):
    """Crawl one index page: collect listing URLs + addresses, then parse
    and store every reachable listing detail page."""
    url = 'http://sz.xxxxx.com/zufang/pg' + str(page) + 'rt200600000002/#contentList'
    html = get_one_page_html(url)
    room_urls, addresses = get_room_url(html)
    # FIFO queues (appendleft + pop, O(1) at both ends) pairing each listing
    # with its district / sub-district / estate name.
    address_queue1 = deque()  # district (e.g. Nanshan, Futian)
    address_queue2 = deque()  # sub-district
    address_queue3 = deque()  # estate / compound name
    for address in addresses:
        temp = address.find_all('a')
        address_queue1.appendleft(temp[0].text)
        address_queue2.appendleft(temp[1].text)
        address_queue3.appendleft(temp[2].text)
    for room_url in room_urls:
        room_url_href = 'http://sz.xxxxx.com/' + room_url.attr('href')
        room_html = get_one_page_html(room_url_href)
        if room_html is None:
            # BUGFIX: still consume this listing's addresses; otherwise every
            # later listing on the page is stored with the wrong location.
            address_queue1.pop()
            address_queue2.pop()
            address_queue3.pop()
            continue
        for result in parser_room_page(room_html, address_queue1, address_queue2, address_queue3):
            save_to_mongo(result)


if __name__ == '__main__':
    time1 = time.time()
    pool = Pool()  # one worker per CPU; pages are crawled in parallel
    pool.map(main, range(1, 101))
    time2 = time.time()
    print(time2 - time1)  # elapsed seconds
版权声明:本文为chang2021原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。