import time
from collections import deque
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

import pymongo
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

from config import *
  9. 9
client = pymongo.MongoClient(MONGO_URL)  # MongoDB connection (MONGO_URL comes from config)
db = client[MONGO_DB]  # database handle shared by save_to_mongo
  12. 12
  13. 13 def get_one_page_html(url): # 获取网站每一页的html
  14. 14 headers = {
  15. 15 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
  16. 16 "Chrome/85.0.4183.121 Safari/537.36"
  17. 17 }
  18. 18 try:
  19. 19 response = requests.get(url, headers=headers)
  20. 20 if response.status_code == 200:
  21. 21 return response.text
  22. 22 else:
  23. 23 return None
  24. 24 except RequestException:
  25. 25 return None
  26. 26
  27. 27
  28. 28 def get_room_url(html): # 获取当前页面上所有room_info的url
  29. 29 soup = BeautifulSoup(html, \'lxml\')
  30. 30 addresses = soup.find_all(\'p\', {\'class\': \'content__list--item--des\'})
  31. 31 doc = pq(html)
  32. 32 room_urls = doc(\'.content__list--item--main .content__list--item--title a\').items()
  33. 33 return room_urls, addresses
  34. 34
  35. 35
  36. 36 def parser_room_page(room_html, address_queue1, address_queue2, address_queue3): # 对租房详情页面进行解析,获取特定信息
  37. 37 soup = BeautifulSoup(room_html, \'lxml\')
  38. 38 pinpai = soup.find(\'p\', {\'class\': \'content__aside__list--subtitle oneline\'}).text.strip().split(\' \')[0]
  39. 39 price = soup.find_all(\'li\', {\'class\': \'table_col\'})
  40. 40 zujin = price[6].text # 租金
  41. 41 yajin = price[7].text # 押金
  42. 42 fuwufei = price[8].text # 服务费
  43. 43 zhongjiefei = price[9].text # 中介费
  44. 44 house_type = soup.find(\'ul\', {\'class\': \'content__aside__list\'}).find_all(\'li\')[1].text[5:11] # 户型
  45. 45 x = soup.find_all(\'li\', {\'class\': \'fl oneline\'})
  46. 46 area = x[1].text[3:] # 面积
  47. 47 floor = x[7].text[3:] # 楼层
  48. 48 direction = x[2].text[3:] # 朝向
  49. 49 elevator = x[8].text[3:] # 有无电梯
  50. 50 carport = x[10].text[3:] # 有无车位
  51. 51 tenancy = x[18].text[3:] # 租期
  52. 52 maintenance = x[4].text[3:] # 维护日期
  53. 53 kanfang = x[21].text[3:] # 看房是否要预约
  54. 54 tags = soup.find(\'p\', {\'class\': \'content__aside--tags\'}).get_text().replace(\'\n\', \'\') # 标签
  55. 55
  56. 56 yield {
  57. 57 \'pinpai\': pinpai,
  58. 58 \'zujin\': zujin,
  59. 59 \'yajin\': yajin,
  60. 60 \'fuwufei\': fuwufei,
  61. 61 \'zhongjiefei\': zhongjiefei,
  62. 62 \'house_type\': house_type,
  63. 63 \'area\': area,
  64. 64 \'floor\': floor,
  65. 65 \'direction\': direction,
  66. 66 \'elevator\': elevator,
  67. 67 \'carport\': carport,
  68. 68 \'tenancy\': tenancy,
  69. 69 \'maintenance\': maintenance,
  70. 70 \'kanfang\': kanfang,
  71. 71 \'location1\': address_queue1.pop(),
  72. 72 \'location2\': address_queue2.pop(),
  73. 73 \'location3\': address_queue3.pop(),
  74. 74 \'tags\': tags,
  75. 75 }
  76. 76
  77. 77
  78. 78 def save_to_mongo(result):
  79. 79 if db[MONGO_TABLE].insert_one(result):
  80. 80 print(\'存储到mongodb成功\', result)
  81. 81 return True
  82. 82 return False
  83. 83
  84. 84
  85. 85 def main(page):
  86. 86 url = \'http://sz.xxxxx.com/zufang/pg\' + str(page) + \'rt200600000002/#contentList\'
  87. 87 html = get_one_page_html(url)
  88. 88 room_urls, addresses = get_room_url(html)
  89. 89 address_queue1 = [] # 采用队列数据结构,先进先出,用来存放租房区域(南山区、福田区等)
  90. 90 address_queue2 = []
  91. 91 address_queue3 = [] # 采用队列数据结构,先进先出,用来存放租房具体小区
  92. 92 for address in addresses:
  93. 93 temp = address.find_all(\'a\')
  94. 94 address_queue1.insert(0, temp[0].text)
  95. 95 address_queue2.insert(0, temp[1].text)
  96. 96 address_queue3.insert(0, temp[2].text)
  97. 97 for room_url in room_urls:
  98. 98 room_url_href = room_url.attr(\'href\')
  99. 99 room_url_href = \'http://sz.xxxxx.com/\' + room_url_href
  100. 100 room_html = get_one_page_html(room_url_href)
  101. 101 if room_html is None: # 非常重要,否则room_html为None时会报错
  102. 102 pass
  103. 103 else:
  104. 104 # parser_room_page(room_html, address_queue1, address_queue2, address_queue3)
  105. 105 results = parser_room_page(room_html, address_queue1, address_queue2, address_queue3)
  106. 106 for result in results:
  107. 107 save_to_mongo(result)
  108. 108
  109. 109 if __name__ == \'__main__\':
  110. 110 time1 = time.time()
  111. 111 pool = Pool() # 使用多进程提高爬取效率
  112. 112 pool.map(main, [i for i in range(1, 101)])
  113. 113 time2 = time.time()
  114. 114 print(time2 - time1) # 耗时

 

版权声明:本文为chang2021原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/chang2021/p/14073855.html