爬取链家任意城市二手房数据(天津)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019-08-16 12:40
# @Author  : Anthony
# @Email   : ianghont7@163.com
# @File    : 爬取链家任意城市二手房数据.py
"""Scrape Lianjia second-hand housing listing pages (Tianjin) into an .xls workbook.

Each listing page is downloaded with ``requests``, parsed with ``lxml`` and the
extracted rows are appended to a workbook in the current working directory.
"""

import os
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy

# Browser-like request header sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'
}

# Shared state: if_xls_exits() stores the workbook file name under "xlsName".
xlsInfo = {}

# Seconds to wait for the server before giving up on a page; without a timeout
# a stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10


def _parse_listing(li):
    """Return the eleven spreadsheet cells for one listing ``<li>`` element.

    Returns ``None`` when an expected node is missing or a text field has too
    few ``|`` / ``/`` separated parts, so that one incomplete entry does not
    discard the rest of the page.
    """
    try:
        title = li.xpath('.//div[@class="info clear"]/div[@class="title"]/a/text()')[0]
        # One "|"-separated string carries parts 1-4 used below; evaluate the
        # XPath once instead of once per field.
        house_info = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')
        floor = li.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/text()')[0].split(' ')[0]
        site = li.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a/text()')[0]
        total_price = li.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()')[0] + '万'
        unit_price = li.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()')[0]
        # "followers / publish time" share one "/"-separated string.
        follow_info = li.xpath('.//div[@class="followInfo"]/text()')[0].split('/')
        return [
            title,
            house_info[1],
            house_info[2],
            house_info[3],
            house_info[4],
            floor,
            site,
            total_price,
            unit_price,
            follow_info[0],
            follow_info[1],
        ]
    except IndexError:
        return None


def catchHouseDetail(url):
    """Download one listing page, extract its rows and append them to the workbook.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The list of rows (each a list of eleven strings) parsed from the page.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Load the downloaded HTML into an element tree.
    tree = etree.HTML(response.text)

    # Every <li> under the left content column is treated as one listing.
    li_list = tree.xpath('//div[@class="leftContent"]/ul/li')
    all_house_list = []
    for li in li_list:
        row = _parse_listing(li)
        if row is not None:
            all_house_list.append(row)

    if if_xls_exits():
        write_excel_xls_append(xlsInfo["xlsName"], all_house_list)
    return all_house_list


def write_excel_xls(path, sheet_name, value):
    """Create a new workbook at ``path`` with one sheet filled from ``value``.

    Args:
        path: Destination file name of the workbook.
        sheet_name: Name of the sheet to create.
        value: Sequence of rows; each row is a sequence of cell values.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheet_name)
    for row_index, row in enumerate(value):
        for col_index, cell in enumerate(row):
            sheet.write(row_index, col_index, cell)
    workbook.save(path)
    print("xls格式表格写入数据成功!")


def write_excel_xls_append(path, value):
    """Append the rows in ``value`` below the existing rows of the first sheet.

    Args:
        path: File name of an existing workbook; it is overwritten on save.
        value: Sequence of rows; each row is a sequence of cell values.
    """
    workbook = xlrd.open_workbook(path)
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])
    rows_old = worksheet.nrows  # number of rows already present
    # xlrd workbooks are read-only; copy() turns it into a writable xlwt one.
    new_workbook = copy(workbook)
    new_worksheet = new_workbook.get_sheet(0)
    for row_index, row in enumerate(value):
        for col_index, cell in enumerate(row):
            # Offset by rows_old so the new data lands after the old data.
            new_worksheet.write(row_index + rows_old, col_index, cell)
    new_workbook.save(path)
    print("xls格式表格【追加】写入数据成功!")


def if_xls_exits():
    """Make sure the output workbook exists and record its name in ``xlsInfo``.

    The workbook is created with a single header row when it is not found in
    the current working directory.

    Returns:
        Always ``True``.
    """
    book_name_xls = '天津链家二手房信息表.xls'
    sheet_name_xls = '房屋信息'
    # NOTE(review): the second header reads "房屋户型" while the commented-out
    # draft of the parser labelled that field "小区名称" — confirm which is right.
    value_title = [["房屋标题", "房屋户型", "建筑面积", "房屋朝向", "装修情况", "所在楼层", "所在区域", "总价", "每平米售价", "房屋关注人数", "房屋发布时间"], ]
    if not os.path.exists('./%s' % book_name_xls):
        write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    xlsInfo["xlsName"] = book_name_xls
    return True


def catch():
    """Crawl listing pages 1-999, pausing three seconds after each page.

    A page that fails is reported and skipped so the crawl continues with the
    next page.
    """
    for page_number in range(1, 1000):
        page = 'https://tj.lianjia.com/ershoufang/pg{}/'.format(page_number)
        try:
            catchHouseDetail(page)
        except Exception as exc:
            # Best-effort crawl: report the failure instead of hiding it.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print("Failed to scrape {}: {!r}".format(page, exc))
        time.sleep(3)


if __name__ == '__main__':
    catch()
效果图:
版权声明:本文为ipyanthony原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。