python爬虫爬取链家二手房信息
# coding=utf-8
"""Scrape second-hand housing listings from Lianjia (Nanjing) into JSON and CSV."""
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import json
import csv
import time

# Randomized Chrome user-agent to reduce the chance of being blocked.
userAgent = UserAgent()
headers = {'user-agent': userAgent.Chrome}

# Accumulates one dict per listing across all crawled pages.
data_list = []


def start_spider(page):
    """Fetch one listing page and append each listing's fields to data_list.

    page: 1-based page number of the Nanjing ershoufang results.
    A malformed individual listing is logged and skipped, not fatal.
    """
    # Allow retries for flaky connections.
    requests.adapters.DEFAULT_RETRIES = 15
    s = requests.session()
    # Close the connection after each request instead of keeping it alive.
    s.keep_alive = False
    # Crawl URL; defaults to Nanjing (nj) Lianjia second-hand listings.
    url = 'https://nj.lianjia.com/ershoufang/pg{}/'.format(page)
    # BUG FIX: request through the configured session (s.get), not the
    # module-level requests.get — otherwise the keep_alive setting above
    # never applies.
    resp = s.get(url, headers=headers, timeout=10)
    # Parse the response body with BeautifulSoup.
    soup = BeautifulSoup(resp.content, 'lxml')
    # Each listing is an <li class="LOGCLICKDATA"> inside .sellListContent.
    sellListContent = soup.select('.sellListContent li.LOGCLICKDATA')
    for sell in sellListContent:
        try:
            # Listing title
            title = sell.select('div.title a')[0].string
            # Grab the whole houseInfo div text, then pick fields out of it.
            houseInfo = list(sell.select('div.houseInfo')[0].stripped_strings)
            # houseInfo[0] looks like "estate | layout | area | facing | renovation ..."
            info = houseInfo[0].split('|')
            # BUG FIX: the estate name is the first '|'-separated field,
            # not the entire un-split string.
            loupan = info[0].strip()
            # Layout (e.g. 3室1厅)
            house_type = info[1].strip()
            # Floor area
            area = info[2].strip()
            # Facing direction
            toward = info[3].strip()
            # Renovation state
            renovation = info[4].strip()
            # Address / position info
            positionInfo = ''.join(sell.select('div.positionInfo')[0].stripped_strings)
            # Total price
            totalPrice = ''.join(sell.select('div.totalPrice')[0].stripped_strings)
            # Price per square meter
            unitPrice = list(sell.select('div.unitPrice')[0].stripped_strings)[0]
            # One record per listing.
            data_dict = {}
            data_dict['title'] = title
            data_dict['loupan'] = loupan
            data_dict['house_type'] = house_type
            data_dict['area'] = area
            data_dict['toward'] = toward
            data_dict['renovation'] = renovation
            data_dict['positionInfo'] = positionInfo
            data_dict['totalPrice'] = totalPrice
            data_dict['unitPrice'] = unitPrice
            data_list.append(data_dict)
        except Exception as e:
            # One bad listing should not abort the whole page.
            print(e)
            continue


def main():
    """Crawl 10 pages, then dump the collected listings to JSON and CSV."""
    # BUG FIX: range(1, 10) only covered 9 pages despite the "10 pages"
    # intent; range(1, 11) crawls pages 1..10.
    for page in range(1, 11):
        start_spider(page)
        time.sleep(3)  # be polite to the server between pages

    # BUG FIX: open with 'w' (not 'a+') so a rerun overwrites the file
    # instead of appending a second JSON document and corrupting it.
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('json文件写入完成')

    # Guard: data_list[0].keys() would raise IndexError on an empty crawl.
    if not data_list:
        return

    with open('./data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        print(data_list)
        # Header row comes from the first record's keys.
        title = data_list[0].keys()
        writer = csv.DictWriter(f, title)
        writer.writeheader()
        # Batch-write all records.
        writer.writerows(data_list)
    print('csv文件写入完成')


if __name__ == '__main__':
    main()
版权声明:本文为wuli1427102168原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。