使用 Python 抓取并分析北京链家地产二手房信息
# Scrape second-hand housing listings for Beijing from Lianjia, then analyse
# them: layout / floor-area / follower-count distributions and a 3-cluster
# KMeans over total price, floor area and follower count.
import re

import numpy as np
import pandas as pd


def split_column(series, sep, names):
    """Split a string column on ``sep`` into exactly ``len(names)`` columns.

    Whitespace around every piece is removed. Rows that yield fewer pieces
    than ``names`` are padded with missing values and surplus pieces are
    dropped, so one irregular listing cannot abort the whole run with a
    "N columns passed, passed data had M columns" error.

    Args:
        series: pandas Series of strings to split.
        sep: literal separator text.
        names: column names for the resulting pieces, in order.

    Returns:
        DataFrame indexed like ``series`` with one column per entry of
        ``names``.
    """
    # Fold the surrounding whitespace into the separator so each piece
    # comes out already stripped.
    pattern = r'\s*' + re.escape(sep) + r'\s*'
    parts = series.str.strip().str.split(pattern, expand=True)
    # reindex both truncates extra pieces and adds all-missing columns.
    parts = parts.reindex(columns=range(len(names)))
    parts.columns = list(names)
    return parts


def bin_values(values, bins, labels):
    """Assign each value to a labelled bin, keeping the lowest edge.

    ``pd.cut`` builds right-closed intervals by default, which would leave
    a value equal to ``bins[0]`` (for example a listing with 0 followers)
    unassigned; ``include_lowest=True`` closes the first interval on the
    left so such rows are counted.

    Args:
        values: numeric pandas Series.
        bins: ascending bin edges.
        labels: one label per interval (``len(bins) - 1`` entries).

    Returns:
        Categorical Series of labels aligned with ``values``.
    """
    return pd.cut(values, bins, labels=labels, include_lowest=True)


def plot_counts(counts, ylabel, title, font_size):
    """Draw a horizontal bar chart of ``counts`` and show it.

    Tick labels are taken from ``counts.index`` so the number of labels
    always equals the number of bars. Axis limits are left to matplotlib's
    autoscaling rather than fixed constants, so the chart fits however
    many listings were scraped.

    Args:
        counts: pandas Series of counts; its index supplies the bar labels.
        ylabel: y-axis caption.
        title: chart title.
        font_size: font size passed to ``plt.rc``.
    """
    import matplotlib.pyplot as plt

    positions = np.arange(1, len(counts) + 1)
    plt.rc('font', family='STXihei', size=font_size)
    plt.barh(positions, counts.values, color='#052B6C', alpha=0.8,
             align='center', edgecolor='white')
    plt.ylabel(ylabel)
    plt.xlabel('数量')
    plt.title(title)
    plt.legend(['数量'], loc='upper right')
    plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y',
             alpha=0.4)
    plt.yticks(positions, [str(label) for label in counts.index])
    plt.show()


if __name__ == "__main__":
    import time

    import requests
    from bs4 import BeautifulSoup
    from sklearn.cluster import KMeans

    # Fixed part of the listing-page URL.
    url = 'http://bj.lianjia.com/ershoufang/'
    # Variable (page-number) part of the URL.
    page = 'pg'

    # Request headers sent with every page fetch.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip',
        'Connection': 'close',
        'Referer': 'http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&wd=&eqid=c3435a7d00006bd600000003582bfd1f',
    }

    # Fetch listing pages 1-9 and concatenate their raw HTML.
    pages = []
    for i in range(1, 10):
        r = requests.get(url=url + page + str(i) + '/', headers=headers,
                         timeout=10)
        # Stop on an HTTP error instead of parsing an error page as data.
        r.raise_for_status()
        pages.append(r.content)
        # Pause 0.5 seconds between requests.
        time.sleep(0.5)
    html = b''.join(pages)

    # Parse the fetched pages.
    lj = BeautifulSoup(html, 'html.parser')

    # Total price of each listing.
    tp = [div.span.string
          for div in lj.find_all('div', attrs={'class': 'priceInfo'})]
    # Free-text house description of each listing.
    hi = [div.get_text()
          for div in lj.find_all('div', attrs={'class': 'houseInfo'})]
    # Free-text follower / viewing information of each listing.
    fi = [div.get_text()
          for div in lj.find_all('div', attrs={'class': 'followInfo'})]

    # Build the data table; pandas raises ValueError if the three lists
    # differ in length.
    house = pd.DataFrame({'totalprice': tp, 'houseinfo': hi,
                          'followinfo': fi})
    print(house.head())

    # Split the house description into its fields and attach them.
    houseinfo_split = split_column(
        house['houseinfo'], '|',
        ['xiaoqu', 'huxing', 'mianji', 'chaoxiang', 'zhuangxiu', 'dianti'])
    print(houseinfo_split.head())
    house = house.join(houseinfo_split)
    # The table now holds both the original and the newly split columns.
    print(house.head())

    # Split the follower information into its fields and attach them.
    followinfo_split = split_column(house['followinfo'], '/',
                                    ['guanzhu', 'daikan', 'fabu'])
    house = house.join(followinfo_split)

    # Number of listings per layout.
    huxing = house.groupby('huxing').size()
    print(huxing)

    # Bar chart of the layout distribution.
    plot_counts(huxing, '户型', '房源户型分布情况', 11)

    # Split the area field into its number and its unit.
    mianji_num_split = split_column(house['mianji'], '平',
                                    ['mianji_num', 'mi'])
    house = house.join(mianji_num_split)
    house['mianji_num'] = house['mianji_num'].astype(float)

    # Range of floor areas; the author's run printed
    # (18.850000000000001, 332.63).
    print(house['mianji_num'].min(), house['mianji_num'].max())

    # Group listings by floor area and count each group.
    bins = [0, 50, 100, 150, 200, 250, 300, 350]
    mianji_labels = ['小于50', '50-100', '100-150', '150-200', '200-250',
                     '250-300', '300-350']
    house['group_mianji'] = bin_values(house['mianji_num'], bins,
                                       mianji_labels)
    # value_counts on a categorical keeps every bin (zero counts included)
    # in bin order, so the chart always has one bar per label.
    group_mianji = house['group_mianji'].value_counts(sort=False)

    # Bar chart of the floor-area distribution.
    plot_counts(group_mianji, '面积分组', '房源面积分布', 15)

    # Split the follower field into its number and trailing text.
    guanzhu_num_split = split_column(house['guanzhu'], '人',
                                     ['guanzhu_num', 'ren'])
    house = house.join(guanzhu_num_split)
    # Convert follower count and total price to float.
    house[['guanzhu_num', 'totalprice']] = (
        house[['guanzhu_num', 'totalprice']].astype(float))

    # Range of follower counts; the author's run printed (0.0, 725.0).
    print(house['guanzhu_num'].min(), house['guanzhu_num'].max())

    # Group listings by follower count and count each group.
    bins = [0, 100, 200, 300, 400, 500, 600, 700, 800]
    guanzhu_labels = ['小于100', '100-200', '200-300', '300-400', '400-500',
                      '500-600', '600-700', '700-800']
    house['group_guanzhu'] = bin_values(house['guanzhu_num'], bins,
                                        guanzhu_labels)
    group_guanzhu = house['group_guanzhu'].value_counts(sort=False)

    # Bar chart of the follower-count distribution.
    plot_counts(group_guanzhu, '关注度分组', '房源关注度分布', 15)

    # Cluster on total price, floor area and follower count. Rows with a
    # missing feature are excluded because KMeans rejects NaN.
    features = house[['totalprice', 'mianji_num', 'guanzhu_num']].dropna()
    house_type = np.array(features)
    # Three centroids; a fixed seed and explicit n_init make reruns
    # reproducible.
    clf = KMeans(n_clusters=3, n_init=10, random_state=0)
    clf = clf.fit(house_type)

    # Cluster centres. The author's run printed:
    # array([[ 772.97477064, 112.02389908, 58.96330275],
    #        [ 434.51073861,  84.92950236, 61.20115244],
    #        [1473.26719577, 170.65402116, 43.32275132]])
    print(clf.cluster_centers_)

    # Tag every clustered row with its cluster label; excluded rows get NaN.
    house['label'] = pd.Series(clf.labels_, index=features.index)
版权声明:本文为leonardchen原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。