爬取58同城泉州二手房价格以及每平米价格信息

一·设计方案

1.爬虫名称：爬取58同城泉州二手房价格

2.爬取内容：爬取二手房地址，价格，每平米价格。

3.网络的方案概述：

实现思路：首先访问要爬取的页面寻找源代码，使用BeautifulSoup来爬取数据，将爬取到的数据进行永久保存，存入excel表格中，再对数据进行处理以及统计，再分析数据及可视化。

技术难点：部分价格在网页源代码中显示为乱码；此外，建立回归方程也有一定的数学难度。

二·主题页面的结构特征分析

1.主题页面的结构与特征分析：首先查看网页源代码中的标签，发现要爬取的地址所在标签为 <div class="list-info">，然后找到所有这样的标签；再找到价格标签 <p class="sum">，以及每平米价格标签 <p class="unit">。

2.页面解析

三.网络爬虫程序设计

1.数据爬取以及数据永久保存

import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import sklearn
from sklearn.linear_model import LinearRegression
import scipy.stats as sts
from scipy.optimize import leastsq
import seaborn as sns

# Send a GET request for the 58.com Quanzhou second-hand housing listing page.
url=\'https://qz.58.com/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d100000-0012-357d-7e59-a5f07cbc175c&ClickID=4\'
# Browser-like User-Agent header sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
r = requests.get(url, timeout=30, headers=headers)
# Raise an exception if the server answered with an HTTP error status.
r.raise_for_status()              
# Switch the response encoding to the one detected from the body content.
r.encoding = r.apparent_encoding
html = r.text
# Parse the page; `soup` is what the extraction steps below search.
soup=BeautifulSoup(html,\'html.parser\')

#第二种方式请求
\'\'\'res = requests.get("https://qz.58.com/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d100000-0012-357d-7e59-a5f07cbc175c&ClickID=4")
soup = bs4.BeautifulSoup(res.text,"html.parser")
targets = soup.find_all("div",class_="title")
for each in targets:
    print(each.a.span.text)\'\'\'

# Text of every listing container (used as the address / name column).
locate = [div.text for div in soup.find_all("div", class_="list-info")]

# Total-price text of every listing.
price = [p.text for p in soup.find_all("p", class_="sum")]

# Price-per-square-metre text of every listing.
money = [p.text for p in soup.find_all("p", class_="unit")]

row_format = "{:^5}\t{:^5}\t{:^6}\t{:^15}"
print("{:^30}".format("泉州二手房价格"))
print(row_format.format("顺序","地址","价格","每平米价格"))

# Build the result table from at most the first 20 listings.
# zip() stops at the shortest of the three lists, so a page that yields
# fewer than 20 matches (or a different count per selector) produces a
# shorter table instead of raising IndexError as the fixed range(20) did.
houses = []
for i, (address, total, per_sqm) in enumerate(
        zip(locate[:20], price[:20], money[:20]), start=1):
    print(row_format.format(i, address, total, per_sqm))
    houses.append([i, address, total, per_sqm])

# Persist the table to an Excel workbook for the later analysis steps.
df = pd.DataFrame(houses, columns=['顺序', '地址', '价格', '每平米价格'])
df.to_excel('58同城二手房价格表.xlsx')

由于爬取到的地址文本过长，控制台输出显得比较杂乱，故下面提供已导出的excel表格内容。

2.读取excel文件

# Load the Excel workbook (it is .xlsx, not CSV) back into a DataFrame.
df = pd.read_excel('58同城二手房价格表.xlsx')
print(df.head())

# Flag rows that duplicate an earlier row.
print(df.duplicated())

# Count null vs. non-null entries in the price column.
a = df['价格'].isna().value_counts()
print(a)

# Summary statistics, used to look for outliers.
print(df.describe())

5.画散点图（分别是顺序与价格、顺序与每平米价格的关系）

# Scatter plot: listing order vs. total price.
plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
top_prices = price[:20]
# Derive the x positions from the data: a fixed range(1, 21) raises a
# size-mismatch error whenever fewer than 20 prices were scraped.
# NOTE(review): `price` holds the scraped text; matplotlib treats strings as
# categories, so convert to numbers first if a numeric axis is wanted.
plt.scatter(range(1, len(top_prices) + 1), top_prices)
plt.xlabel('顺序')
plt.ylabel('价格')
plt.title('靠前推荐二手房前20的价格散点图')
plt.show()

# Scatter plot: listing order vs. price per square metre, from the DataFrame.
plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
x = df.顺序
y = df.每平米价格
plt.xlabel("顺序")
plt.ylabel("每平米价格")
plt.scatter(x, y, color="purple", label="散点")
plt.title("每平米价格散点图")
plt.legend()
plt.show()

6.画折线图

#折线图
def line_diagram():
    """Draw total price against listing order as a line with point markers.

    Reads the columns '顺序' and '价格' from the module-level DataFrame
    ``df`` and shows the figure. Returns nothing.
    """
    order, total = df['顺序'], df['价格']
    # Line first, then the markers on top of it.
    plt.plot(order, total)
    plt.scatter(order, total)
    plt.xlabel('顺序')
    plt.ylabel('价格')
    plt.title("顺序与价格折线图")
    plt.show()


line_diagram()

7.画柱状图

# Bar chart: listing order vs. price per square metre.
plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
top_unit_prices = money[:20]
# Derive the x positions from the data: a fixed range(1, 21) raises a
# shape-mismatch error whenever fewer than 20 values were scraped.
# NOTE(review): `money` holds the scraped text, not numbers; convert before
# plotting if numeric bar heights are intended.
plt.bar(range(1, len(top_unit_prices) + 1), top_unit_prices)
plt.xlabel('顺序')
plt.ylabel('每平米价格')
plt.title('靠前推荐二手房前20的价格柱状图')
plt.show()

8.回归分析

import numpy as np
import pandas as pd
import seaborn as sns

# Reload the saved table for the regression step.
df = pd.read_excel('58同城二手房价格表.xlsx')
df.head(20)
from sklearn.linear_model import LinearRegression

# Regress price on listing order only.  The previous feature matrix
# (everything except '地址') still contained the target column '价格' itself,
# so the fit was a trivial identity and the coefficients were meaningless.
# NOTE(review): assumes the '价格' column is numeric; the scraper stores the
# raw page text, so verify or convert it before fitting.
X = df[['顺序']]
predict_model = LinearRegression()
predict_model.fit(X, df['价格'])
# Inspect the slope to judge the direction of the relationship.
print("回归系数为:", predict_model.coef_)
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.regplot(x=df.价格, y=df.顺序)

9.附上完整代码

import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import sklearn
from sklearn.linear_model import LinearRegression
import scipy.stats as sts
from scipy.optimize import leastsq
import seaborn as sns

# Send a GET request for the 58.com Quanzhou second-hand housing listing page.
url=\'https://qz.58.com/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d100000-0012-357d-7e59-a5f07cbc175c&ClickID=4\'
# Browser-like User-Agent header sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
r = requests.get(url, timeout=30, headers=headers)
# Raise an exception if the server answered with an HTTP error status.
r.raise_for_status()              
# Switch the response encoding to the one detected from the body content.
r.encoding = r.apparent_encoding
html = r.text
# Parse the page; `soup` is what the extraction steps below search.
soup=BeautifulSoup(html,\'html.parser\')

#第二种方式请求
\'\'\'res = requests.get("https://qz.58.com/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d100000-0012-357d-7e59-a5f07cbc175c&ClickID=4")
soup = bs4.BeautifulSoup(res.text,"html.parser")
targets = soup.find_all("div",class_="title")
for each in targets:
    print(each.a.span.text)\'\'\'

# Text of every listing container (used as the address / name column).
locate = [div.text for div in soup.find_all("div", class_="list-info")]

# Total-price text of every listing.
price = [p.text for p in soup.find_all("p", class_="sum")]

# Price-per-square-metre text of every listing.
money = [p.text for p in soup.find_all("p", class_="unit")]

row_format = "{:^5}\t{:^5}\t{:^6}\t{:^15}"
print("{:^30}".format("泉州二手房价格"))
print(row_format.format("顺序","地址","价格","每平米价格"))

# Build the result table from at most the first 20 listings.
# zip() stops at the shortest of the three lists, so a page that yields
# fewer than 20 matches (or a different count per selector) produces a
# shorter table instead of raising IndexError as the fixed range(20) did.
houses = []
for i, (address, total, per_sqm) in enumerate(
        zip(locate[:20], price[:20], money[:20]), start=1):
    print(row_format.format(i, address, total, per_sqm))
    houses.append([i, address, total, per_sqm])

# Persist the table to an Excel workbook for the later analysis steps.
df = pd.DataFrame(houses, columns=['顺序', '地址', '价格', '每平米价格'])
df.to_excel('58同城二手房价格表.xlsx')

# Read the Excel workbook (it is .xlsx, not CSV) back into a DataFrame.
df = pd.DataFrame(pd.read_excel(\'58同城二手房价格表.xlsx\'))
print(df.head())

# Disabled experiment kept as an unused string literal: dropping the address column.
\'\'\'df=pd.read_excel(\'58同城二手房价格表.xlsx\')
print(df)
df.drop(\'地址\'，axis=1,inplace=True)
print(pd.head())
df.drop(columns=["地址"])
df.head(20)\'\'\'

# Flag rows that duplicate an earlier row.
print(df.duplicated())

 # Count null vs. non-null entries in the price column.
a=df[\'价格\'].isnull().value_counts()
print(a)

# Summary statistics, used to look for outliers.
print(df.describe())

# Scatter plot: listing order vs. total price.
plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
top_prices = price[:20]
# Derive the x positions from the data: a fixed range(1, 21) raises a
# size-mismatch error whenever fewer than 20 prices were scraped.
# NOTE(review): `price` holds the scraped text; matplotlib treats strings as
# categories, so convert to numbers first if a numeric axis is wanted.
plt.scatter(range(1, len(top_prices) + 1), top_prices)
plt.xlabel('顺序')
plt.ylabel('价格')
plt.title('靠前推荐二手房前20的价格散点图')
plt.show()

# Scatter plot: listing order vs. price per square metre, from the DataFrame.
plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
x = df.顺序
y = df.每平米价格
plt.xlabel("顺序")
plt.ylabel("每平米价格")
plt.scatter(x, y, color="purple", label="散点")
plt.title("每平米价格散点图")
plt.legend()
plt.show()

#折线图
def line_diagram():
    """Draw total price against listing order as a line with point markers.

    Reads the columns '顺序' and '价格' from the module-level DataFrame
    ``df`` and shows the figure. Returns nothing.
    """
    order, total = df['顺序'], df['价格']
    # Line first, then the markers on top of it.
    plt.plot(order, total)
    plt.scatter(order, total)
    plt.xlabel('顺序')
    plt.ylabel('价格')
    plt.title("顺序与价格折线图")
    plt.show()


line_diagram()

#分布图
\'\'\'sns.jointplot(x="顺序",y=\'价格\',data = df)

sns.jointplot(x="顺序",y=\'价格\',data = df, kind=\'reg\')

sns.jointplot(x="顺序",y=\'价格\',data = df, kind=\'hex\')

sns.jointplot(x="顺序",y=\'价格\',data = df, kind=\'kde\', space=0,color=\'p\')

sns.kdeplot(df[\'顺序\'], df[\'价格\'])\'\'\'

# Bar chart: listing order vs. price per square metre.
plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
top_unit_prices = money[:20]
# Derive the x positions from the data: a fixed range(1, 21) raises a
# shape-mismatch error whenever fewer than 20 values were scraped.
# NOTE(review): `money` holds the scraped text, not numbers; convert before
# plotting if numeric bar heights are intended.
plt.bar(range(1, len(top_unit_prices) + 1), top_unit_prices)
plt.xlabel('顺序')
plt.ylabel('每平米价格')
plt.title('靠前推荐二手房前20的价格柱状图')
plt.show()


\'\'\'df=pd.read_excel(\'58同城二手房价格表.xlsx\')
df.head(20)
X=df.drop("地址",axis=1)
predict_model=LinearRegression()
predict_model.fit(X,df[\'价格\'])
print("回归系数为:",predict_model.coef_)    #判断相关性
sns.regplot(df.价格,df.顺序)\'\'\'

\'\'\'sns.set_style("write")
sns.lmplot(x=\'顺序\',y=\'价格\',col=\'一元线性回归方程\',data=df)
plt.show()\'\'\'


\'\'\'Y=np.array(df.顺序)
X=np.array(df.价格)

xi=X.reshape(-1,1)
yi=Y.reshape(-1,1)


model = df.LinearRegression()
model.fit(xi.reshape(-1,1),yi.rehsape(-1,1))
b=model.intercept_
a=model.caef_
print("A=",a,"B=",b)

c = model.predict([[50]])

print("价格预测为:",c)\'\'\'

# Linear regression analysis and regression plot.
df = pd.read_excel('58同城二手房价格表.xlsx')
df.head(20)
from sklearn.linear_model import LinearRegression

# Regress price on listing order only.  The previous feature matrix
# (everything except '地址') still contained the target column '价格' itself,
# so the fit was a trivial identity and the coefficients were meaningless.
# NOTE(review): assumes the '价格' column is numeric; the scraper stores the
# raw page text, so verify or convert it before fitting.
X = df[['顺序']]
predict_model = LinearRegression()
predict_model.fit(X, df['价格'])
# Inspect the slope to judge the direction of the relationship.
print("回归系数为:", predict_model.coef_)
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.regplot(x=df.价格, y=df.顺序)

四.结论

1.通过对二手房信息的爬取与分析，可以看出房子的总价和每平米价格会因房地产商不同、楼层不同而有所差异，但这两个变量之间大体上成正比关系。

2.通过此次爬虫练习，我颇有感悟：如今计算机科学发展进步如此之大，以前想要获得大量数据只能靠人工抄录，而有了网络爬虫之后，我们的生活变得更加快捷、更加方便。对我个人而言，以前很少接触计算机这门学科，印象中一直迷迷糊糊、朦朦胧胧；上大学之后，我对它的印象更加深刻，也学到了技术。但我仍有许多不足，对各种库的运用也不够熟练，接下来的日子里，我会更加努力地学习这门计算机课程。

发表于
2020-04-21 16:29
许煜炘
阅读(353)
评论(0)
编辑
收藏
举报

本文链接：https://www.cnblogs.com/xyx20010705/p/12745509.html

爬取58同城泉州二手房价格以及每平米价格信息

爬取58同城泉州二手房价格以及每平米价格信息

爬取58同城泉州二手房价格以及每平米价格信息的更多相关文章

随机推荐

热门专题

目录导航