一、选题背景

刚毕业的人往往会因为不了解各个职位之间待遇的差异而迷茫,所以希望通过了解毕业后各职位的待遇等方面信息来做多种参考、货比三家。

 

1.数据来源

前程无忧(https://www.51job.com/)

2.爬取内容

爬取内容包括职位名称,公司名称,地点,薪资,学历要求,以及发布日期等。

 

二、实现爬取的步骤

1.代码所需包

1 import urllib.request
2 import xlwt
3 import re
4 import urllib.parse
5 import time

 

 

2.进入前程无忧官网,搜索职位信息

 

3.打开开发者模式

 

4.模拟浏览器

# Request headers that make urllib look like a desktop Chrome browser;
# without a matching Host / User-Agent 51job may reject the request.
header = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/78.0.3904.108 Safari/537.36'),
}

 

 

5.为了实现爬取,我写了一个能够实现输入想了解的职位就能爬取相关内容的函数

 1 #page是页数,item是输入的字符串
 2 def getfront(page,item):
 3 #先把字符串转成十六进制编码      
 4      result = urllib.parse.quote(item)                    
 5      ur1 = result+\',2,\'+ str(page)+\'.html\'
 6      ur2 = \'https://search.51job.com/list/000000,000000,0000,00,9,99,\'
 7      res = ur2+ur1                                                            #拼接网址
 8      a = urllib.request.urlopen(res)
 9 # 读取源代码并转为unicode
10      html = a.read().decode(\'gbk\')          
11      return html
# One job row in the search-result HTML.  re.S lets .*? span newlines.
# Compiled once at module level instead of on every call (the original
# comment "匹配换行符" / "match newlines" described re.S, not the pattern).
# Capture groups, in order:
#   (job title, job URL, company name, company URL, place, salary, post date)
_JOB_ROW_RE = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?', re.S)


def getInformation(html):
    """Return a list of 7-tuples, one per job row found in `html`."""
    return _JOB_ROW_RE.findall(html)

除了爬取基本信息外,还把职位超链接后的网址,以及公司超链接的网址爬取下来了。

 

6.把爬取的信息以Excel文件形式储存起来,比较清晰直观。

 1 #新建表格空间
 2 excel1 = xlwt.Workbook()
 3 # 设置单元格格式
 4 sheet1 = excel1.add_sheet(\'Job\', cell_overwrite_ok=True)
 5 
 6 sheet1.write(0, 0, \'序号\')
 7 
 8 sheet1.write(0, 1, \'职位\')
 9 
10 sheet1.write(0, 2, \'公司名称\')
11 
12 sheet1.write(0, 3, \'公司地点\')
13 
14 sheet1.write(0, 4, \'公司性质\')
15 
16 sheet1.write(0, 5, \'薪资\')
17 
18 sheet1.write(0, 6, \'学历要求\')
19 
20 sheet1.write(0, 7, \'工作经验\')
21 
22 sheet1.write(0, 8, \'公司规模\')
23 
24 sheet1.write(0, 9, \'公司类型\')
25 
26 sheet1.write(0, 10,\'公司福利\')
27 
28 sheet1.write(0, 11,\'发布时间\')

 

爬取代码如下

 1 number = 1
 2 item = input()
 3 
 4 for j in range(1,1000):
 5     try:
 6         print("正在爬取第"+str(j)+"页数据...")
 7 #调用获取网页原码
 8         html = getfront(j,item)      
 9 
10         for i in getInformation(html):
11             try:
12 #职位网址
13                 url1 = i[1]          
14                 res1 = urllib.request.urlopen(url1).read().decode(\'gbk\')
15                 company = re.findall(re.compile(r\'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?\',re.S),res1)
16 
17                 job_need = re.findall(re.compile(r\'<p class="msg ltype".*?>.*?&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;(.*?)&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;(.*?)&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;.*?</p>\',re.S),res1)
18 
19                 welfare = re.findall(re.compile(r\'<span class="sp4">(.*?)</span>\',re.S),res1)
20                 print(i[0],i[2],i[4],i[5],company[0][0],job_need[2]
21 [0],job_need[1][0],company[0][1],company[0][2],welfare,i[6])
22                 sheet1.write(number,0,number)
23 
24                 sheet1.write(number,1,i[0])
25 
26                 sheet1.write(number,2,i[2])
27 
28                 sheet1.write(number,3,i[4])
29 
30                 sheet1.write(number,4,company[0][0])
31 
32                 sheet1.write(number,5,i[5])
33 
34                 sheet1.write(number,6,job_need[1][0])
35 
36                 sheet1.write(number,7,job_need[2][0])
37 
38                 sheet1.write(number,8,company[0][1])
39 
40                 sheet1.write(number,9,company[0][2])
41 
42                 sheet1.write(number,10,("  ".join(str(i) for i in welfare)))
43 
44                 sheet1.write(number,11,i[6])
45 
46                 number+=1
47                 excel1.save("51job.xls")
48 #休息间隔,避免爬取海量数据时被误判为攻击,IP遭到封禁
49                 time.sleep(0.3) 
50             except:
51                 pass
52     except:
53         pass

 

结果如下:

 

三、数据清洗与处理

1.先打开文件

1 #coding:utf-8
2 import pandas as pd
3 import re
4 
5 #除此之外还要安装xlrd包
6 
# Load the crawler's output (.xls support requires the xlrd package).
result = pd.DataFrame(pd.read_excel(r'51job.xls', sheet_name='Job'))

 

清洗思路:

1、出现有空值的信息,直接删除整行

# Show every row when printing (debugging aid, no truncation),
# then drop any record with at least one missing value.
pd.set_option('display.max_rows', None)
a = result.dropna(axis=0, how='any')

 

2.职位出错(爬取职位与预想职位无关)

# Keep only rows whose job title contains the search keyword, dropping the
# unrelated positions that 51job's fuzzy search also returns.
b = u'数据'
number = 1  # kept: the later cleaning steps still increment this counter
# Vectorized filter -- the original looped over integer indices, dropped
# rows one at a time and hid every error in a bare `except: pass`.
# NaN titles were already removed by the dropna step above.
a = a[a['职位'].astype(str).str.contains(b, regex=False)]

 

3.其他地方出现的信息错位,比如在学历里出现 ‘招多少人’

 1 b2= u\'\'
 2 li2 = a[\'学历要求\']
 3 for i in range(0,len(li2)):
 4     try:
 5         if b2 in li2[i]:
 6             #print(number,li2[i])
 7             number+=1
 8             a = a.drop(i,axis=0)
 9     except:
10         pass

 

 

4.转换薪资单位不一致

 1 b3 =u\'万/年\'
 2 b4 =u\'千/月\'
 3 li3 = a[\'薪资\']
 4 
 5 #注释部分的print都是为了调试用的
 6 
 7 for i in range(0,len(li3)):
 8     try:
 9         if b3 in li3[i]:
10             x = re.findall(r\'\d*\.?\d+\',li3[i])
11             #print(x)
12 
13 #转换成浮点型并保留两位小数
14             min_ = format(float(x[0])/12,\'.2f\')              
15             max_ = format(float(x[1])/12,\'.2f\')
16             li3[i][1] = min_+\'-\'+max_+u\'万/月\'
17         if b4 in li3[i]:
18             x = re.findall(r\'\d*\.?\d+\',li3[i])
19             #print(x)
20 
21             #input()
22             min_ = format(float(x[0])/10,\'.2f\')
23             max_ = format(float(x[1])/10,\'.2f\')
24             li3[i][1] = str(min_+\'-\'+max_+\'万/月\')
25         print(i,li3[i])
26 
27     except:
28         pass

 

清洗完成后保存到新的Excel文件里。

# Persist the cleaned table to a new workbook; index=False omits the
# DataFrame's row index column.
a.to_excel('51job2.xlsx', sheet_name='Job', index=False)

 

四、数据可视化

经过可视化处理能使数据更加直观,更有利于分析,甚至可以说可视化是数据挖掘最重要的内容之一。

1.查看需要的包

1 # -*- coding: utf-8 -*-
2 import pandas as pd
3 import re
4 from pyecharts import Funnel,Pie,Geo
5 import matplotlib.pyplot as plt

 

2.打开文件

# BUG FIX: the cleaning step saved '51job2.xlsx', but the original read
# '51job2.xls' here -- a file that never exists; extension corrected.
file = pd.read_excel(r'51job2.xlsx', sheet_name='Job')
f = pd.DataFrame(file)
# show all rows when printing (debugging aid)
pd.set_option('display.max_rows', None)

3.创建多个列表来单独存放薪资,工作经验,学历要求,公司地点等信息

 1 add = f[\'公司地点\']
 2 sly = f[\'薪资\']
 3 edu = f[\'学历要求\']
 4 exp = f[\'工作经验\']
 5 address =[]
 6 salary = []
 7 education = []
 8 experience = []
 9 for i in range(0,len(f)):
10     try:
11         a = add[i].split(\'-\')
12         address.append(a[0])
13         #print(address[i])
14         s = re.findall(r\'\d*\.?\d+\',sly[i])
15         s1= float(s[0])
16         s2 =float(s[1])
17         salary.append([s1,s2])
18         #print(salary[i])
19         education.append(edu[i])
20         #print(education[i])
21         experience.append(exp[i])
22         #print(experience[i])
23     except:
24        pass

4.工作经验—薪资图 与 学历—薪资图

 1 #定义存放最低薪资的列表
 2 min_s=[]
 3 #定义存放最高薪资的列表
 4 max_s=[]
 5 for i in range(0,len(experience)):
 6     min_s.append(salary[i][0])
 7     max_s.append(salary[i][0])
 8 
 9 my_df = pd.DataFrame({\'experience\':experience, \'min_salay\' : min_s, 
10 #关联工作经验与薪资
11 \'max_salay\' : max_s})
12 data1 = my_df.groupby(\'experience\').mean()[\'min_salay\'].plot(kind=\'line\')
13 plt.show()
14 
15 my_df2 = pd.DataFrame({\'education\':education, \'min_salay\' : min_s, 
16 #关联学历与薪资
17 \'max_salay\' : max_s})
18 data2 = my_df2.groupby(\'education\').mean()[\'min_salay\'].plot(kind=\'line\')
19 plt.show()

5.学历要求圆环图

 1 def get_edu(list):
 2     education2 = {}
 3     for i in set(list):
 4         education2[i] = list.count(i)
 5     return education2
 6 dir1 = get_edu(education)
 7 
 8 # print(dir1)
 9 
10 attr= dir1.keys()
11 value = dir1.values()
12 pie = Pie("学历要求")
13 pie.add("", attr, value, center=[50, 50], is_random=False, radius=[30, 75], rosetype=\'radius\',
14         is_legend_show=False, is_label_show=True,legend_orient=\'vertical\')
15 pie.render(\'学历要求玫瑰图.html\')

 

6.大数据城市需求地理位置分布图

 1 def get_address(list):
 2     address2 = {}
 3     for i in set(list):
 4         address2[i] = list.count(i)
 5 
 6     address2.pop(\'异地招聘\')
 7 
 8     #address2.pop(\'山东\')
 9     #address2.pop(\'怒江\')
10     #address2.pop(\'池州\')
11 
12     return address2
13 
dir2 = get_address(address)

# Scatter map of big-data job demand by city.
geo = Geo("大数据人才需求分布图", title_color="#2E2E2E", title_text_size=24,
          title_top=20, title_pos="center", width=1300, height=600)
geo.add("", dir2.keys(), dir2.values(), type="effectScatter", is_random=True,
        visual_range=[0, 1000], maptype='china', symbol_size=8,
        effect_scale=5, is_visualmap=True)
geo.render('大数据城市需求分布图.html')

 

 

7.工作经验要求漏斗图

 1 def get_experience(list):
 2     experience2 = {}
 3     for i in set(list):
 4 
 5          experience2[i] = list.count(i)
 6 
 7     return experience2
 8 
 9 dir3 = get_experience(experience)
10 
11 #print(dir3)
12 
13 attr3= dir3.keys()
14 value3 = dir3.values()
15 funnel = Funnel("工作经验漏斗图",title_pos=\'center\')
16 
17 funnel.add("", attr3, value3,is_label_show=True,label_pos="inside", label_text_color="#fff",legend_orient=\'vertical\',legend_pos=\'left\')
18 
19 funnel.render(\'工作经验要求漏斗图.html\')

 

 

完整代码:

  1 import urllib.request
  2 import xlwt
  3 import re
  4 import urllib.parse
  5 import time
  6 
  7 header={
  8      \'Host\':\'search.51job.com\',
  9 
 10      \'Upgrade-Insecure-Requests\':\'1\',
 11      \'User-Agent\':\'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36\'
 12  }
 13 
 14 #page是页数,item是输入的字符串
 15 def getfront(page,item):
 16 #先把字符串转成十六进制编码      
 17       result = urllib.parse.quote(item) 
 18                    
 19       ur1 = result+\',2,\'+ str(page)+\'.html\'
 20       ur2 = \'https://search.51job.com/list/000000,000000,0000,00,9,99,\'
 21 #拼接网址
 22       res = ur2+ur1 
 23       a = urllib.request.urlopen(res)
 24 # 读取源代码并转为unicode
 25       html = a.read().decode(\'gbk\')          
 26       return html
 27 
 28  def getInformation(html):
 29      #匹配换行符
 30      reg = re.compile(r\'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?\',re.S)
 31      items=re.findall(reg,html)
 32      return items
 33 
 34 #新建表格空间
 35  excel1 = xlwt.Workbook()
 36 # 设置单元格格式
 37  sheet1 = excel1.add_sheet(\'Job\', cell_overwrite_ok=True)
 38  5 
 39  sheet1.write(0, 0, \'序号\')
 40  
 41  sheet1.write(0, 1, \'职位\')
 42  
 43  sheet1.write(0, 2, \'公司名称\')
 44  
 45  sheet1.write(0, 3, \'公司地点\')
 46  
 47  sheet1.write(0, 4, \'公司性质\')
 48  
 49  sheet1.write(0, 5, \'薪资\')
 50  
 51  sheet1.write(0, 6, \'学历要求\')
 52  
 53  sheet1.write(0, 7, \'工作经验\')
 54  
 55  sheet1.write(0, 8, \'公司规模\')
 56  
 57  sheet1.write(0, 9, \'公司类型\')
 58  
 59  sheet1.write(0, 10,\'公司福利\')
 60  
 61  sheet1.write(0, 11,\'发布时间\')
 62 
 63  number = 1
 64  item = input()
 65  
 66  for j in range(1,1000):
 67      try:
 68          print("正在爬取第"+str(j)+"页数据...")
 69 #调用获取网页原码 
 70         html = getfront(j,item)      
 71  
 72          for i in getInformation(html):
 73              try:
 74 #职位网址
 75                  url1 = i[1]          
 76                  res1 = urllib.request.urlopen(url1).read().decode(\'gbk\')
 77                  company = re.findall(re.compile(r\'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?\',re.S),res1)
 78  
 79                  job_need = re.findall(re.compile(r\'<p class="msg ltype".*?>.*?&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;(.*?)&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;(.*?)&nbsp;&nbsp;<span>|</span>&nbsp;&nbsp;.*?</p>\',re.S),res1)
 80  
 81                 welfare = re.findall(re.compile(r\'<span class="sp4">(.*?)</span>\',re.S),res1)
 82                  print(i[0],i[2],i[4],i[5],company[0][0],job_need[2]
 83  [0],job_need[1][0],company[0][1],company[0][2],welfare,i[6])
 84                  sheet1.write(number,0,number)
 85  
 86                  sheet1.write(number,1,i[0])
 87 
 88                  sheet1.write(number,2,i[2])
 89  
 90                  sheet1.write(number,3,i[4])
 91  
 92                  sheet1.write(number,4,company[0][0])
 93  
 94                  sheet1.write(number,5,i[5])
 95  
 96                  sheet1.write(number,6,job_need[1][0])
 97  
 98                  sheet1.write(number,7,job_need[2][0])
 99  
100                  sheet1.write(number,8,company[0][1])
101  
102                  sheet1.write(number,9,company[0][2])
103  
104                  sheet1.write(number,10,("  ".join(str(i) for i in welfare)))
105  
106                  sheet1.write(number,11,i[6])
107  
108                  number+=1
109                  excel1.save("51job.xls")
110 #休息间隔,避免爬取海量数据时被误判为攻击,IP遭到封禁
111                  time.sleep(0.3) 
112              except:
113                  pass
114      except:
115         pass
116 
117 #coding:utf-8
118  import pandas as pd
119  import re
120  
121 #除此之外还要安装xlrd包
122  
123  data = pd.read_excel(r\'51job.xls\',sheet_name=\'Job\')
124  result = pd.DataFrame(data)
125 
126  a = result.dropna(axis=0,how=\'any\')
127 #输出全部行,不省略
128  pd.set_option(\'display.max_rows\',None) 
129 
130  b = u\'数据\'
131  number = 1
132  li = a[\'职位\']
133  for i in range(0,len(li)):
134 
135      try:
136          if b in li[i]:
137              #print(number,li[i])
138              number+=1
139          else:
140              a = a.drop(i,axis=0)
141      except:
142          pass
143 
144  b2= u\'\'
145  li2 = a[\'学历要求\']
146  for i in range(0,len(li2)):
147 
148      try:
149          if b2 in li2[i]:
150              #print(number,li2[i])
151              number+=1
152              a = a.drop(i,axis=0)
153 
154      except:
155          pass
156 
157  b3 =u\'万/年\'
158  b4 =u\'千/月\'
159  li3 = a[\'薪资\']
160  
161  #注释部分的print都是为了调试用的
162  
163  for i in range(0,len(li3)):
164      try:
165          if b3 in li3[i]:
166              x = re.findall(r\'\d*\.?\d+\',li3[i])
167              #print(x)
168  
169  #转换成浮点型并保留两位小数
170              min_ = format(float(x[0])/12,\'.2f\')              
171              max_ = format(float(x[1])/12,\'.2f\')
172              li3[i][1] = min_+\'-\'+max_+u\'万/月\'
173 
174          if b4 in li3[i]:
175              x = re.findall(r\'\d*\.?\d+\',li3[i])
176              #print(x)
177  
178              #input()
179              min_ = format(float(x[0])/10,\'.2f\')
180              max_ = format(float(x[1])/10,\'.2f\')
181              li3[i][1] = str(min_+\'-\'+max_+\'万/月\')
182          print(i,li3[i])
183  
184      except:
185          pass
186 
187  a.to_excel(\'51job2.xlsx\', sheet_name=\'Job\', index=False)
188 
189  # -*- coding: utf-8 -*-
190  import pandas as pd
191  import re
192 
193  from pyecharts import Funnel,Pie,Geo
194  import matplotlib.pyplot as plt
195 
196  file = pd.read_excel(r\'51job2.xls\',sheet_name=\'Job\')
197  f = pd.DataFrame(file)
198 
199  pd.set_option(\'display.max_rows\',None)
200 
201  add = f[\'公司地点\']
202 
203  sly = f[\'薪资\']
204 
205  edu = f[\'学历要求\']
206 
207  exp = f[\'工作经验\']
208 
209  address =[]
210 
211  salary = []
212 
213  education = []
214 
215  experience = []
216 
217  for i in range(0,len(f)):
218      try:
219          a = add[i].split(\'-\')
220          address.append(a[0])
221 
222          #print(address[i])
223          s = re.findall(r\'\d*\.?\d+\',sly[i])
224          s1= float(s[0])
225          s2 =float(s[1])
226          salary.append([s1,s2])
227 
228          #print(salary[i])
229          education.append(edu[i])
230 
231          #print(education[i])
232          experience.append(exp[i])
233 
234          #print(experience[i])
235      except:
236         pass
237 
238  #定义存放最低薪资的列表
239  min_s=[]
240  #定义存放最高薪资的列表
241  max_s=[]
242  for i in range(0,len(experience)):
243      min_s.append(salary[i][0])
244      max_s.append(salary[i][0])
245  
246  my_df = pd.DataFrame({\'experience\':experience, \'min_salay\' : min_s, 
247  #关联工作经验与薪资
248  \'max_salay\' : max_s})
249  data1 = my_df.groupby(\'experience\').mean()[\'min_salay\'].plot(kind=\'line\')
250  plt.show()
251  
252  my_df2 = pd.DataFrame({\'education\':education, \'min_salay\' : min_s, 
253 #关联学历与薪资
254  \'max_salay\' : max_s})
255  data2 = my_df2.groupby(\'education\').mean()[\'min_salay\'].plot(kind=\'line\')
256  plt.show()
257 
258  def get_edu(list):
259      education2 = {}
260      for i in set(list):
261          education2[i] = list.count(i)
262      return education2
263  dir1 = get_edu(education)
264  
265 # print(dir1)
266  
267  attr= dir1.keys()
268  value = dir1.values()
269  pie = Pie("学历要求")
270  pie.add("", attr, value, center=[50, 50], is_random=False, radius=[30, 75], rosetype=\'radius\',
271          is_legend_show=False, is_label_show=True,legend_orient=\'vertical\')
272  pie.render(\'学历要求玫瑰图.html\')
273 
274  def get_address(list):
275      address2 = {}
276      for i in set(list):
277          address2[i] = list.count(i)
278  
279      address2.pop(\'异地招聘\')
280  
281      #address2.pop(\'山东\')
282      #address2.pop(\'怒江\')
283      #address2.pop(\'池州\')
284  
285      return address2
286  
287  dir2 = get_address(address)
288  
289  #print(dir2)
290  
291  geo = Geo("大数据人才需求分布图", title_color="#2E2E2E",
292            title_text_size=24,title_top=20,title_pos="center", width=1300,height=600)
293  
294  attr2 = dir2.keys()
295  value2 = dir2.values()
296  
297  geo.add("",attr2, value2, type="effectScatter", is_random=True, visual_range=[0, 1000], maptype=\'china\',symbol_size=8, effect_scale=5, is_visualmap=True)
298  
299  geo.render(\'大数据城市需求分布图.html\')
300 
301  def get_experience(list):
302      experience2 = {}
303      for i in set(list):
304  
305           experience2[i] = list.count(i)
306  
307      return experience2
308  
309  dir3 = get_experience(experience)
310  
311 #print(dir3)
312  
313  attr3= dir3.keys()
314  value3 = dir3.values()
315  funnel = Funnel("工作经验漏斗图",title_pos=\'center\')
316  
317  funnel.add("", attr3, value3,is_label_show=True,label_pos="inside", label_text_color="#fff",legend_orient=\'vertical\',legend_pos=\'left\')
318  
319  funnel.render(\'工作经验要求漏斗图.html\')

 

五、总结

本次主题的爬虫因基础薄弱进行的时间较久,但结果还是好的。通过Excel文件和可视化分析可以清晰直观地了解到应聘职位的各种要求,

基本达到了想要的结果。但是pyecharts里面的图还有很多种,还是要继续慢慢发掘,加强自己的专业知识。

版权声明:本文为c1236原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/c1236/p/14907687.html