Python爬取前程无忧职位信息
一、选题背景
刚毕业往往会为自己不知道每个职位之间各种待遇的差异而迷茫,所以为了了解毕业后职位的待遇等方面做多种参考,货比三家。
1.数据来源
前程无忧(https://www.51job.com/)
2.爬取内容
爬取内容包括职位名称,公司名称,地点,薪资,学历要求,以及发布日期等。
二、实现爬取的步骤
1.代码所需包
1 import urllib.request 2 import xlwt 3 import re 4 import urllib.parse 5 import time
2.进入前程无忧官网,搜索职位信息
3.打开开发者模式
4.模拟浏览器
# HTTP headers that imitate a real Chrome browser so 51job does not
# reject the crawler as bot traffic.
header = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
5.为了实现爬取,我写了一个能够实现输入想了解的职位就能爬取相关内容的函数
# page is the 1-based result-page number, item is the user's search keyword
def getfront(page, item):
    """Fetch one page of 51job search results for keyword *item*.

    Returns the page HTML decoded from GBK (51job serves GBK, not UTF-8).
    Raises whatever urllib raises on network failure or a non-GBK payload.
    """
    # Percent-encode the keyword for the URL (required for Chinese text;
    # this is URL encoding, not "hex encoding" as the old comment said).
    keyword = urllib.parse.quote(item)
    url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
           + keyword + ',2,' + str(page) + '.html')
    # Send the module-level browser-like headers: the original called
    # urlopen(url) directly, so the carefully-built `header` dict was
    # never used and requests were more likely to be blocked.
    req = urllib.request.Request(url, headers=header)
    html = urllib.request.urlopen(req).read().decode('gbk')
    return html
def getInformation(html):
    """Extract job rows from one search-result page.

    Returns a list of 7-tuples:
    (job title, job URL, company name, company URL, location, salary, date).
    """
    # re.S lets .*? span newlines so a single pattern covers a whole row.
    row_pattern = re.compile(
        r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)"'
        r'.*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)"'
        r'.*?<span class="t3">(.*?)</span>'
        r'.*?<span class="t4">(.*?)</span>'
        r'.*?<span class="t5">(.*?)</span>.*?',
        re.S)
    return row_pattern.findall(html)
除了爬取基本信息外,还把职位超链接后的网址,以及公司超链接的网址爬取下来了。
6.把爬取的信息以Excel文件形式储存起来,比较清晰直观。
# Workbook that will hold one row per scraped posting.
excel1 = xlwt.Workbook()
# cell_overwrite_ok=True allows rewriting a cell without raising.
sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)

# Header row (row 0): one title per column, written in column order.
column_titles = ['序号', '职位', '公司名称', '公司地点', '公司性质', '薪资',
                 '学历要求', '工作经验', '公司规模', '公司类型', '公司福利', '发布时间']
for col, title in enumerate(column_titles):
    sheet1.write(0, col, title)
爬取代码如下
number = 1      # next worksheet row to write (row 0 holds the header)
item = input()  # search keyword typed by the user

for j in range(1, 1000):
    try:
        print("正在爬取第" + str(j) + "页数据...")
        # Fetch one search-result page.
        html = getfront(j, item)
    except Exception as exc:
        # A failed page (network error, bad GBK decode) should not stop
        # the crawl — but unlike the original bare `except: pass`, report it.
        print("page", j, "failed:", exc)
        continue

    for i in getInformation(html):
        try:
            # i = (title, job URL, company, company URL, place, salary, date);
            # fetch the job detail page for the extra fields.
            url1 = i[1]
            res1 = urllib.request.urlopen(url1).read().decode('gbk')
            # Company nature / size / type from the detail page.
            company = re.findall(re.compile(r'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?', re.S), res1)
            # Experience / education requirements.
            job_need = re.findall(re.compile(r'<p class="msg ltype".*?>.*? <span>|</span> (.*?) <span>|</span> (.*?) <span>|</span> .*?</p>', re.S), res1)
            # Benefit tags ("五险一金" etc.).
            welfare = re.findall(re.compile(r'<span class="sp4">(.*?)</span>', re.S), res1)

            print(i[0], i[2], i[4], i[5], company[0][0], job_need[2][0],
                  job_need[1][0], company[0][1], company[0][2], welfare, i[6])

            # One worksheet row, in the header's column order.
            row = [number, i[0], i[2], i[4], company[0][0], i[5],
                   job_need[1][0], job_need[2][0], company[0][1],
                   company[0][2], " ".join(str(w) for w in welfare), i[6]]
            for col, value in enumerate(row):
                sheet1.write(number, col, value)
            number += 1
            # Save after every row so a crash loses at most one posting.
            excel1.save("51job.xls")
            # Throttle requests so heavy crawling is not mistaken for an
            # attack and the IP does not get banned.
            time.sleep(0.3)
        except Exception as exc:
            # Detail pages with a different layout are skipped, not fatal;
            # the original bare `except: pass` hid every error.
            print("skipped one posting:", exc)
结果如下:
三、数据清洗与处理
1.先打开文件
# coding: utf-8
import pandas as pd
import re

# Reading the .xls file additionally requires the xlrd package.
data = pd.read_excel(r'51job.xls', sheet_name='Job')
result = pd.DataFrame(data)
清洗思路:
1、出现有空值的信息,直接删除整行
# Drop every row that has at least one missing value.
a = result.dropna(axis=0, how='any')
# Print all rows without truncation — convenient when eyeballing the data.
pd.set_option('display.max_rows', None)
2.职位出错(爬取职位与预想职位无关)
# Keep only postings whose title contains the keyword "数据".
# The original loop read `li[i]` by *position* but dropped by *label*;
# after the first drop the index shifted, lookups raised KeyError, and the
# bare except silently skipped rows.  A vectorized mask avoids all of that.
b = u'数据'
mask = a['职位'].astype(str).str.contains(b, na=False)
# `number` keeps its original meaning: 1 + count of matching rows.
number = 1 + int(mask.sum())
a = a[mask]
3.其他地方出现的信息错位,比如在学历里出现 ‘招多少人’
# Drop rows whose "学历要求" column actually holds a headcount ("招N人") —
# a sign that the scraped fields were misaligned for that posting.
# Vectorized mask instead of the original fragile positional loop
# (same positional-vs-label defect as the title filter).
b2 = u'人'
bad = a['学历要求'].astype(str).str.contains(b2, na=False)
number += int(bad.sum())
a = a[~bad]
4.统一薪资单位:将“万/年”“千/月”等不一致的写法统一换算为“万/月”
# Normalize every salary string to "万/月".
# BUG FIX: the original wrote `li3[i][1] = ...`, which item-assigns into an
# immutable string, raises TypeError, and was silently swallowed by the
# bare except — so no converted value was ever stored back.
b3 = u'万/年'
b4 = u'千/月'


def _to_wan_per_month(text):
    """Return *text* converted to 万/月; unparsable values pass through."""
    try:
        if b3 in text:  # 万/年 -> 万/月 : divide both bounds by 12
            lo, hi = re.findall(r'\d*\.?\d+', text)[:2]
            return (format(float(lo) / 12, '.2f') + '-'
                    + format(float(hi) / 12, '.2f') + u'万/月')
        if b4 in text:  # 千/月 -> 万/月 : divide both bounds by 10
            lo, hi = re.findall(r'\d*\.?\d+', text)[:2]
            return (format(float(lo) / 10, '.2f') + '-'
                    + format(float(hi) / 10, '.2f') + u'万/月')
    except (ValueError, TypeError):
        pass  # fewer than two numbers or odd type: keep the original text
    return text


a['薪资'] = a['薪资'].astype(str).map(_to_wan_per_month)
清洗完成后保存到新的Excel文件里。
# Persist the cleaned table; note the .xlsx extension — later steps must
# read this exact filename.
a.to_excel('51job2.xlsx', sheet_name='Job', index=False)
四、数据可视化
经过可视化处理能使数据更加直观,更有利于分析,甚至可以说可视化是数据挖掘最重要的内容之一。
1.导入所需要的包
1 # -*- coding: utf-8 -*- 2 import pandas as pd 3 import re 4 from pyecharts import Funnel,Pie,Geo 5 import matplotlib.pyplot as plt
2.打开文件
# BUG FIX: the cleaning step saved '51job2.xlsx', but this step tried to
# read '51job2.xls' — a file that was never written.
file = pd.read_excel(r'51job2.xlsx', sheet_name='Job')
f = pd.DataFrame(file)
# Print all rows without truncation.
pd.set_option('display.max_rows', None)
3.创建多个列表来单独存放薪资,工作经验,学历要求,公司地点等信息
# Split the cleaned frame into four parallel lists for plotting.
add = f['公司地点']
sly = f['薪资']
edu = f['学历要求']
exp = f['工作经验']
address = []
salary = []
education = []
experience = []
for i in range(0, len(f)):
    try:
        # "上海-浦东" -> keep only the city part before the dash.
        city = add[i].split('-')[0]
        # Salary strings look like "1.50-2.00万/月"; grab the two numbers.
        lo, hi = re.findall(r'\d*\.?\d+', str(sly[i]))[:2]
        pair = [float(lo), float(hi)]
    except Exception:
        continue  # unparsable row: skip it entirely
    # Append only after *every* field parsed.  The original appended
    # `address` before parsing the salary, so a parse failure left the
    # four lists with different lengths (desynchronized rows).
    address.append(city)
    salary.append(pair)
    education.append(edu[i])
    experience.append(exp[i])
4.工作经验—薪资图 与 学历—薪资图
# Per-posting minimum salary (万/月)
min_s = []
# Per-posting maximum salary (万/月)
max_s = []
for i in range(0, len(experience)):
    min_s.append(salary[i][0])
    # BUG FIX: the original appended salary[i][0] here as well, so the
    # "max_salay" column silently duplicated the minimum salary.
    max_s.append(salary[i][1])

# Average minimum salary grouped by required work experience.
my_df = pd.DataFrame({'experience': experience, 'min_salay': min_s,
                      'max_salay': max_s})
my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()

# Average minimum salary grouped by education requirement.
my_df2 = pd.DataFrame({'education': education, 'min_salay': min_s,
                       'max_salay': max_s})
my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()
5.学历要求圆环图
def get_edu(values):
    """Count how many postings require each education level.

    (Parameter renamed from `list`, which shadowed the builtin.)
    """
    education2 = {}
    for level in set(values):
        education2[level] = values.count(level)
    return education2


dir1 = get_edu(education)

attr = dir1.keys()
value = dir1.values()
# Donut/rose chart of education requirements.
pie = Pie("学历要求")
pie.add("", attr, value, center=[50, 50], is_random=False, radius=[30, 75], rosetype='radius',
        is_legend_show=False, is_label_show=True, legend_orient='vertical')
pie.render('学历要求玫瑰图.html')
6.大数据城市需求地理位置分布图
def get_address(values):
    """Count postings per city, dropping the '异地招聘' placeholder.

    (Parameter renamed from `list`, which shadowed the builtin.)
    """
    address2 = {}
    for city in set(values):
        address2[city] = values.count(city)
    # pop with a default: absence of the placeholder must not raise
    # KeyError (the original pop('异地招聘') did).
    address2.pop(u'异地招聘', None)
    # Locations pyecharts' China map cannot resolve can be removed the
    # same way, e.g.:
    # address2.pop(u'山东', None)
    # address2.pop(u'怒江', None)
    # address2.pop(u'池州', None)
    return address2


dir2 = get_address(address)

# Scatter map of demand per city.
geo = Geo("大数据人才需求分布图", title_color="#2E2E2E",
          title_text_size=24, title_top=20, title_pos="center", width=1300, height=600)

attr2 = dir2.keys()
value2 = dir2.values()

geo.add("", attr2, value2, type="effectScatter", is_random=True, visual_range=[0, 1000],
        maptype='china', symbol_size=8, effect_scale=5, is_visualmap=True)

geo.render('大数据城市需求分布图.html')
7.工作经验要求漏斗图
def get_experience(values):
    """Count postings per required-experience bucket.

    (Parameter renamed from `list`, which shadowed the builtin.)
    """
    experience2 = {}
    for bucket in set(values):
        experience2[bucket] = values.count(bucket)
    return experience2


dir3 = get_experience(experience)

attr3 = dir3.keys()
value3 = dir3.values()
# Funnel chart of experience requirements.
funnel = Funnel("工作经验漏斗图", title_pos='center')

funnel.add("", attr3, value3, is_label_show=True, label_pos="inside", label_text_color="#fff",
           legend_orient='vertical', legend_pos='left')

funnel.render('工作经验要求漏斗图.html')
完整代码:
# -*- coding: utf-8 -*-
"""Complete pipeline: scrape 51job postings, clean them, and plot charts.

Part 1 crawls search and detail pages into 51job.xls; part 2 cleans the
data with pandas and writes 51job2.xlsx; part 3 renders charts with
matplotlib and pyecharts.
"""
import re
import time
import urllib.parse
import urllib.request

import matplotlib.pyplot as plt
import pandas as pd
import xlwt
from pyecharts import Funnel, Pie, Geo

# ------------------------------------------------------------- part 1: crawl

# Browser-like headers so requests are not rejected as bot traffic.
header = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}


def getfront(page, item):
    """Fetch one GBK-decoded search-result page for keyword *item*."""
    # Percent-encode the keyword (URL encoding, not "hex encoding").
    keyword = urllib.parse.quote(item)
    url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
           + keyword + ',2,' + str(page) + '.html')
    # Actually send `header` — the original urlopen(url) ignored it.
    req = urllib.request.Request(url, headers=header)
    return urllib.request.urlopen(req).read().decode('gbk')


def getInformation(html):
    """Extract (title, job URL, company, company URL, place, salary, date)."""
    reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?', re.S)
    return re.findall(reg, html)


excel1 = xlwt.Workbook()
sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)
column_titles = ['序号', '职位', '公司名称', '公司地点', '公司性质', '薪资',
                 '学历要求', '工作经验', '公司规模', '公司类型', '公司福利', '发布时间']
for col, title in enumerate(column_titles):
    sheet1.write(0, col, title)

number = 1      # next worksheet row (row 0 is the header)
item = input()  # search keyword

for j in range(1, 1000):
    try:
        print("正在爬取第" + str(j) + "页数据...")
        html = getfront(j, item)
    except Exception as exc:
        # Report instead of the original silent bare except.
        print("page", j, "failed:", exc)
        continue
    for i in getInformation(html):
        try:
            res1 = urllib.request.urlopen(i[1]).read().decode('gbk')
            # Company nature / size / type from the detail page.
            company = re.findall(re.compile(r'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?', re.S), res1)
            # Experience / education requirements.
            job_need = re.findall(re.compile(r'<p class="msg ltype".*?>.*? <span>|</span> (.*?) <span>|</span> (.*?) <span>|</span> .*?</p>', re.S), res1)
            # Benefit tags.
            welfare = re.findall(re.compile(r'<span class="sp4">(.*?)</span>', re.S), res1)

            print(i[0], i[2], i[4], i[5], company[0][0], job_need[2][0],
                  job_need[1][0], company[0][1], company[0][2], welfare, i[6])

            row = [number, i[0], i[2], i[4], company[0][0], i[5],
                   job_need[1][0], job_need[2][0], company[0][1],
                   company[0][2], " ".join(str(w) for w in welfare), i[6]]
            for col, value in enumerate(row):
                sheet1.write(number, col, value)
            number += 1
            excel1.save("51job.xls")  # checkpoint after every row
            time.sleep(0.3)           # throttle to avoid an IP ban
        except Exception as exc:
            # Detail pages with a different layout are skipped, not fatal.
            print("skipped one posting:", exc)

# ------------------------------------------------------------- part 2: clean

data = pd.read_excel(r'51job.xls', sheet_name='Job')
result = pd.DataFrame(data)
pd.set_option('display.max_rows', None)

# 1. Drop rows with any missing value.
a = result.dropna(axis=0, how='any')

# 2. Keep only postings whose title contains the keyword (vectorized mask;
#    the original positional loop silently skipped rows after drops).
a = a[a['职位'].astype(str).str.contains(u'数据', na=False)]

# 3. Drop rows whose education column holds a misaligned headcount ("招N人").
a = a[~a['学历要求'].astype(str).str.contains(u'人', na=False)]


def _to_wan_per_month(text):
    """Normalize a salary string to 万/月; unparsable values pass through.

    (The original `li3[i][1] = ...` item-assigned into an immutable string
    and silently failed, so no conversion was ever stored.)
    """
    try:
        if u'万/年' in text:  # divide by 12
            lo, hi = re.findall(r'\d*\.?\d+', text)[:2]
            return (format(float(lo) / 12, '.2f') + '-'
                    + format(float(hi) / 12, '.2f') + u'万/月')
        if u'千/月' in text:  # divide by 10
            lo, hi = re.findall(r'\d*\.?\d+', text)[:2]
            return (format(float(lo) / 10, '.2f') + '-'
                    + format(float(hi) / 10, '.2f') + u'万/月')
    except (ValueError, TypeError):
        pass
    return text


a['薪资'] = a['薪资'].astype(str).map(_to_wan_per_month)
a.to_excel('51job2.xlsx', sheet_name='Job', index=False)

# --------------------------------------------------------- part 3: visualize

# Read the file part 2 actually wrote (.xlsx, not .xls as the original did).
f = pd.DataFrame(pd.read_excel(r'51job2.xlsx', sheet_name='Job'))
pd.set_option('display.max_rows', None)

address, salary, education, experience = [], [], [], []
for idx in range(len(f)):
    try:
        city = f['公司地点'][idx].split('-')[0]
        lo, hi = re.findall(r'\d*\.?\d+', str(f['薪资'][idx]))[:2]
        pair = [float(lo), float(hi)]
    except Exception:
        continue  # skip unparsable rows without desynchronizing the lists
    address.append(city)
    salary.append(pair)
    education.append(f['学历要求'][idx])
    experience.append(f['工作经验'][idx])

min_s = [s[0] for s in salary]
# BUG FIX: the original stored salary[i][0] here too, duplicating the minimum.
max_s = [s[1] for s in salary]

# Experience vs. average minimum salary.
pd.DataFrame({'experience': experience, 'min_salay': min_s,
              'max_salay': max_s}).groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()

# Education vs. average minimum salary.
pd.DataFrame({'education': education, 'min_salay': min_s,
              'max_salay': max_s}).groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()


def _counts(values):
    """Occurrence count of each distinct value (avoids shadowing `list`)."""
    return {v: values.count(v) for v in set(values)}


# Education donut/rose chart.
dir1 = _counts(education)
pie = Pie("学历要求")
pie.add("", dir1.keys(), dir1.values(), center=[50, 50], is_random=False,
        radius=[30, 75], rosetype='radius', is_legend_show=False,
        is_label_show=True, legend_orient='vertical')
pie.render('学历要求玫瑰图.html')

# Demand-per-city scatter map.
dir2 = _counts(address)
dir2.pop(u'异地招聘', None)  # default avoids KeyError when absent
geo = Geo("大数据人才需求分布图", title_color="#2E2E2E", title_text_size=24,
          title_top=20, title_pos="center", width=1300, height=600)
geo.add("", dir2.keys(), dir2.values(), type="effectScatter", is_random=True,
        visual_range=[0, 1000], maptype='china', symbol_size=8,
        effect_scale=5, is_visualmap=True)
geo.render('大数据城市需求分布图.html')

# Experience funnel chart.
dir3 = _counts(experience)
funnel = Funnel("工作经验漏斗图", title_pos='center')
funnel.add("", dir3.keys(), dir3.values(), is_label_show=True,
           label_pos="inside", label_text_color="#fff",
           legend_orient='vertical', legend_pos='left')
funnel.render('工作经验要求漏斗图.html')
五、总结
本次主题的爬虫因基础薄弱进行的时间较久,但结果还是好的。通过Excel文件和可视化分析可以清晰直观地了解到应聘职位的各种要求,
基本达到了想要的结果。但是pyecharts里面的图还有很多种,还是要继续慢慢发掘,加强自己的专业知识。