爬取多页数据
# _*_ coding:utf-8 _*_
# NOTE(review): the coding declaration must be on line 1 or 2 to take effect
# (PEP 263); the original placed it after the docstring, where it is ignored.
'''
Scrape multi-page Python job listings from 51job.com and save them to Excel.

@author:zl
@contact:
@site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
'''
import requests
from bs4 import BeautifulSoup
import re
import time
from pymongo import MongoClient
import xlwt

# Browser-like request headers so the site serves the normal HTML page.
headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-encoding': "gzip, deflate, br",
    'accept-language': "zh-CN,zh;q=0.9",
    'cache-control': "max-age=0",
    'upgrade-insecure-requests': "1",
    'Connection': 'keep-alive',
    'Host': "search.51job.com",
}

# Compiled once at module level instead of on every get() call.
# Captures: (job title, company, location, salary, posting date).
_JOB_RE = re.compile(
    r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" .*?<span class="t2"><a target="_blank" title="(.*?)" .*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>',
    re.S)


def get_content(page):
    """Fetch the raw HTML of one search-result page (51job pages are GBK-encoded).

    :param page: 1-based page number to request.
    :return: decoded HTML string.
    :raises requests.RequestException: on network failure or timeout.
    """
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,' + str(page) + '.html'
    # timeout added so a stalled server cannot hang the crawl indefinitely.
    req = requests.get(url, headers=headers, timeout=10)
    html = req.content.decode('gbk')
    return html


def get(html):
    """Extract job fields from a result page.

    :param html: HTML source of one search-result page.
    :return: list of 5-tuples (title, company, location, salary, date).
    """
    items = _JOB_RE.findall(html)
    return items


def excel_write(items, index):
    """Write scraped rows into the global worksheet ``ws``.

    :param items: iterable of 5-tuples as returned by :func:`get`.
    :param index: 1-based worksheet row to start writing at.
    """
    for item in items:
        # One column per field: title, company, location, salary, date.
        for i in range(0, 5):
            print(item[i])
            ws.write(index, i, item[i])  # row, column, value
        index += 1


if __name__ == '__main__':
    newTable = "test.xls"  # output workbook filename
    wb = xlwt.Workbook(encoding='utf-8')  # create workbook with explicit encoding
    ws = wb.add_sheet('sheet1')  # create the worksheet
    headData = ['招聘职位', '公司', '地址', '薪资', '日期']  # header row (runtime strings, kept as-is)
    for colnum, title in enumerate(headData):
        ws.write(0, colnum, title, xlwt.easyxf('font: bold on'))
    # Crawl pages 1-9; each page yields up to 50 rows, so page N starts
    # at worksheet row (N-1)*50 + 1 (row 0 is the header).
    for each in range(1, 10):
        index = (each - 1) * 50 + 1
        excel_write(get(get_content(each)), index)
    wb.save(newTable)
版权声明:本文为zhanglin123原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。