爬取多页数据
# _*_ coding:utf-8 _*_
# NOTE(review): the coding declaration must be on line 1 or 2 to take effect
# (PEP 263); the original placed it after the docstring, where it is ignored.
'''
Scrape multi-page Python job listings from 51job.com and save them to Excel.

@author:zl
@contact:
@site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
'''
import requests
from bs4 import BeautifulSoup
import re
import time
from pymongo import MongoClient
import xlwt

# Browser-like request headers so the site serves the normal HTML page.
headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-encoding': "gzip, deflate, br",
    'accept-language': "zh-CN,zh;q=0.9",
    'cache-control': "max-age=0",
    'upgrade-insecure-requests': "1",
    'Connection': 'keep-alive',
    'Host': "search.51job.com",
}

# Compiled once at module level instead of on every get() call.
# Captures: (job title, company, location, salary, posting date).
_JOB_RE = re.compile(
    r'<p class="t1 ">.*?<a target="_blank" title="(.*?)" .*?<span class="t2"><a target="_blank" title="(.*?)" .*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>',
    re.S)


def get_content(page):
    """Fetch the raw HTML of one search-result page (51job pages are GBK-encoded).

    :param page: 1-based page number to request.
    :return: decoded HTML string.
    :raises requests.RequestException: on network failure or timeout.
    """
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,' + str(page) + '.html'
    # timeout added so a stalled server cannot hang the crawl indefinitely.
    req = requests.get(url, headers=headers, timeout=10)
    html = req.content.decode('gbk')
    return html


def get(html):
    """Extract job fields from a result page.

    :param html: HTML source of one search-result page.
    :return: list of 5-tuples (title, company, location, salary, date).
    """
    items = _JOB_RE.findall(html)
    return items


def excel_write(items, index):
    """Write scraped rows into the global worksheet ``ws``.

    :param items: iterable of 5-tuples as returned by :func:`get`.
    :param index: 1-based worksheet row to start writing at.
    """
    for item in items:
        # One column per field: title, company, location, salary, date.
        for i in range(0, 5):
            print(item[i])
            ws.write(index, i, item[i])  # row, column, value
        index += 1


if __name__ == '__main__':
    newTable = "test.xls"  # output workbook filename
    wb = xlwt.Workbook(encoding='utf-8')  # create workbook with explicit encoding
    ws = wb.add_sheet('sheet1')  # create the worksheet
    headData = ['招聘职位', '公司', '地址', '薪资', '日期']  # header row (runtime strings, kept as-is)
    for colnum, title in enumerate(headData):
        ws.write(0, colnum, title, xlwt.easyxf('font: bold on'))
    # Crawl pages 1-9; each page yields up to 50 rows, so page N starts
    # at worksheet row (N-1)*50 + 1 (row 0 is the header).
    for each in range(1, 10):
        index = (each - 1) * 50 + 1
        excel_write(get(get_content(each)), index)
    wb.save(newTable)
版权声明:本文为zhanglin123原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。