Use selenium and urllib to scrape the job requirements of Python positions on 51job and write them to a txt file:

import selenium  # Selenium test framework
import selenium.webdriver  # drives a real browser
import re
import urllib
import urllib.request


def geturllistsh(searchname):
    url="https://search.51job.com/list/020000,000000,0000,00,9,99,"+searchname+",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    driver=selenium.webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver")  # launch Chrome via chromedriver (path is machine-specific)
    driver.get(url)  # open the search page
    pagesource=driver.page_source   # grab the rendered page source
    restr="""title\">(.*?)</span"""    # regex for the job titles on this page (result is overwritten below)
    regex=re.compile(restr,re.IGNORECASE)
    mylist=regex.findall(pagesource)
    driver.close()  # close the browser
    # Leftover from an earlier version: compute the page count from the total
    # number of hits (50 postings per list page) instead of hard-coding it.
    # num = eval(getnumberbyname("python"))  # e.g. 1731
    # if num % 50 == 0:
    #     pages = num // 50
    # else:
    #     pages = num // 50 + 1
    mylist = []  # reuse mylist to collect the result-page URLs
    for i in range(1,130):
        newurl="https://search.51job.com/list/020000,000000,0000,00,9,99,"+searchname+",2,{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=".format(i)
        mylist.append(newurl)
    for line in mylist:
        print(line)
    return mylist
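
The commented-out lines inside geturllistsh hint at an earlier idea: derive the page count from the total number of hits (50 postings per list page) instead of hard-coding range(1, 130). A minimal sketch of that calculation, assuming the total has already been obtained somehow (1731 is the figure quoted in the dead code):

import math

def pages_for(total_hits, per_page=50):
    # Ceiling division: 1731 hits at 50 per page -> 35 pages.
    return math.ceil(total_hits / per_page)

print(pages_for(1731))  # 35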

def downloadgeturllist(url):
    headers={"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request=urllib.request.Request(url,headers=headers)  # build the request
    # a specific header can also be added or changed via Request.add_header()
    request.add_header("Connection", "keep-alive")  # keep the connection alive
    try:
        response=urllib.request.urlopen(request)
        data=response.read().decode("gbk")  # read the response body (51job pages are GBK-encoded)
        print(response.code)  # response status code

        restr = "<div class=\"dw_table\" id=\"resultList\">([\s\S]*?)<!--列表表格 END-->"  # regex; () captures only the result table
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        #print(mylist[0])  # the whole result table

        restr = "el title\">([\s\S]*?)<!--列表表格 END-->"  # 正则表达式,()只要括号内的数据
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        restr = "<span class=\"t5\">发布时间</span>([\s\S]*?)<!--列表表格 END-->"  # 正则表达式,()只要括号内的数据
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        #print(mylist[0])#抓取整个表格
        #returnurllist=[]  #存储url,最终返回
        for line  in mylist:
            restr = \'<a target="_blank" title=".*?" href="(.*?)"  onmousedown=".*?">[.\s\S]*?</a>\'
            regex = re.compile(restr, re.IGNORECASE)
            geturllist = regex.findall(line)
        for getlist in geturllist:
            print(getlist)
        return geturllist
    except:
        return []  # on any failure, return an empty list so the caller simply skips this page

def getworkinfo(url):
    headers={"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request=urllib.request.Request(url,headers=headers)  # build the request
    # a specific header can also be added or changed via Request.add_header()
    request.add_header("Connection", "keep-alive")  # keep the connection alive
    try:
        response=urllib.request.urlopen(request)
        data=response.read().decode("gbk","ignore")  # read the response body, ignoring undecodable bytes
        restr = "<div class=\"bmsg job_msg inbox\">([\s\S]*?).*?\s<div class=\"mt10\">"  # regex; () captures the job-description block
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        if len(mylist) > 0:
            datas = mylist[0].strip().replace("</p>", "").replace("<p>", "")
            return datas
        else:
            return ""
    except:
        return ""

savefilepath="workinfo.txt"
savefile=open(savefilepath,"wb")
urllist=geturllistsh("python")  # build the list of result-page URLs
for url in urllist:
    templist=downloadgeturllist(url)
    for tempurl in templist:
        workstr=getworkinfo(tempurl)
        print(workstr)
        savefile.write((workstr+"\r\n").encode("utf-8"))

savefile.close()
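
If 51job starts throttling or blocking requests, a common tweak is to pause between detail-page fetches. A minimal variant of the inner loop above (the one-second delay is an arbitrary choice, not something the original script does):

import time

for tempurl in templist:
    workstr = getworkinfo(tempurl)
    if workstr:  # skip postings whose description could not be extracted
        savefile.write((workstr + "\r\n").encode("utf-8"))
    time.sleep(1)  # be gentler on the server between requests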

 

Copyright notice: this is an original article by my-global, licensed under CC 4.0 BY-SA; please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/my-global/p/12447356.html