"""
https://www.51job.com/
"""

import requests
import requests.adapters
import string
import urllib.parse
from lxml import etree
import redis
from pyquery import PyQuery as pq
import multiprocessing
import pymongo
import datetime
import xlsxwriter

retries = 5  # maximum fetch attempts per URL before a queue entry is dropped
redis_key_page = 'redis_key_page'  # Redis list (db 0): search-result page URLs
redis_key_detail = 'redis_key_detail'  # Redis list (db 1): job-detail page URLs


def get_url_txt(url, headers, encoding, data=None):
    """Fetch *url* with a GET request and return the body decoded as *encoding*.

    Returns '' when the request fails or the status code is not 200.
    *data*, when given, is sent as the request body (kept for backward
    compatibility with existing callers; all visible callers pass None).
    """
    ret = ''
    try:
        # Use the module-wide retry budget instead of a hard-coded 5.
        requests.adapters.DEFAULT_RETRIES = retries
        # `with` closes the session even when the request raises
        # (the original leaked it on any exception).
        with requests.session() as session:
            session.keep_alive = False
            if data is None:
                # timeout prevents a dead server from hanging the worker forever
                response = session.get(url, headers=headers, timeout=30)
            else:
                response = session.get(url, headers=headers, data=data, timeout=30)
            if response.status_code == 200:
                response.encoding = encoding
                ret = response.text
            response.close()
    except Exception as e:
        print(e)
    return ret


def encode_url(keys):
    """Percent-encode *keys* (non-printable chars only), then double-encode
    each '%'-delimited group by prefixing it with '%25'.

    E.g. '逆向' -> '%25E9%2580%2586%25E5%2590%2591'.  Returns '' on error.
    """
    try:
        quoted = urllib.parse.quote(keys, safe=string.printable)
        return ''.join('%25' + part for part in quoted.split('%') if part)
    except Exception as e:
        print(e)
    return ''


def get_page_count(keys):
    """Fetch page 1 of the 51job search results for *keys* and return the
    total number of result pages parsed from the pager widget.

    Returns 0 on any failure (network error, layout change, etc.).
    """
    try:
        search_url = (
            'https://search.51job.com/list/030000,000000,0000,00,9,99,'
            + encode_url(keys)
            + ',2,'
            + '1'
            + '.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
        )
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        html = get_url_txt(url=search_url, headers=headers, encoding='gbk')
        tree = etree.HTML(html)
        # Pager text looks like "... / N"; take the part after the slash.
        pager_text = ''.join(tree.xpath('//*[@id="resultList"]/div[2]/div[5]//text()'))
        return int(pager_text.split('/')[1].replace(' ', ''))
    except Exception as e:
        print(e)
    return 0


def flush_page_href():
    """Delete the Redis list (db 0) that queues search-result page URLs."""
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        client = redis.StrictRedis(connection_pool=pool)
        client.delete(redis_key_page)
    except Exception as e:
        print(e)


def get_page_href(keys):
    """Queue every search-result page URL for *keys* into Redis (db 0).

    Each queued value has the form '<retry_count>|<url>' with retry_count
    starting at 0.  The queue is flushed first.
    """
    try:
        flush_page_href()
        page_count = get_page_count(keys)
        keys_str = encode_url(keys)
        url1 = 'https://search.51job.com/list/030000,000000,0000,00,9,99,'
        url3 = ',2,'
        url5 = '.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
        # FIX: create the Redis client once instead of building a fresh
        # ConnectionPool + client for every page pushed.
        client = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        for page in range(1, page_count + 1):
            url = url1 + keys_str + url3 + str(page) + url5
            client.rpush(redis_key_page, '0|' + url)
    except Exception as e:
        print(e)


def flush_detail_href():
    """Delete the Redis list (db 1) that queues job-detail page URLs."""
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)
        client = redis.StrictRedis(connection_pool=pool)
        client.delete(redis_key_detail)
    except Exception as e:
        print(e)


def get_detail_href_(redis_value):
    """Worker: fetch one search-result page and queue each job-detail URL.

    *redis_value* has the form '<retry_count>|<page_url>'.  Detail links
    found in the page are pushed to Redis db 1 as '0|<detail_url>'.  If the
    page fetch returns an empty body, the page URL is re-queued in db 0
    with retry_count + 1 until the module-wide `retries` budget runs out.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        # maxsplit=1 keeps any '|' that appears inside the URL itself intact.
        count_str, page_url = redis_value.split('|', 1)
        conn_count = int(count_str)
        text = get_url_txt(url=page_url, headers=headers, encoding=encoding)
        if len(text):
            # One client for the whole page instead of one per pushed link.
            detail_client = redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1))
            for cell in pq(text)('.t1'):
                href = pq(cell).find('a').attr('href')
                # BUG FIX: the original only skipped when title AND href were
                # both None, then crashed on '0|' + None whenever href alone
                # was missing.  href is the only value used — gate on it.
                if href is None:
                    continue
                detail_client.rpush(redis_key_detail, '0|' + href)
        elif conn_count < retries:
            redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)).rpush(
                redis_key_page, str(conn_count + 1) + '|' + page_url)
    except Exception as e:
        print(e)


def get_detail_href():
    """Drain the page-URL queue (db 0), dispatching each page to a process
    pool that collects job-detail URLs into db 1.

    The outer loop repeats because failed pages may be re-queued by the
    workers; it exits once the queue stays empty after a full drain.
    """
    try:
        flush_detail_href()
        # FIX: create one Redis client for the whole drain instead of a new
        # ConnectionPool + client for every llen/lpop call.
        page_client = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        while page_client.llen(redis_key_page) > 0:
            workers = multiprocessing.Pool()
            while True:
                redis_value = page_client.lpop(redis_key_page)
                if redis_value is None:
                    break
                workers.apply_async(
                    get_detail_href_,
                    (redis_value.decode(encoding='utf8', errors='ignore'),))
            workers.close()
            workers.join()
    except Exception as e:
        print(e)


def get_detail_info_(redis_value):
    """Worker: scrape one job-detail page and insert it into MongoDB.

    *redis_value* has the form '<retry_count>|<detail_url>'.  If the fetch
    returns an empty body, the URL is re-queued in db 1 with
    retry_count + 1 until the module-wide `retries` budget runs out.

    Returns the scraped (company, position, salary, require, contack,
    introduce) tuple; every field is '' when nothing was scraped.
    """
    company, position, salary, require, contack, introduce = '', '', '', '', '', ''
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        # maxsplit=1 keeps any '|' that appears inside the URL itself intact.
        count_str, detail_href = redis_value.split('|', 1)
        conn_count = int(count_str)
        text = get_url_txt(url=detail_href, headers=headers, encoding=encoding)
        if len(text):
            xpath = etree.HTML(text)
            company = xpath.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]//text()')
            position = xpath.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1//text()')
            salary = xpath.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()')
            require = xpath.xpath('/html/body/div[3]/div[2]/div[3]/div[1]//text()')
            contack = xpath.xpath('/html/body/div[3]/div[2]/div[3]/div[2]//text()')
            introduce = xpath.xpath('/html/body/div[3]/div[2]/div[3]/div[3]//text()')
            company = ''.join(company).strip()
            position = ''.join(position).strip()
            salary = ''.join(salary).strip()
            require = ''.join(require).replace('\r\n', '').strip()
            contack = ''.join(contack).replace('\r\n', '').strip()
            introduce = ''.join(introduce).replace('\r\n', '').strip()
            # FIX: close the per-call MongoClient instead of leaking a
            # connection pool for every scraped page.
            with pymongo.MongoClient('localhost:27017') as mongo_client:
                mongo_client['db']['table'].insert_one(
                    {'company': company,
                     'position': position,
                     'salary': salary,
                     'require': require,
                     'contack': contack,
                     'introduce': introduce, })
        elif conn_count < retries:
            redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)).rpush(
                redis_key_detail, str(conn_count + 1) + '|' + detail_href)
    except Exception as e:
        print(e)
    return company, position, salary, require, contack, introduce


def get_detail_info():
    """Drain the detail-URL queue (db 1), dispatching each URL to a process
    pool that scrapes job details into MongoDB.

    The outer loop repeats because failed URLs may be re-queued by the
    workers; it exits once the queue stays empty after a full drain.
    """
    try:
        # FIX: create one Redis client for the whole drain instead of a new
        # ConnectionPool + client for every llen/lpop call.
        detail_client = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1))
        while detail_client.llen(redis_key_detail) > 0:
            workers = multiprocessing.Pool()
            while True:
                redis_value = detail_client.lpop(redis_key_detail)
                if redis_value is None:
                    break
                workers.apply_async(
                    get_detail_info_,
                    (redis_value.decode(encoding='utf8', errors='ignore'),))
            workers.close()
            workers.join()
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Full crawl pipeline (uncomment to re-scrape from scratch):
    # pymongo.MongoClient('localhost:27017')['db']['table'].drop()
    # get_page_href('逆向')
    # get_detail_href()
    # get_detail_info()

    # Export every scraped document from MongoDB to query.xlsx.
    # FIX: one MongoClient (closed when done) instead of two leaked ones,
    # and the unused `cols` variable is gone.
    client = pymongo.MongoClient('localhost:27017')
    book = xlsxwriter.Workbook('query.xlsx')
    sheet = book.add_worksheet('sheet1')
    columns = ('company', 'position', 'salary', 'require', 'contack', 'introduce')
    for row, doc in enumerate(client['db']['table'].find({})):
        for col, key in enumerate(columns):
            sheet.write(row, col, doc[key])
    book.close()
    client.close()

# 版权声明:本文为dailycode原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# 本文链接:https://www.cnblogs.com/dailycode/p/12495587.html