# Python.51job
"""
https://www.51job.com/
"""
import requests
import requests.adapters
import string
import urllib.parse
from lxml import etree
import redis
from pyquery import PyQuery as pq
import multiprocessing
import pymongo
import datetime
import xlsxwriter
retries = 5  # maximum download attempts per URL before a page/detail link is dropped
redis_key_page = 'redis_key_page'  # redis list key (db 0): queued search-result page URLs, '<attempt>|<url>'
redis_key_detail = 'redis_key_detail'  # redis list key (db 1): queued job-detail URLs, '<attempt>|<url>'
def get_url_txt(url, headers, encoding, data=None):
    """Download *url* and return the decoded body, or '' on any failure.

    Parameters
    ----------
    url : str
        Target URL.
    headers : dict
        HTTP request headers (User-Agent etc.).
    encoding : str
        Encoding used to decode the response body (the site serves 'gbk').
    data : optional
        Request body forwarded to ``session.get`` when given.
        NOTE(review): sending a body with GET is unusual — confirm the
        site actually requires it (``params=`` may be what was meant).

    Returns
    -------
    str
        Response text when the status code is 200, otherwise ''.
    """
    ret = ''
    try:
        # Use the module-level retry budget instead of a magic number,
        # so it stays in sync with the re-queue limit used elsewhere.
        requests.adapters.DEFAULT_RETRIES = retries
        session = requests.session()
        session.keep_alive = False  # do not keep persistent connections alive
        try:
            if data is None:
                response = session.get(url, headers=headers)
            else:
                response = session.get(url, headers=headers, data=data)
            try:
                if response.status_code == 200:
                    response.encoding = encoding
                    ret = response.text
            finally:
                # Close even on non-200 / decode failure (the original
                # leaked the connection on those paths).
                response.close()
        finally:
            session.close()
    except Exception as e:
        print(e)
    return ret
def encode_url(keys):
    """Percent-encode *keys* twice for use inside a 51job search path.

    Characters outside ``string.printable`` (e.g. CJK) are quoted once,
    then every percent sign in the result is itself re-escaped as
    ``%25``. Note the quirk (preserved from the original): even a pure
    ASCII keyword gains a leading ``%25``. Returns '' on failure.
    """
    try:
        quoted = urllib.parse.quote(keys, safe=string.printable)
        # Each non-empty chunk between '%' signs gets a '%25' prefix:
        # '%E9%80' -> '%25E9%2580'.
        return ''.join('%25' + chunk for chunk in quoted.split('%') if chunk)
    except Exception as e:
        print(e)
        return ''
def get_page_count(keys):
    """Return the number of search-result pages for keyword *keys*.

    Downloads page 1 of the search results (GBK-encoded) and parses the
    'current/total' page counter from the result list. Returns 0 on any
    failure.
    """
    try:
        keys_str = encode_url(keys)
        # BUGFIX: the query string previously contained the mojibake
        # '°reefrom=99' ('&deg' rendered as '°'); restored '&degreefrom=99'.
        url = ('https://search.51job.com/list/030000,000000,0000,00,9,99,'
               + keys_str + ',2,1'
               + '.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
                 '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
                 '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
                 '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        text = get_url_txt(url=url, headers=headers, encoding='gbk')
        xpath = etree.HTML(text)
        # The counter node reads like '1/42'; take the total after '/'.
        page_count = ''.join(xpath.xpath('//*[@id="resultList"]/div[2]/div[5]//text()'))
        return int(page_count.split('/')[1].replace(' ', ''))
    except Exception as e:
        print(e)
        return 0
def flush_page_href():
    """Delete the page-URL queue (redis db 0) so a crawl starts clean."""
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        redis.StrictRedis(connection_pool=pool).delete(redis_key_page)
    except Exception as e:
        print(e)
def get_page_href(keys):
    """Queue every search-result page URL for *keys* into redis db 0.

    Each queue entry is '<attempt>|<url>' with the attempt counter
    starting at 0, so failed downloads can later be re-queued with an
    incremented counter.
    """
    try:
        flush_page_href()
        page_count = get_page_count(keys)
        keys_str = encode_url(keys)
        prefix = ('https://search.51job.com/list/030000,000000,0000,00,9,99,'
                  + keys_str + ',2,')
        # BUGFIX: restored '&degreefrom=99' (was the mojibake '°reefrom=99').
        suffix = ('.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
                  '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
                  '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
                  '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=')
        # One connection for the whole loop instead of one per page.
        conn = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        for page in range(1, page_count + 1):
            conn.rpush(redis_key_page, '0|' + prefix + str(page) + suffix)
    except Exception as e:
        print(e)
def flush_detail_href():
    """Delete the job-detail queue (redis db 1) before refilling it."""
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)
        redis.StrictRedis(connection_pool=pool).delete(redis_key_detail)
    except Exception as e:
        print(e)
def get_detail_href_(redis_value):
    """Extract job-detail links from one search-result page.

    *redis_value* has the form '<attempt>|<page_url>'. On success every
    detail href found is pushed to the detail queue (redis db 1) as
    '0|<href>'. On an empty download the page URL is re-queued in db 0
    with the attempt counter incremented, bounded by ``retries``.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        conn_count = int(redis_value.split('|')[0])
        page_url = redis_value.split('|')[1]
        text = get_url_txt(url=page_url, headers=headers, encoding='gbk')
        if len(text):
            doc = pq(text)
            for item in doc('.t1'):
                href = pq(item).find('a').attr('href')
                # BUGFIX: the old guard skipped only when BOTH title and
                # href were None, so a missing href with a present title
                # raised a TypeError on "'0|' + None" and aborted the
                # remaining items of this page. Skip on missing href.
                if href is None:
                    continue
                redis.StrictRedis(
                    connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)).rpush(
                    redis_key_detail, '0|' + href)
        elif conn_count < retries:
            # Empty body: re-queue the page for another attempt.
            redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)).rpush(
                redis_key_page, str(conn_count + 1) + '|' + page_url)
    except Exception as e:
        print(e)
def get_detail_href():
    """Drain the page queue (db 0) in parallel, filling the detail queue.

    Runs pool rounds while the page queue is non-empty — workers may
    re-queue failed pages, so one round is not necessarily enough.
    """
    try:
        flush_detail_href()
        conn_kwargs = dict(host='127.0.0.1', port=6379, db=0)
        while redis.StrictRedis(
                connection_pool=redis.ConnectionPool(**conn_kwargs)).llen(redis_key_page) > 0:
            workers = multiprocessing.Pool()
            while True:
                raw = redis.StrictRedis(
                    connection_pool=redis.ConnectionPool(**conn_kwargs)).lpop(redis_key_page)
                if raw is None:
                    break
                workers.apply_async(
                    get_detail_href_, (raw.decode(encoding='utf8', errors='ignore'),))
            workers.close()
            workers.join()
    except Exception as e:
        print(e)
def get_detail_info_(redis_value):
    """Scrape one job-detail page and persist the record to MongoDB.

    *redis_value* has the form '<attempt>|<detail_url>'. On a successful
    download six fields are parsed, inserted into db.table, and returned
    as a tuple. On an empty download the URL is re-queued in redis db 1
    with the attempt counter incremented (bounded by ``retries``).
    Returns six empty strings when nothing could be parsed.
    """
    company, position, salary, require, contack, introduce = ('',) * 6
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        conn_count = int(redis_value.split('|')[0])
        detail_href = redis_value.split('|')[1]
        text = get_url_txt(url=detail_href, headers=headers, encoding='gbk')
        if len(text):
            tree = etree.HTML(text)

            def grab(expr):
                # Concatenate all text nodes matched by the absolute xpath.
                return ''.join(tree.xpath(expr))

            company = grab('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]//text()').strip()
            position = grab('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1//text()').strip()
            salary = grab('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong//text()').strip()
            require = grab('/html/body/div[3]/div[2]/div[3]/div[1]//text()').replace('\r\n', '').strip()
            contack = grab('/html/body/div[3]/div[2]/div[3]/div[2]//text()').replace('\r\n', '').strip()
            introduce = grab('/html/body/div[3]/div[2]/div[3]/div[3]//text()').replace('\r\n', '').strip()
            # NOTE: 'contack' (sic) is kept — it is the stored document key.
            pymongo.MongoClient('localhost:27017')['db']['table'].insert_one(
                {'company': company,
                 'position': position,
                 'salary': salary,
                 'require': require,
                 'contack': contack,
                 'introduce': introduce, })
        elif conn_count < retries:
            redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)).rpush(
                redis_key_detail, str(conn_count + 1) + '|' + detail_href)
    except Exception as e:
        print(e)
    return company, position, salary, require, contack, introduce
def get_detail_info():
    """Drain the detail queue (db 1) in parallel, persisting job records.

    Runs pool rounds while the queue is non-empty — workers may re-queue
    failed detail pages, so one round is not necessarily enough.
    """
    try:
        conn_kwargs = dict(host='127.0.0.1', port=6379, db=1)
        while redis.StrictRedis(
                connection_pool=redis.ConnectionPool(**conn_kwargs)).llen(redis_key_detail) > 0:
            workers = multiprocessing.Pool()
            while True:
                raw = redis.StrictRedis(
                    connection_pool=redis.ConnectionPool(**conn_kwargs)).lpop(redis_key_detail)
                if raw is None:
                    break
                workers.apply_async(
                    get_detail_info_, (raw.decode(encoding='utf8', errors='ignore'),))
            workers.close()
            workers.join()
    except Exception as e:
        print(e)
if __name__ == '__main__':
    # Full crawl pipeline (uncomment to run a fresh scrape first):
    #   pymongo.MongoClient('localhost:27017')['db']['table'].drop()
    #   get_page_href('逆向')
    #   get_detail_href()
    #   get_detail_info()
    #
    # Export every stored record from MongoDB to query.xlsx, one row per
    # job, columns in a fixed order. Fixes from the original: the unused
    # 'cols' variable is gone and a single MongoClient is reused instead
    # of constructing one per statement.
    fields = ('company', 'position', 'salary', 'require', 'contack', 'introduce')
    table = pymongo.MongoClient('localhost:27017')['db']['table']
    book = xlsxwriter.Workbook('query.xlsx')
    sheet = book.add_worksheet('sheet1')
    for row, record in enumerate(table.find({})):
        for col, field in enumerate(fields):
            sheet.write(row, col, record[field])
    book.close()
# 版权声明: 本文为 dailycode 原创文章, 遵循 CC 4.0 BY-SA 版权协议, 转载请附上原文出处链接和本声明。
# (Copyright notice: original article by dailycode, CC 4.0 BY-SA; keep attribution when reposting.)