Scraping Python job listings from 51job (Qianchengwuyou) with Python
Author: 我姓刘却留不住你的心
The fields scraped in this post are: job title, company name, company location, salary, and posting date.
Create the Scrapy project
scrapy startproject qianchengwuyou
cd qianchengwuyou
scrapy genspider -t crawl qcwy www.xxx.com
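These commands generate a standard Scrapy project; the files edited in the rest of this post live in the following locations (standard Scrapy layout):

qianchengwuyou/
├── scrapy.cfg
└── qianchengwuyou/
    ├── items.py          # field definitions
    ├── pipelines.py      # MySQL pipeline
    ├── settings.py       # pipeline and request-header configuration
    └── spiders/
        └── qcwy.py       # the CrawlSpider generated by genspider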
Define the fields to scrape in items.py
import scrapy


class QianchengwuyouItem(scrapy.Item):
    # define the fields for your item here like:
    job_title = scrapy.Field()
    company_name = scrapy.Field()
    company_address = scrapy.Field()
    salary = scrapy.Field()
    release_time = scrapy.Field()
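A Scrapy item behaves like a dict, so the spider below assigns to these fields by key. A quick sanity check from a Python shell inside the project (the sample value is only for illustration):

from qianchengwuyou.items import QianchengwuyouItem

item = QianchengwuyouItem()
item['job_title'] = 'Python开发工程师'  # hypothetical value, just to show the dict-like API
print(dict(item))                       # {'job_title': 'Python开发工程师'}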
Write the main spider in qcwy.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qianchengwuyou.items import QianchengwuyouItem


class QcwySpider(CrawlSpider):
    name = 'qcwy'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html?']
    # A full search URL looks like:
    # https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,7.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

    # Follow every paginated result page (the page number is the last path segment)
    rules = (
        Rule(LinkExtractor(allow=r'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,(\d+).html?'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # position()>1 skips the header row of the result list
        list_job = response.xpath('//div[@id="resultList"]/div[@class="el"][position()>1]')
        for job in list_job:
            item = QianchengwuyouItem()
            item['job_title'] = job.xpath('./p/span/a/@title').extract_first()
            item['company_name'] = job.xpath('./span[1]/a/@title').extract_first()
            item['company_address'] = job.xpath('./span[2]/text()').extract_first()
            item['salary'] = job.xpath('./span[3]/text()').extract_first()
            item['release_time'] = job.xpath('./span[4]/text()').extract_first()
            yield item
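51job changes its page markup from time to time, so if the spider returns empty fields it is worth checking the XPath expressions interactively with scrapy shell. A minimal sketch, run against the first result page:

# scrapy shell "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html"
# Inside the shell, `response` is already populated:
jobs = response.xpath('//div[@id="resultList"]/div[@class="el"][position()>1]')
len(jobs)                                           # number of job rows on the page
jobs[0].xpath('./p/span/a/@title').extract_first()  # job title of the first row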
Write the item pipeline (storing items in MySQL) in pipelines.py
import pymysql


class QianchengwuyouPipeline(object):
    conn = None
    mycursor = None

    def open_spider(self, spider):
        print('Connecting to the database...')
        self.conn = pymysql.connect(host='172.16.25.4', user='root', password='root', db='scrapy')
        self.mycursor = self.conn.cursor()

    def process_item(self, item, spider):
        print('Writing to the database...')
        job_title = item['job_title']
        company_name = item['company_name']
        company_address = item['company_address']
        salary = item['salary']
        release_time = item['release_time']
        # Note: string interpolation is kept from the original post; for untrusted
        # input, a parameterized query (cursor.execute(sql, args)) is safer.
        sql = 'insert into qcwy VALUES (null,"%s","%s","%s","%s","%s")' % (
            job_title, company_name, company_address, salary, release_time)
        self.mycursor.execute(sql)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        print('Finished writing to the database...')
        self.mycursor.close()
        self.conn.close()
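The pipeline assumes a MySQL database named scrapy that already contains a table qcwy with an auto-increment id followed by five text columns, matching the INSERT above. The original post does not show the schema; a minimal sketch to create it once beforehand (column names and types are my assumptions) could look like this:

import pymysql

conn = pymysql.connect(host='172.16.25.4', user='root', password='root', db='scrapy')
with conn.cursor() as cur:
    # Column order matches the INSERT in the pipeline.
    cur.execute("""
        CREATE TABLE IF NOT EXISTS qcwy (
            id INT AUTO_INCREMENT PRIMARY KEY,
            job_title VARCHAR(255),
            company_name VARCHAR(255),
            company_address VARCHAR(255),
            salary VARCHAR(64),
            release_time VARCHAR(32)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()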
Enable the item pipeline and set the request header (User-Agent) in settings.py
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
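Depending on the Scrapy version, two more settings are often needed here: newer project templates set ROBOTSTXT_OBEY = True by default, which can filter out the search pages, and the JSON exporter escapes non-ASCII characters unless told otherwise. These lines are not in the original post, only commonly used additions:

ROBOTSTXT_OBEY = False            # the default template may otherwise filter requests
FEED_EXPORT_ENCODING = 'utf-8'    # keep Chinese text readable in qcwy.json
DOWNLOAD_DELAY = 1                # optional: be gentle with the site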
Run the spider, exporting the items to a .json file at the same time
scrapy crawl qcwy -o qcwy.json --nolog
Finally, check whether the data was written to the database successfully.
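Besides checking in the MySQL client, a quick verification can be done from Python using the same connection details as the pipeline (a sketch, assuming the qcwy table above):

import pymysql

conn = pymysql.connect(host='172.16.25.4', user='root', password='root', db='scrapy')
with conn.cursor() as cur:
    cur.execute('SELECT COUNT(*) FROM qcwy')
    print('rows stored:', cur.fetchone()[0])
    cur.execute('SELECT * FROM qcwy LIMIT 5')
    for row in cur.fetchall():
        print(row)
conn.close()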