一、项目目录结构:

代码如下:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Job58CityItem(scrapy.Item):
    """Container for one job listing scraped from cd.58.com by JobsSpider."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()       # job title
    money = scrapy.Field()          # salary text; the spider stores "未知" when the listing omits it
    job_wel = scrapy.Field()        # list of welfare/benefit tags
    company = scrapy.Field()        # employer name
    position_type = scrapy.Field()  # job category
    xueli = scrapy.Field()          # required education level (学历)
    jingyan = scrapy.Field()       # required work experience (经验)
    address = scrapy.Field()        # workplace address

# -*- coding: utf-8 -*-
import scrapy
from ..items import Job58CityItem


class JobsSpider(scrapy.Spider):
    """Spider that crawls job listings from cd.58.com, one results page at a time."""
    name = 'jobs'
    allowed_domains = ['58.com']
    # Pagination starts at page 1; `url` is the page-number template.
    offset = 1
    url = "https://cd.58.com/job/pn{0}/"
    start_urls = [url.format(offset)]

    def parse(self, response):
        """Extract one Job58CityItem per listing row, then follow the next page.

        Yields:
            A Job58CityItem for every <li> in the result list, followed by a
            scrapy.Request for the next page while fewer than 100 pages
            have been requested.
        """
        for each in response.xpath("//ul[@id='list_con']/li"):
            item = Job58CityItem()
            # extract_first with a default avoids IndexError on rows that
            # are missing a field (the original extract()[0] crashed there).
            item['job_name'] = each.xpath(".//span[@class='name']/text()").extract_first("")
            # Salary is frequently absent; keep the original "未知" placeholder.
            item['money'] = each.xpath(".//p[@class='job_salary']/text()").extract_first("未知")
            # Collect all welfare tags in one XPath instead of a per-span loop.
            item['job_wel'] = each.xpath(".//div[@class='job_wel clearfix']/span/text()").extract()
            item['company'] = each.xpath(".//div[@class='comp_name']/a/text()").extract_first("")
            item['position_type'] = each.xpath(".//span[@class='cate']/text()").extract_first("")
            item['xueli'] = each.xpath(".//span[@class='xueli']/text()").extract_first("")
            item['jingyan'] = each.xpath(".//span[@class='jingyan']/text()").extract_first("")
            # BUG FIX: the original used an absolute path ("//span[...]"), which
            # matched the FIRST address on the whole page for every item; the
            # path must be relative to the current <li>.
            item['address'] = each.xpath(".//span[@class='address']/text()").extract_first("")
            yield item
        # BUG FIX: the original yielded the next-page request unconditionally,
        # so once offset reached 100 it re-requested page 100 forever. Only
        # follow the next page while under the cap.
        if self.offset < 100:
            self.offset += 1
            yield scrapy.Request(self.url.format(self.offset), callback=self.parse)
from scrapy import cmdline

if __name__ == '__main__':
    # Launch the spider exactly as if "scrapy crawl jobs" were typed in a shell,
    # so the project can be started directly from an IDE.
    cmdline.execute("scrapy crawl jobs".split())

数据:

源码链接:https://github.com/yangsphp/Scrapy-master


版权声明:本文为yang-2018原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/yang-2018/p/10966941.html