# Start over.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider
import re
class QicheSpider(CrawlSpider):
    """Scrapy spider named ``qiche``, restricted to ``autohome.com.cn``.

    Crawling starts from one ``<LETTER>_photo.html`` page under
    ``/grade/carhtml/`` for each letter produced by ``range(97, 99)``,
    i.e. the pages for ``A`` and ``B`` only.
    """

    name = 'qiche'
    allowed_domains = ['autohome.com.cn']

    # chr(97) and chr(98) are 'a' and 'b'; the URL uses the upper-cased letter.
    # NOTE(review): the original carried a bare "# 123" note here -- presumably
    # the upper bound (range(97, 123)) that would cover a-z. Confirm whether
    # limiting the crawl to two letters is intentional.
    start_urls = [
        'https://www.autohome.com.cn/grade/carhtml/' + i.upper() + '_photo.html'
        for i in map(chr, range(97, 99))
    ]

    # Link-following rules are disabled. `Rule` is not imported in the visible
    # part of this file; import it from scrapy.spiders before re-enabling.
    # rules = (
    #     Rule(LinkExtractor(allow=()), callback='parse_item'),
    # )
def
parse_start_url(self, response):
brand_node_list = response.xpath(“//dl”)
for brand_node in brand_node_list:
# 商标名
brand_name = brand_node.xpath(“./dt/div/a/text()”).extract_first()
producer_list = brand_node.xpath(“./dd/div[@class=’h3-tit’]/text()”).extract()
producer_ul_node_list = brand_node.xpath(“./dd/ul”)
for producer_name, producer_ul_list in
zip(producer_list, producer_ul_node_list):