A super-basic little Amazon spider

You'll need a bit of Python background to follow this post; experienced readers are advised to skip it…

The page to crawl (the Amazon.cn search results page, i.e. https://www.amazon.cn/s?k=<keyword>&page=<n>):

The fields to extract from each result: product title, image URL, price, star rating, number of ratings, whether the item is self-operated (sold directly by Amazon), and the product detail URL.

The code:

import requests
import re
import json

from bs4 import BeautifulSoup
from urllib.parse import urljoin


headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
# Fake a browser User-Agent so the request looks like it comes from a real browser
original_url = 'https://www.amazon.cn'


def get_onePage(key, page):
    url = 'https://www.amazon.cn/s?k=' + key + '&page=' + str(page)
    # The value after k= is the search keyword, page= is the result page number
    r = requests.Session()
    # Using a Session lets requests remember cookies across requests
    res = r.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    # Use the encoding requests detects from the content itself rather than the one the site declares
    soup = BeautifulSoup(res.text, "lxml")
    result = soup.find_all(name='div', attrs={'data-index': True, 'class': re.compile("sg-col-4-of-16 sg-col")})
    onePage = {}

    for i in result:
        '''
        Extract the details of one item. This depends entirely on the page
        structure, so don't worry too much about the exact selectors.
        '''
        title = i.find(name="span", attrs={'class': 'a-size-base-plus a-color-base a-text-normal'})
        title = title.string if title else ''
        # Product title
        imgUrl = i.find(name="img", attrs={'class': 's-image'})
        imgUrl = imgUrl.attrs['src'] if imgUrl else ''
        # Product image URL
        price = i.find(name="span", attrs={'class': 'a-offscreen'})
        price = price.string if price else ''
        # Product price
        star = i.find(name="span", attrs={'class': 'a-icon-alt'})
        star = star.string if star else ''
        # Star rating
        numberOfStar = i.find(name="span", attrs={'class': 'a-size-base'})
        numberOfStar = numberOfStar.string if numberOfStar else ''
        # Number of ratings
        selfSupport = i.find(name="span", attrs={'class': 's-self-operated aok-align-bottom aok-inline-block a-text-normal'})
        selfSupport = selfSupport.string if selfSupport else ''
        # Whether the item is self-operated (sold directly by Amazon)
        detailUrl = i.find(name='a', attrs={'class': 'a-link-normal a-text-normal'})
        detailUrl = urljoin(original_url, detailUrl.attrs['href']) if detailUrl else ''
        # Product detail page URL
        onePage[detailUrl] = [{'title': title}, {'imgUrl': imgUrl}, {'price': price}, {'star': star}, {'numberOfStar': numberOfStar}, {'selfSupport': selfSupport}]
    return onePage


for i in range(1, 11):
    # Without logging in, Amazon only returns 10 pages of results for a keyword
    content = get_onePage('电脑', i)
    with open('amazon.json', 'a', encoding='utf-8') as f:
        json.dump(content, f, ensure_ascii=False, sort_keys=True, indent=4)
        # Append each page's results to the JSON file
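
Note that because each loop iteration appends a separate json.dump output, amazon.json ends up holding ten concatenated JSON objects rather than one valid JSON document. If you would rather have a single file that json.load can read directly, one option (not part of the original post; it reuses get_onePage and the json import from the code above, and amazon_all.json is just an example filename) is to merge the pages first and dump once:

all_results = {}
for i in range(1, 11):
    # Merge each page's {detailUrl: [...]} entries into one dict
    all_results.update(get_onePage('电脑', i))

with open('amazon_all.json', 'w', encoding='utf-8') as f:
    # One valid JSON document that json.load can read back in a single call
    json.dump(all_results, f, ensure_ascii=False, sort_keys=True, indent=4)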

The saved result (written to amazon.json):
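
Each page is written as one pretty-printed JSON object whose keys are the product detail URLs and whose values are lists of single-field dicts (title, imgUrl, price, star, numberOfStar, selfSupport). Because the file holds several such objects back to back, a plain json.load on it will fail. A minimal sketch for reading the appended objects back, assuming the amazon.json produced by the loop above, is to step through the text with json.JSONDecoder().raw_decode:

import json

decoder = json.JSONDecoder()
pages = []
with open('amazon.json', encoding='utf-8') as f:
    text = f.read()

pos = 0
while pos < len(text):
    # Skip any whitespace between the concatenated JSON documents
    while pos < len(text) and text[pos].isspace():
        pos += 1
    if pos >= len(text):
        break
    obj, pos = decoder.raw_decode(text, pos)
    pages.append(obj)

print(len(pages))  # one dict per crawled page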

Copyright notice: This is an original article by byadmin, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/byadmin/articles/spider-1.html