import os
import re
import time
from time import sleep

import openpyxl
import xlrd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

cwd = os.getcwd()
date = time.strftime('%Y.%m.%d', time.localtime(time.time()))        # current date string
filepath = '商品链接状态' + date + '.xlsx'                            # new workbook named with the current date
wb1 = openpyxl.Workbook()
ws1 = wb1.active
wb1.save(filepath)

path = cwd + '\\商品列表.xlsx'                                        # input product list
path1 = cwd + '\\' + '商品链接状态' + date + '.xlsx'                  # path of the newly created workbook
# print(path1)
def down_data(url):
    # Reference: https://www.cnblogs.com/muchengnanfeng/p/9553186.html
    # ChromeDriver is a lightweight service. For a single task, or when the browser is not started
    # very often, driver.quit() is enough to end the ChromeDriver process cleanly. When a larger
    # suite starts and closes the browser frequently, the added delay can leave browser processes
    # hanging around. To avoid that, the ChromeDriver process is managed explicitly through a
    # Service object (running the server in a child process), so it is stopped as soon as the page
    # has been fetched. A Selenium 4 variant that binds the driver to this Service is sketched
    # right after this function.
    c_service = Service(r'D:\Python\Scripts\chromedriver.exe')
    c_service.command_line_args()
    c_service.start()
    # Launch the browser with an existing user profile (non-headless) for the crawl
    profile_directory = r'--user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data'
    option = webdriver.ChromeOptions()
    # option.add_argument('--no-sandbox')
    # option.add_argument('--disable-dev-shm-usage')
    # option.add_argument('--headless')
    option.add_argument(profile_directory)
    driver = webdriver.Chrome(options=option)
    driver.get(url)
    sleep(3)                      # give the page time to render
    data = driver.page_source
    # print(data)
    sleep(2)
    driver.quit()
    sleep(1)
    c_service.stop()              # make sure the manually started chromedriver process is gone
    return data
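
# The helper below is only a sketch and is not called anywhere in this script: it assumes Selenium 4,
# where the Service object can be passed straight to webdriver.Chrome. Binding the driver to the
# Service means a single driver.quit() also terminates the chromedriver process, so the manual
# start()/stop() calls used in down_data() above are no longer needed. down_data_v4 is a hypothetical
# name; the chromedriver and profile paths are assumed to be the same as above.
def down_data_v4(url):
    service = Service(r'D:\Python\Scripts\chromedriver.exe')
    option = webdriver.ChromeOptions()
    option.add_argument(r'--user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data')
    driver = webdriver.Chrome(service=service, options=option)   # Selenium 4 signature
    try:
        driver.get(url)
        sleep(3)                                                 # give the page time to render
        return driver.page_source
    finally:
        driver.quit()                                            # also stops the chromedriver managed by Service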

def down_data1(url):
    # Classify a Tmall detail page: valid, 404, or delisted
    data = down_data(url)
    buy_btn = re.findall('class="do-purchase ms-yh  " title=".*?" rel="nofollow"><span>(.*?)</span></a>', data, re.S)
    err_404 = re.findall('<em class="hightlight">(.*?)</em>', data, re.S)
    if buy_btn and buy_btn[0] == "立即订购":        # "order now" button present
        sation = "商品有效"                          # item valid
    elif err_404 and err_404[0] == "Error 404":
        sation = "抱歉,您要访问的页面不存在"           # page not found
    else:
        sation = "商品下架"                          # item delisted
    print(sation)
    return sation

def down_data2(url):
    # Classify a Taobao item page: valid, 404, or delisted
    data = down_data(url)
    buy_btn = re.findall('class="J_LinkBuy" shortcut-key=".*?" shortcut-label="(.*?)"', data, re.S)
    err_hd = re.findall('<div class="error-notice-hd">(.*?),', data, re.S)
    if buy_btn and buy_btn[0] == "立即购买":        # "buy now" button present
        sation = "商品有效"                          # item valid
    elif err_hd and err_hd[0] == "很抱歉":
        sation = "抱歉,您要访问的页面不存在"           # page not found
    else:
        sation = "商品下架"                          # item delisted
    print(sation)
    return sation

# Read every sku and product link from the product list workbook
# (reading .xlsx with xlrd requires xlrd < 2.0; the 2.0 release dropped .xlsx support)
wb = xlrd.open_workbook(path)
ws = wb.sheets()[0]
sku = []
url = []
for i in range(ws.nrows):
    row = ws.row_values(i)
    sku.append(row[0].split('-')[0])    # column A: sku code, keep the part before '-'
    url.append(row[13])                 # column N: '|'-separated product links
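
# As a hedge against only having xlrd >= 2.0 installed, the same two columns could be read with
# openpyxl (already imported above). This is only a sketch and is not called by the script;
# read_sku_urls is a hypothetical name, and it assumes the same layout on the first sheet
# (sku in column A, '|'-separated links in column N).
def read_sku_urls(xlsx_path):
    wb_in = openpyxl.load_workbook(xlsx_path, read_only=True)
    ws_in = wb_in.worksheets[0]
    skus, urls = [], []
    for row in ws_in.iter_rows(values_only=True):
        skus.append(str(row[0]).split('-')[0])   # column A: sku code, keep the part before '-'
        urls.append(row[13])                     # column N: '|'-separated product links
    return skus, urls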

sku1 = []
url1 = []
sation1 = []

sku_url = list(zip(sku, url))           # (sku, links) pairs
print(len(sku_url), sku_url)
sku_urls = list(set(sku_url))           # de-duplicated (sku, links) pairs
print(len(sku_urls), sku_urls)

# The three possible statuses returned by down_data1/down_data2
sationa = '商品有效'
sationb = '抱歉,您要访问的页面不存在'
sationc = '商品下架'

for skuv, links in sku_urls:
    zurl = links.split('|')                        # all links recorded under one sku
    for link in zurl:                              # check each link in turn
        if link.startswith('https://detail'):      # Tmall detail page
            sation = down_data1(link)
        elif link.startswith('https://item'):      # Taobao item page
            sation = down_data2(link)
        else:
            continue                               # skip anything that is not a Tmall/Taobao link
        sku1.append(skuv)
        url1.append(link)
        sation1.append(sation)                     # sation is one of sationa / sationb / sationc

# Header row of the output workbook
ws1.cell(1, 1).value = "sku"
ws1.cell(1, 2).value = "商品链接"
ws1.cell(1, 3).value = "链接状态"

# One row per checked link
for i in range(1, len(sku1) + 1):
    ws1.cell(i + 1, 1).value = sku1[i - 1]
    ws1.cell(i + 1, 2).value = url1[i - 1]
    ws1.cell(i + 1, 3).value = sation1[i - 1]
wb1.save(path1)

 
