# 跑1688和淘宝货源情况
"""Check whether the 1688 / Taobao source links of each SKU are still live.

The script reads ``商品列表.xlsx`` from the current working directory, takes
the SKU from the first column (text before the first ``-``) and the
``|``-separated source links from the 14th column, opens every link in
Chrome, classifies the rendered page and writes one ``(sku, link, status)``
row per link to ``商品链接状态<date>.xlsx`` in the same directory.
"""
import os
import re
import time
from time import sleep

import openpyxl
import xlrd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

cwd = os.getcwd()
date = time.strftime('%Y.%m.%d')  # current local date
filepath = '商品链接状态' + date + '.xlsx'  # report file named after today's date
path = os.path.join(cwd, '商品列表.xlsx')  # input workbook
path1 = os.path.join(cwd, filepath)  # full path of the report workbook

CHROMEDRIVER_PATH = r'D:\Python\Scripts\chromedriver.exe'
CHROME_PROFILE_ARG = (
    r'--user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data'
)

# The three statuses written to the report (runtime strings, do not translate).
STATUS_VALID = "商品有效"
STATUS_MISSING = "抱歉,您要访问的页面不存在"
STATUS_DELISTED = "商品下架"

# Page markers, compiled once instead of on every call.
_BUY_1688 = re.compile(
    r'class="do-purchase ms-yh " title=".*?" rel="nofollow"><span>(.*?)</span></a>',
    re.S,
)
_ERROR_1688 = re.compile(r'<em class="hightlight">(.*?)</em>', re.S)
_BUY_TAOBAO = re.compile(
    r'class="J_LinkBuy" shortcut-key=".*?" shortcut-label="(.*?)"', re.S
)
_ERROR_TAOBAO = re.compile(r'<div class="error-notice-hd">(.*?),', re.S)


def down_data(url):
    """Open *url* in Chrome and return the rendered page source.

    A ChromeDriver service is started for the duration of the call and
    stopped afterwards (background:
    https://www.cnblogs.com/muchengnanfeng/p/9553186.html), and the browser
    is launched with the local Chrome user profile so the request carries
    that profile's data.  Both the browser and the service are released in
    ``finally`` blocks, so a failing page load no longer leaves stray
    processes behind.

    Args:
        url: Address of the product page to load.

    Returns:
        The value of ``driver.page_source`` read three seconds after the
        page was requested.
    """
    c_service = Service(CHROMEDRIVER_PATH)
    c_service.start()
    try:
        option = webdriver.ChromeOptions()
        option.add_argument(CHROME_PROFILE_ARG)
        # NOTE(review): the service started above is not handed to the driver,
        # so webdriver.Chrome locates/starts its own chromedriver. Kept as in
        # the original because the ``service=`` keyword only exists in
        # Selenium 4 -- confirm the installed version before wiring it in.
        driver = webdriver.Chrome(options=option)
        try:
            driver.get(url)
            sleep(3)  # give the page time to render before reading it
            data = driver.page_source
            sleep(2)
        finally:
            driver.quit()
        sleep(1)
    finally:
        c_service.stop()
    return data


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, else None."""
    match = pattern.search(text)
    return match.group(1) if match else None


def _classify(html, buy_pattern, buy_label, error_pattern, error_label):
    """Map page HTML to one of the three status strings.

    Every path returns a status.  Pages whose buy button or error marker
    carries unexpected text fall back to ``STATUS_DELISTED``; the original
    code left its result variable unassigned on those paths and raised
    ``UnboundLocalError``.
    """
    if _first_group(buy_pattern, html) == buy_label:
        return STATUS_VALID
    if _first_group(error_pattern, html) == error_label:
        return STATUS_MISSING
    return STATUS_DELISTED


def down_data1(url):
    """Return (and print) the status of a 1688 product page.

    Args:
        url: 1688 product link.

    Returns:
        ``"商品有效"`` when the purchase button reads ``立即订购``,
        ``"抱歉,您要访问的页面不存在"`` when the page shows ``Error 404``,
        otherwise ``"商品下架"``.
    """
    sation = _classify(down_data(url), _BUY_1688, "立即订购", _ERROR_1688, "Error 404")
    print(sation)
    return sation


def down_data2(url):
    """Return (and print) the status of a Taobao product page.

    Args:
        url: Taobao product link.

    Returns:
        ``"商品有效"`` when the buy link is labelled ``立即购买``,
        ``"抱歉,您要访问的页面不存在"`` when the error notice starts with
        ``很抱歉``, otherwise ``"商品下架"``.
    """
    sation = _classify(down_data(url), _BUY_TAOBAO, "立即购买", _ERROR_TAOBAO, "很抱歉")
    print(sation)
    return sation


def _read_sku_links(workbook_path):
    """Return the unique ``(sku, links)`` pairs of the first sheet, in file order."""
    # NOTE(review): xlrd 2.0+ only reads .xls files; opening this .xlsx needs
    # xlrd < 2.0 (or a switch to openpyxl.load_workbook) -- confirm the
    # installed version.
    ws = xlrd.open_workbook(workbook_path).sheets()[0]
    sku_url = []
    for i in range(ws.nrows):
        row = ws.row_values(i)
        sku_url.append((row[0].split('-')[0], row[13]))
    print(len(sku_url), sku_url)
    # dict.fromkeys removes duplicates like set() did, but keeps the sheet
    # order so the report is reproducible from run to run.
    sku_urls = list(dict.fromkeys(sku_url))
    print(len(sku_urls), sku_urls)
    return sku_urls


def _write_report(rows, report_path):
    """Write the header plus one row per checked link to *report_path*."""
    wb1 = openpyxl.Workbook()
    ws1 = wb1.active
    ws1.append(["sku", "商品链接", "链接状态"])
    for row in rows:
        ws1.append(row)
    wb1.save(report_path)


def main():
    """Check every link in the product list and save the status report."""
    rows = []
    try:
        for sku, links in _read_sku_links(path):
            # One cell may hold several links separated by '|'.
            for link in links.split('|'):
                if link.startswith('https://detail'):
                    sation = down_data1(link)
                elif link.startswith('https://item'):
                    sation = down_data2(link)
                else:
                    # Neither a 1688 nor a Taobao link (e.g. header or empty cell).
                    continue
                rows.append((sku, link, sation))
    finally:
        # Save whatever was collected, even if a page load failed part-way.
        _write_report(rows, path1)


if __name__ == "__main__":
    main()
# 版权声明:本文为chunfang原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。