A simple crawler for the main and detail images of a 1688 product
1. First download and install Python 3 (Windows).
2. Open a command prompt (cmd).
3. Type python. If the interactive interpreter starts, printing the Python 3 version followed by a >>> prompt, the installation succeeded.
4. Type exit() to leave the Python interpreter.
5. Type D: to switch to drive D, then create a python folder and, inside it, an images folder where the downloaded pictures will be stored (see the commands below).
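A minimal way to do this from the command prompt, assuming the paths D:\python and D:\python\images that are hard-coded in the scripts below:

D:
mkdir python
mkdir python\images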
Create a file named xiang.py in the python folder and copy the code below into it (this script downloads the detail images):
import requests
from PIL import Image
from bs4 import BeautifulSoup

def xiang(url):
    circle1 = requests.get(url)
    soup1 = BeautifulSoup(circle1.text, 'lxml')
    for item in soup1.select('#desc-lazyload-container'):
        # the detail images are lazy-loaded; the real content URL sits in the data-tfs-url attribute
        data_url = item.get('data-tfs-url')
        print('A' + data_url)
        circle = requests.get(data_url)
        # collect the image URLs in count
        count = []
        # parse the fetched content with BeautifulSoup
        soup = BeautifulSoup(circle.text, 'lxml')
        # CSS selectors (found with Chrome's SelectorGadget plugin) pick out the tags, here every <img>
        for item in soup.select('img'):
            # src comes back wrapped in escaped quotes, so strip two characters from each end
            img_path = item.get('src')
            img_path = img_path[2:]
            img_path = img_path[:-2]
            count.append(img_path)
        # enumerate walks the collected URLs, giving each one an index i
        for i, v in enumerate(count):
            # fetch the image itself
            image1 = requests.get(v)
            # 'wb' opens the file for binary writing; i is an int, so convert it with str()
            with open(r'D:\python\images\img' + str(i) + '.jpg', 'wb') as file:
                # content is the raw binary body of the response
                file.write(image1.content)
            im = Image.open(r'D:\python\images\img' + str(i) + '.jpg')
            (x, y) = im.size  # read image size
            x_s = 750  # standard width
            y_s = int(y * x_s / x)  # height scaled to the standard width
            out = im.resize((x_s, y_s), Image.LANCZOS)  # high-quality resize (ANTIALIAS was removed in Pillow 10)
            out.save(r'D:\python\images\img' + str(i) + '.jpg')
            print(i)
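If you want to try xiang.py on its own before building the GUI, a minimal sketch is to append the lines below to the end of the file; the product URL is only a placeholder, not a real listing:

if __name__ == '__main__':
    # hypothetical 1688 detail-page URL; replace with a real product link
    xiang('https://detail.1688.com/offer/123456789.html')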
Create a file named zhu.py in the python folder and copy the code below into it (this script downloads the main gallery images):
import requests
from bs4 import BeautifulSoup
import json
from PIL import Image

def zhu(url):
    circle = requests.get(url)
    # collect the image URLs in count
    count = []
    # parse the fetched page with BeautifulSoup
    soup = BeautifulSoup(circle.text, 'lxml')
    # CSS selectors (found with Chrome's SelectorGadget plugin) pick out the gallery tabs
    for item in soup.select('.nav-tabs'):
        # each <li> carries its image info in the data-imgs attribute
        for li in item.find_all('li'):
            img_path = li.get('data-imgs')
            # data-imgs is a JSON string; the 'original' key holds the full-size image URL
            data2 = json.loads(img_path)
            count.append(data2['original'])
    # enumerate walks the collected URLs, giving each one an index i
    for i, v in enumerate(count):
        # fetch the image itself
        image1 = requests.get(v)
        # 'wb' opens the file for binary writing; i is an int, so convert it with str()
        with open(r'D:\python\images\zhu' + str(i) + '.jpg', 'wb') as file:
            # content is the raw binary body of the response
            file.write(image1.content)
        im = Image.open(r'D:\python\images\zhu' + str(i) + '.jpg')
        (x, y) = im.size  # read image size
        x_s = 750  # standard width
        y_s = int(y * x_s / x)  # height scaled to the standard width
        out = im.resize((x_s, y_s), Image.LANCZOS)  # high-quality resize (ANTIALIAS was removed in Pillow 10)
        out.save(r'D:\python\images\zhu' + str(i) + '.jpg')
        print(i)
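For reference, the data-imgs attribute that zhu.py parses is a small JSON string. The sketch below shows what json.loads sees; apart from the original key, which the code above relies on, the field names and URLs are assumptions for illustration only:

import json

# hypothetical attribute value; only the 'original' key is actually used by zhu.py
img_attr = '{"preview": "https://cbu01.alicdn.com/img/example.60x60.jpg", "original": "https://cbu01.alicdn.com/img/example.jpg"}'
print(json.loads(img_attr)['original'])  # prints the full-size image URL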
Create a file named gui.py in the python folder and copy the code below into it (a small Tkinter window that ties the two scripts together):
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
from tkinter import *  # import the Tkinter library
import tkinter.messagebox
from xiang import *
from zhu import *

root = Tk()  # create the main window
root.title("hello world")
root.geometry('1000x500')  # window size: width x height
root.resizable(width=False, height=True)

L1 = Label(root, text="爬虫网址:")  # label: "URL to crawl"
L1.pack(side=LEFT)
E1 = Entry(root, bd=8, width=100)  # input box for the product URL
E1.pack(side=LEFT)

def say_hi():
    url = E1.get()
    # only accept 1688 product detail URLs
    value = re.match('https://detail.1688.com/', url)
    if value is not None:
        xiang(url)
        zhu(url)
        tkinter.messagebox.showinfo('提示', '下载完毕')  # "Download finished"
    else:
        tkinter.messagebox.showinfo('提示', '网址错误')  # "Invalid URL"

b1 = Button(root, text='搜索', relief='raised', width=10, command=say_hi)  # "Search" button
b1.pack(side=LEFT)

root.mainloop()  # enter the Tk event loop
Be sure to install the required modules first: requests, bs4 (the beautifulsoup4 package), lxml (the parser BeautifulSoup is told to use) and PIL (install it as Pillow); json ships with the standard library.
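From the command prompt this is a single pip call, for example:

pip install requests beautifulsoup4 lxml pillow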
Then run gui.py with Python, paste a 1688 product detail URL into the input box and click the 搜索 button.
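For example, from the command prompt (assuming the files live in D:\python):

cd /d D:\python
python gui.py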