使用python做一个爬虫GUI程序
整体思路和之前的一篇博客爬虫豆瓣美女一致,这次加入了图片分类,同时利用tkinter模块做成GUI程序
效果如下:
整体代码如下:
1 # -*- coding:utf-8 -*- 2 3 import requests 4 from requests.exceptions import RequestException 5 import tkinter as tk 6 from tkinter import ttk 7 from bs4 import BeautifulSoup 8 import bs4 9 from tkinter import * 10 from tkinter.filedialog import askdirectory 11 import os 12 13 class DB(): 14 def __init__(self): 15 self.window = tk.Tk() #创建window窗口 16 self.window.title("Crawler Pics") # 定义窗口名称 17 # self.window.resizable(0,0) # 禁止调整窗口大小 18 self.menu = ttk.Combobox(self.window,width=6) 19 self.path = StringVar() 20 self.lab1 = tk.Label(self.window, text = "目标路径:") 21 self.lab2 = tk.Label(self.window, text="选择分类:") 22 self.lab3 = tk.Label(self.window, text="爬取页数:") 23 self.page = tk.Entry(self.window, width=5) 24 self.input = tk.Entry(self.window, textvariable = self.path, width=80) # 创建一个输入框,显示图片存放路径 25 self.info = tk.Text(self.window, height=20) # 创建一个文本展示框,并设置尺寸 26 27 self.menu[\'value\'] = (\'大胸妹\',\'小翘臀\', \'黑丝袜\', \'美腿控\', \'有颜值\',\'大杂烩\') 28 self.menu.current(0) 29 30 # 添加一个按钮,用于选择图片保存路径 31 self.t_button = tk.Button(self.window, text=\'选择路径\', relief=tk.RAISED, width=8, height=1, command=self.select_Path) 32 # 添加一个按钮,用于触发爬取功能 33 self.t_button1 = tk.Button(self.window, text=\'爬取\', relief=tk.RAISED, width=8, height=1,command=self.download) 34 # 添加一个按钮,用于触发清空输出框功能 35 self.c_button2 = tk.Button(self.window, text=\'清空输出\', relief=tk.RAISED,width=8, height=1, command=self.cle) 36 37 def gui_arrang(self): 38 """完成页面元素布局,设置各部件的位置""" 39 self.lab1.grid(row=0,column=0) 40 self.lab2.grid(row=1, column=0) 41 self.menu.grid(row=1, column=1,sticky=W) 42 self.lab3.grid(row=2, column=0,padx=5,pady=5,sticky=tk.W) 43 self.page.grid(row=2, column=1,sticky=W) 44 self.input.grid(row=0,column=1) 45 self.info.grid(row=3,rowspan=5,column=0,columnspan=3,padx=15,pady=15) 46 self.t_button.grid(row=0,column=2,padx=5,pady=5,sticky=tk.W) 47 self.t_button1.grid(row=1,column=2) 48 self.c_button2.grid(row=0,column=3,padx=5,pady=5,sticky=tk.W) 49 50 def get_cid(self): 51 category = { 52 \'DX\': 2, 53 \'XQT\': 6, 54 \'HSW\': 7, 55 \'MTK\': 3, 56 \'YYZ\': 4, 57 \'DZH\': 5 58 } 59 cid = None 60 if self.menu.get() == "大胸妹": 61 cid = category["DX"] 62 elif self.menu.get() == "小翘臀": 63 cid = category["XQT"] 64 elif self.menu.get() == "黑丝袜": 65 cid = category["HSW"] 66 elif self.menu.get() == "美腿控": 67 cid = category["MTK"] 68 elif self.menu.get() == "有颜值": 69 cid = category["YYZ"] 70 elif self.menu.get() == "大杂烩": 71 cid = category["DZH"] 72 return cid 73 74 def select_Path(self): 75 """选取本地路径""" 76 path_ = askdirectory() 77 self.path.set(path_) 78 79 def get_html(self, url, header=None): 80 """请求初始url""" 81 response = requests.get(url, headers=header) 82 try: 83 if response.status_code == 200: 84 # print(response.status_code) 85 # print(response.text) 86 return response.text 87 return None 88 except RequestException: 89 print("请求失败") 90 return None 91 92 def parse_html(self, html, list_data): 93 """提取img的名称和图片url,并将名称和图片地址以字典形式返回""" 94 soup = BeautifulSoup(html, \'html.parser\') 95 img = soup.find_all(\'img\') 96 for t in img: 97 if isinstance(t, bs4.element.Tag): 98 # print(t) 99 name = t.get(\'alt\') 100 img_src = t.get(\'src\') 101 list_data.append([name, img_src]) 102 dict_data = dict(list_data) 103 return dict_data 104 105 def get_image_content(self, url): 106 """请求图片url,返回二进制内容""" 107 print("正在下载", url) 108 self.info.insert(\'end\',"正在下载:"+url+\'\n\') 109 try: 110 r = requests.get(url) 111 if r.status_code == 200: 112 return r.content 113 return None 114 except RequestException: 115 return None 116 117 def download(self): 118 base_url = \'https://www.dbmeinv.com/index.htm?\' 119 for i in range(1, int(self.page.get())+1): 120 url = base_url + \'cid=\' + str(self.get_cid()) + \'&\' + \'pager_offset=\' + str(i) 121 # print(url) 122 header = { 123 \'Accept\': \'text/html,application/xhtml+xml,application/xml;q = 0.9, image/webp,image/apng,*/*;q=\' 124 \'0.8\', 125 \'Accept-Encoding\': \'gzip,deflate,br\', 126 \'Accept-Language\': \'zh-CN,zh;q=0.9,en;q=0.8\', 127 \'Cache-Control\': \'max-age=0\', 128 \'Connection\': \'keep-alive\', 129 \'Host\': \'www.dbmeinv.com\', 130 \'Upgrade-Insecure-Requests\': \'1\', 131 \'User-Agent\': \'Mozilla/5.0(WindowsNT6.1;Win64;x64) AppleWebKit/537.36(KHTML, likeGecko) Chrome/\' 132 \'70.0.3538.102Safari/537.36 \' 133 } 134 list_data = [] 135 html = self.get_html(url) 136 # print(html) 137 dictdata = self.parse_html(html, list_data) 138 139 140 root_dir = self.input.get() 141 case_list = ["大胸妹", "小翘臀", "黑丝袜", "美腿控", "有颜值", "大杂烩"] 142 for t in case_list: 143 if not os.path.exists(root_dir + \'/pics\'): 144 os.makedirs(root_dir + \'/pics\') 145 if not os.path.exists(root_dir + \'/pics/\' + str(t)): 146 os.makedirs(root_dir + \'/pics/\' + str(t)) 147 148 149 if self.menu.get() == "大胸妹": 150 save_path = root_dir + \'/pics/\' + \'大胸妹\' 151 for t in dictdata.items(): 152 try: 153 # file_path = \'{0}/{1}.{2}\'.format(save_path, t[1], \'jpg\') 154 file_path = save_path + \'/\' + t[0] + \'q\' + \'.jpg\' 155 if not os.path.exists(file_path): # 判断是否存在文件,不存在则爬取 156 with open(file_path, \'wb\') as f: 157 f.write(self.get_image_content(t[1])) 158 f.close() 159 print(\'文件保存成功\') 160 except FileNotFoundError: 161 continue 162 163 elif self.menu.get() == "小翘臀": 164 save_path = root_dir + \'/pics/\' + \'小翘臀\' 165 for t in dictdata.items(): 166 try: 167 # file_path = \'{0}/{1}.{2}\'.format(save_path, t[1], \'jpg\') 168 file_path = save_path + \'/\' + t[0] + \'q\' + \'.jpg\' 169 if not os.path.exists(file_path): # 判断是否存在文件,不存在则爬取 170 with open(file_path, \'wb\') as f: 171 f.write(self.get_image_content(t[1])) 172 f.close() 173 print(\'文件保存成功\') 174 except FileNotFoundError: 175 continue 176 177 elif self.menu.get() == "黑丝袜": 178 save_path = root_dir + \'/pics/\' + \'黑丝袜\' 179 for t in dictdata.items(): 180 try: 181 # file_path = \'{0}/{1}.{2}\'.format(save_path, t[1], \'jpg\') 182 file_path = save_path + \'/\' + t[0] + \'q\' + \'.jpg\' 183 if not os.path.exists(file_path): # 判断是否存在文件,不存在则爬取 184 with open(file_path, \'wb\') as f: 185 f.write(self.get_image_content(t[1])) 186 f.close() 187 print(\'文件保存成功\') 188 except FileNotFoundError: 189 continue 190 191 elif self.menu.get() == "美腿控": 192 save_path = root_dir + \'/pics/\' + \'美腿控\' 193 for t in dictdata.items(): 194 try: 195 # file_path = \'{0}/{1}.{2}\'.format(save_path, t[1], \'jpg\') 196 file_path = save_path + \'/\' + t[0] + \'q\' + \'.jpg\' 197 if not os.path.exists(file_path): # 判断是否存在文件,不存在则爬取 198 with open(file_path, \'wb\') as f: 199 f.write(self.get_image_content(t[1])) 200 f.close() 201 print(\'文件保存成功\') 202 except FileNotFoundError: 203 continue 204 205 elif self.menu.get() == "有颜值": 206 save_path = root_dir + \'/pics/\' + \'有颜值\' 207 for t in dictdata.items(): 208 try: 209 # file_path = \'{0}/{1}.{2}\'.format(save_path, t[1], \'jpg\') 210 file_path = save_path + \'/\' + t[0] + \'q\' + \'.jpg\' 211 if not os.path.exists(file_path): # 判断是否存在文件,不存在则爬取 212 with open(file_path, \'wb\') as f: 213 f.write(self.get_image_content(t[1])) 214 f.close() 215 print(\'文件保存成功\') 216 except OSError: 217 continue 218 219 elif self.menu.get() == "大杂烩": 220 save_path = root_dir + \'/pics/\' + \'大杂烩\' 221 for t in dictdata.items(): 222 try: 223 # file_path = \'{0}/{1}.{2}\'.format(save_path, t[1], \'jpg\') 224 file_path = save_path + \'/\' + t[0] + \'q\' + \'.jpg\' 225 if not os.path.exists(file_path): # 判断是否存在文件,不存在则爬取 226 with open(file_path, \'wb\') as f: 227 f.write(self.get_image_content(t[1])) 228 f.close() 229 print(\'文件保存成功\') 230 except FileNotFoundError: 231 continue 232 233 def cle(self): 234 """定义一个函数,用于清空输出框的内容""" 235 self.info.delete(1.0,"end") # 从第一行清除到最后一行 236 237 238 def main(): 239 t = DB() 240 t.gui_arrang() 241 tk.mainloop() 242 243 if __name__ == \'__main__\': 244 main()
关键点:
1.如何使用tkinter调用系统路径
2.构造url,参数化图片分类、抓取页数
3.使用tkinter获取输入参数传给执行代码
下面是练习的时候写的简陋版,不包含tkinter,主要是理清思路:
1 import re 2 import requests 3 import os 4 from requests.exceptions import RequestException 5 6 case = str(input("请输入你要下载的图片分类:")) 7 category = { 8 \'DX\': 2, 9 \'XQT\': 6, 10 \'HSW\': 7, 11 \'MTK\': 3, 12 \'YYZ\': 4, 13 \'DZH\': 5 14 } 15 def get_cid(): 16 cid = None 17 if case == "大胸妹": 18 cid = category["DX"] 19 elif case == "小翘臀": 20 cid = category["XQT"] 21 elif case == "黑丝袜": 22 cid = category["HSW"] 23 elif case == "美腿控": 24 cid = category["MTK"] 25 elif case == "有颜值": 26 cid = category["YYZ"] 27 elif case == "大杂烩": 28 cid = category["DZH"] 29 return cid 30 31 32 33 base_url = \'https://www.dbmeinv.com/index.htm?\' 34 url = base_url + \'cid=\' + str(get_cid()) 35 r = requests.get(url) 36 # print(r.status_code) 37 html = r.text 38 # print(r.text) 39 # print(html) 40 41 name_pattern = re.compile(r\'<img class="height_min".*?title="(.*?)"\', re.S) 42 src_pattern = re.compile(r\'<img class="height_min".*?src="(.*?.jpg)"\', re.S) 43 44 name = name_pattern.findall(html) # 提取title 45 src = src_pattern.findall(html) # 提取src 46 data = [name,src] 47 # print(name) 48 # print(src) 49 d=[] 50 for i in range(len(name)): 51 d.append([name[i], src[i]]) 52 53 dictdata = dict(d) 54 # for i in dictdata.items(): 55 # print(i) 56 57 def get_content(url): 58 try: 59 r = requests.get(url) 60 if r.status_code == 200: 61 return r.content 62 return None 63 except RequestException: 64 return None 65 66 root_dir = os.path.dirname(os.path.abspath(\'.\')) 67 68 case_list = ["大胸妹","小翘臀","黑丝袜","美腿控","有颜值","大杂烩"] 69 for t in case_list: 70 if not os.path.exists(root_dir+\'/pics\'): 71 os.makedirs(root_dir+\'/pics\') 72 if not os.path.exists(root_dir+\'/pics/\'+str(t)): 73 os.makedirs(root_dir+\'/pics/\'+str(t)) 74 75 def Type(type): 76 save_path = root_dir + \'/pics/\' + str(type) 77 # print(save_path) 78 for t in dictdata.items(): 79 try: 80 #file_path = \'{0}/{1}.{2}\'.format(save_path, t[1], \'jpg\') 81 file_path = save_path + \'/\' + t[0]+ \'q\' +\'.jpg\' 82 print("正在下载: "+\'"\'+t[0]+\'"\'+t[1]) 83 if not os.path.exists(file_path): # 判断是否存在文件,不存在则爬取 84 with open(file_path, \'wb\') as f: 85 f.write(get_content(t[1])) 86 f.close() 87 except FileNotFoundError: 88 continue 89 if case == "大胸妹": 90 Type(case) 91 92 elif case == "小翘臀": 93 Type(case) 94 95 elif case == "黑丝袜": 96 Type(case) 97 98 elif case == "美腿控": 99 Type(case) 100 101 elif case == "有颜值": 102 Type(case) 103 104 elif case == "大杂烩": 105 Type(case)
效果如下
版权声明:本文为hanmk原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。