# Scrape all LoL champion skins.
import os
import re
import traceback  # exception reporting

import requests
from bs4 import BeautifulSoup
# Fetch a page and return its decoded HTML text.
def get_url(url, hander):
    """Return the HTML body at *url*, or "" if the request fails.

    Args:
        url: page URL to fetch.
        hander: dict of HTTP headers (e.g. a User-Agent) passed to requests.

    Returns:
        Response text, with the encoding guessed from the content;
        "" on any request/HTTP error (the traceback is printed).
    """
    try:
        r = requests.get(url, headers=hander, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx into an exception
        r.encoding = r.apparent_encoding  # guess the real encoding from the body
        return r.text
    except requests.RequestException:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
        traceback.print_exc()  # report the failure but keep the crawl going
        return ""
# Parse a listing page and collect champion-detail links.
def prasing_page(lst, html):
    """Append every champion-page href found in *html* to *lst*.

    Looks for <li> elements whose class contains "boxShadow" and collects
    the href of each <a> tag inside them.

    Args:
        lst: list the hrefs are appended to (mutated in place).
        html: HTML text of a listing page.

    Returns:
        lst on success, "" if parsing raised (traceback is printed).
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        for li in soup.find_all('li', class_=re.compile('boxShadow')):
            for a in li('a'):  # li('a') == li.find_all('a')
                lst.append(a['href'])
        return lst
    except Exception:  # was a bare except
        traceback.print_exc()
        return ""
# Fetch each champion page, extract the skin data, and download the images.
def getUrl_prasingpag(lst, hander):
    """Download every skin image referenced by the champion pages in *lst*.

    Each URL is fetched and parsed a second time: inside <div> blocks whose
    class contains "othersPifuBox", <img src> holds the image URL, <p> the
    skin name and <span> the champion name.  Images are saved as
    "O:/lol_hero_jpg/<skin>--<champion>.jpg".

    Args:
        lst: iterable of champion-page URLs.
        hander: dict of HTTP headers used for the requests.
    """
    hero_img_url = []
    hero_skin_name = []
    hero_name = []
    for u in lst:
        try:
            r = requests.get(u, headers=hander, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            # Second-level parse of the champion page.
            soup = BeautifulSoup(r.text, "html.parser")
            for box in soup.find_all('div', class_=re.compile('othersPifuBox')):
                # distinct names per tag type (the original reused `m` for all three)
                for p in box('p'):
                    hero_skin_name.append(p.string)
                for img in box('img'):
                    hero_img_url.append(img['src'])
                for span in box('span'):
                    hero_name.append(span.string)
        except Exception:  # one bad page must not stop the whole crawl
            traceback.print_exc()
            continue

    # Download the collected images to disk.
    os.makedirs('O:/lol_hero_jpg', exist_ok=True)  # target dir may not exist yet
    total = len(hero_name)
    # zip truncates to the shortest list instead of raising IndexError when
    # a page yielded mismatched <p>/<img>/<span> counts.
    for i, (skin, name, img_url) in enumerate(zip(hero_skin_name, hero_name, hero_img_url)):
        try:
            path = 'O:/lol_hero_jpg/' + skin + '--' + name + '.jpg'
            # Download first, then open the file: the original opened the file
            # before the request, leaving an empty unclosed file on failure.
            r = requests.get(img_url, headers=hander, stream=True, timeout=30)
            with open(path, 'wb') as f:  # 'with' closes the file even on error
                f.write(r.content)
            # progress over the number of skins, not len(lst) (pages), which
            # made the percentage exceed 100%
            print("\r当前进度>>>>>>>>>>>>>>>>>>{:.0f}%>>>>>>>>>>>>>>>>>>".format(i * 100 / total), end="")
        except Exception:
            traceback.print_exc()
            continue
66
def main():
    """Crawl all listing pages, collect champion links, then download the skins."""
    hander = {"User-Agent": "Mozilla/5.0"}
    deep = 43  # number of listing pages to crawl
    links = []  # renamed: the original shadowed the builtin `list`
    for i in range(deep):
        try:
            url = "http://********/hero_" + str(1 + i) + ".shtml"
            html = get_url(url, hander)
            prasing_page(links, html)
        except Exception:  # was a silent bare except: at least report the failure
            traceback.print_exc()
            continue
    # Download once, after every page has been collected.  The original called
    # getUrl_prasingpag inside the loop with the cumulative list, re-downloading
    # all previously collected champions on every new page.
    getUrl_prasingpag(links, hander)


if __name__ == "__main__":  # don't run on import
    main()