使用 BeautifulSoup 模块爬取图片:学习 HTML 文本解析与标签定位
网上的教程大多以爬取 mzitu 为例,但该网站的反爬限制较多,所以这里随意找了另一个网址来练习;该网站的解析速度有些慢。
脚本流程:从首页获取总页数 --> 拼接每一页的 URL --> 获取每页中所有主题的 URL --> 遍历主题页中的图片源 URL,逐个下载并保存
  1 #python3
  2 #coding:utf-8_
  3 #_author: Jack
  4 #_date: 2020/3/28
  5 
  6 from bs4 import BeautifulSoup
  7 import requests,os,sys,time
  8 
  9 DIR_PATH = os.path.dirname(os.path.abspath(__file__))
 10 sys.path.append(DIR_PATH)
 11 
 12 
 13 HEADER = {
 14         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:74.0) Gecko/20100101 Firefox/74.0',
 15        }
 16 
 17 def create_dir(file_path):
 18     '''
 19     :param file_path: images_directory
 20     :return:
 21     '''
 22     if not os.path.exists(file_path):
 23         os.mkdir(file_path)
 24         print('Creatr directory:',file_path)
 25     os.chdir(file_path) # cd ..
 26 
 27 def save_data(src,dir_name,file_name):
 28     '''
 29     :param src: images url
 30     :param sum: directory name
 31     :param file_name: image name
 32     :return:
 33     '''
 34     file_path = os.path.join(DIR_PATH,'images',str(dir_name))  #directory path
 35     image_path = os.path.join(file_path,file_name)  #images path
 36     create_dir(file_path)
 37 
 38     if not os.path.isfile(image_path):
 39         req = requests.get(src,headers=HEADER)
 40         with open(image_path, 'wb') as f_save:
 41             f_save.write(req.content)
 42             print('Download successful:',file_name)
 43             f_save.flush()
 44     else:
 45         print('File already exists! Pass')
 46 
 47 def request_to_url(url,header):
 48     '''
 49     :param url: page_url
 50     :param head: request.header
 51     :return: respond.text
 52     '''
 53     res = requests.get(url,headers=header)
 54     return res.text
 55 
 56 def soup(url,header):
 57     '''
 58     :param url:
 59     :param header:
 60     :return: HTML_Tag
 61     '''
 62     return BeautifulSoup(request_to_url(url,header),'html.parser')
 63 
 64 def action(url):
 65     '''
 66     Download a count of 100 images and create a new folder
 67     :param url: URL
 68     :return:
 69     '''
 70     download_count = 0
 71     dir_name =100
 72     try:
 73         page_tag = soup(url,HEADER).find('div',class_='pg').find_all('a')
 74         max_page = int(page_tag[-2].text.split(' ')[-1])
 75 
 76         for i in range(1,max_page+1):   #find page
 77             page_url = os.path.join(url,'forum.php?order=&fid=0&page=%d'%i)
 78             #time.sleep(1)
 79             page_all_theme_list = soup(page_url,HEADER).find('div',class_='kind_show')
 80             theme_list = page_all_theme_list.find_all('div', class_='photo_thumb kind_left')
 81 
 82             for i in theme_list:    #find theme
 83                 theme = i.find('div', class_='title').find('a')
 84                 #title = theme.string
 85                 img_url = theme.get('href')
 86                 print("Ready download: %s" % theme.string,img_url)
 87                 # time.sleep(1)
 88                 img_page_tag = soup(img_url,HEADER).find('td',class_='t_f').find_all('img')
 89 
 90                 for i in img_page_tag:  #find image
 91                     try:
 92                         img_src = i.get('src')
 93                         if isinstance(download_count %100,float):
 94                             dir_name +=100
 95                         save_data(img_src,dir_name,img_src.split('/')[-1])
 96                         download_count += 1
 97                         print('Download successful: %d' %download_count)
 98 
 99                     except Exception as e:
100                         print('Img_tag & Save_data Error:',e)
101                         continue
102 
103     except Exception as e:
104         print('The trunk Error:',e)
105 
if __name__ == '__main__':
    # Script entry point: crawl the site rooted at URL, printing a banner
    # before the crawl starts and another once it returns.
    URL = 'http://www.lesb.cc/'
    print('Run.....')
    action(URL)
    print('Perform !')

 

 

版权声明:本文为jackron原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/jackron/p/12593975.html