Scraping a few images with Python for personal use (web crawler)
```python
import os
import re

import requests


def get_url(page, headers):
    # Fetch one listing page and walk every album it links to.
    url = 'http://www.zbjuran.com/mei/xinggan/list_13_%s.html' % page
    data = requests.get(url, headers=headers).text
    data_use = re.findall('<div class="name"><a target="_blank" href=".*?" title=".*?</a></div>', data)
    for use in data_use:
        # Pull the album link and title out of the matched <div class="name"> block.
        link = 'http://www.zbjuran.com/' + use.split('href="')[1].split('" title')[0]
        links.append(link)
        title = use.split('title="')[1].split('">')[0]
        titles.append(title)
        mkpath = '/Users/b1ancheng/mzpc/%s' % title

        def get_pic():
            url_data = requests.get(link).text
            print(link)
            try:
                # The pagination bar reads "共N页:" (N pages in total); grab N.
                link_page = int(url_data.split('<div class="page"><li><a>共')[1].split('页:')[0])
                for i in range(1, link_page + 1):
                    print('Downloading page %s' % i)
                    try:
                        # Page i of an album lives at <album>_<i>.html.
                        pic_url = (link[:-5] + '_%s' + link[-5:]) % i
                        print(pic_url)
                        try:
                            pic_data_link = 'http://www.zbjuran.com' + requests.get(pic_url, headers=headers).text.split('<img alt="" src="')[1].split('" /></div>')[0]
                        except Exception as otherdown:
                            # Some pages omit the alt attribute; fall back to a plain <img src="...">.
                            print(otherdown)
                            pic_data_link = 'http://www.zbjuran.com' + requests.get(pic_url, headers=headers).text.split('<img src="')[1].split('" /></div>')[0]
                        with open('/Users/b1ancheng/mzpc/%s/%s_%s.JPG' % (title, title, i), 'wb') as pic_download:
                            pic_download.write(requests.get(pic_data_link).content)
                    except Exception as error:
                        print(error)
            except Exception as e1:
                # No page count found: report it and drop the empty album directory.
                print(e1)
                os.rmdir(mkpath)

        # Create the album directory (could be moved into get_pic).
        if not os.path.exists(mkpath):
            os.makedirs(mkpath)
            get_pic()
        else:
            # Album folder already exists; skip to the next one.
            continue


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Host': 'www.zbjuran.com',
        'Cookie': 'UM_distinctid=15ef9964528386-07264d76850875-31657c00-13c680-15ef9964529361; CNZZDATA1264461841=1179231757-1507422986-null%7C1508056601'
    }
    links = []
    titles = []
    for page in range(1, 88):
        get_url(page, headers=headers)
```
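The listing-page parse above chains `split()` calls on raw HTML. If the markup really matches the pattern passed to `re.findall`, capture groups can pull the link and title out in one pass. A minimal sketch under that assumption; the `parse_list_page` name and the `timeout` argument are my own additions, and `headers` is the same dict built in `__main__`:

```python
import re

import requests

LIST_URL = 'http://www.zbjuran.com/mei/xinggan/list_13_%s.html'


def parse_list_page(page, headers):
    # Return (album_link, title) pairs from one listing page, assuming the
    # same markup the script above splits apart:
    #   <div class="name"><a target="_blank" href="..." title="...">...</a></div>
    html = requests.get(LIST_URL % page, headers=headers, timeout=10).text
    pattern = re.compile(r'<div class="name"><a target="_blank" href="(.*?)" title="(.*?)">')
    return [('http://www.zbjuran.com/' + href, title)
            for href, title in pattern.findall(html)]
```

Iterating over `parse_list_page(1, headers)` would then replace the manual `split()` bookkeeping in `get_url`, since each element is already an `(album_link, title)` pair.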
Comments and suggestions are welcome; let's improve together.
Copyright notice: this is an original article by b1ancheng, licensed under CC 4.0 BY-SA. Please include a link to the original and this notice when reposting.