Scraping comics from 哦漫画 (omanhua.com) with Python
```python
import os
import posixpath

import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver


# Parse a single comic page and download its image
def manhua(url):
    browser.get(url)
    # Grab the page source as rendered by the simulated browser
    html = browser.page_source

    html = etree.HTML(html)
    img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
    alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]
    title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]
    print(img_url, alt, title)

    path = './漫画/' + alt + '/' + title + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = img_url.split('/')[-1]
    print(os.path.join(path, fname))

    # Request the image URL
    response = requests.get(img_url)
    # Raw binary content of the image
    data = response.content
    # Save the file
    with open(path + fname, 'wb') as f:
        f.write(data)


# Parse a chapter and build the links of all its pages
def manhua_url(url):
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = etree.HTML(html)
    # i is the chapter's page count (strip the surrounding brackets)
    i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
    i = int(i)
    # The pagination follows a fixed pattern, so build the page URLs with format()
    url = url + '/index.html?p={}'
    for n in range(1, i + 1):
        fullurl = url.format(n)
        print(fullurl)
        # fullurl is the link of one comic page
        manhua(fullurl)


# Parse a comic's chapter list page
# (renamed from `list` so it no longer shadows the built-in)
def chapter_list(lb_url):
    response = requests.get(lb_url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = BeautifulSoup(html, 'lxml')
    # Match all chapter links
    url_list = html.select('div.subBookList ul li')
    for url in url_list:
        url = url.select('a')[0].get('href').split('/')[-2]
        # posixpath.join always joins with '/', unlike os.path.join on Windows
        fullurl = posixpath.join(lb_url, url)
        print(fullurl)
        # Chapter link
        manhua_url(fullurl)


# Parse the home page
def shouye():
    # Home page URL
    base_url = 'http://www.omanhua.com/'
    # Send the request
    response = requests.get(base_url)
    # Decode with the detected encoding
    response.encoding = response.apparent_encoding
    # Returned page
    html = response.text
    # Parse
    html = BeautifulSoup(html, 'lxml')
    # Match the links of the "hottest comics" block
    url_list = html.select('ul#cartoon_image_show1 li')
    for url in url_list:
        url = url.select('a')[0].get('href')[1:]
        # Build the full comic URL
        fullurl = posixpath.join(base_url, url)
        print(fullurl)
        chapter_list(fullurl)


if __name__ == '__main__':
    # Drive a real browser (Chrome here) with the test-automation module selenium:
    # the comic images are lazy-loaded, so plain requests cannot see the image URL.
    # The path below is the local chromedriver path.
    browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    shouye()
```
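Because the image is lazy-loaded, grabbing `page_source` immediately after `browser.get()` can race against the page. An explicit wait on the `mangaFile` element would make `manhua()` more robust. A minimal sketch, assuming the element id stays `mangaFile`; `wait_for_image` is a hypothetical helper name, not part of the original script:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_image(browser, timeout=10):
    # Block until <img id="mangaFile"> exists in the DOM, then return its src.
    img = WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.ID, 'mangaFile'))
    )
    return img.get_attribute('src')
```

`manhua()` could call `wait_for_image(browser)` right after `browser.get(url)` instead of relying on the page being fully rendered by the time `page_source` is read.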
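One subtle point in the link building: `os.path.join` uses the platform's path separator, so on Windows it can splice a `\` into a URL. That is why the rewrite above uses `posixpath.join`, which always joins with `/`; `urllib.parse.urljoin` from the standard library is another option. A small demo (the path `comic/123` is made up for illustration):

```python
import posixpath
from urllib.parse import urljoin

base = 'http://www.omanhua.com'
# Always '/' regardless of platform:
print(posixpath.join(base, 'comic/123'))   # http://www.omanhua.com/comic/123
# urljoin needs a trailing slash on the base to append rather than replace:
print(urljoin(base + '/', 'comic/123'))    # http://www.omanhua.com/comic/123
```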
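Note that `executable_path` only works on Selenium 3; Selenium 4 removed it and passes the driver path through a `Service` object instead. A sketch of the Selenium 4 style, reusing the chromedriver path from the original post:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the chromedriver path in a Service object.
service = Service(r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
browser = webdriver.Chrome(service=service)
```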
I have only been teaching myself web scraping for a short while, so the code may be a bit clumsy. I hope we can learn from each other and improve together.
Copyright notice: this is an original article by lyxdw, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.