Crawling Dynamic Web Pages with Python
Example: scraping the novel Sheng Xu (圣墟) from the Biquge (笔趣阁) site
1. Scrape the URLs of the novel's chapters
from bs4 import BeautifulSoup
from selenium import webdriver
import re

def book_url():
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy; note there must be no spaces around the = sign
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds for elements to appear
    driver.get('http://www.xbiquge.la/13/13959/')
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # The chapter list lives in <div id="list">; pull hrefs and titles out of its anchors
    chapter_html = str(soup.find_all('div', id='list'))
    url = re.findall('<a href="(.*)">', chapter_html)
    word = re.findall('<a.*>(.*)</a>', chapter_html)
    word_dict = dict(zip(word, url))  # chapter title -> chapter URL
    driver.quit()
    return word_dict
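To sanity-check step 1, a minimal usage sketch like the one below prints a few of the scraped title-to-URL pairs. It assumes the chapter list is still served inside <div id="list"> and that book_url() is defined as above.

# Minimal check of book_url(); assumes the site layout is unchanged.
if __name__ == '__main__':
    chapters = book_url()
    print(len(chapters), 'chapters found')
    # Show the first three title -> URL pairs
    for title, href in list(chapters.items())[:3]:
        print(title, '->', href)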
2. Scrape the first 200 chapters and write them to a txt file
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import codecs
import crawling.pro_2.py1 as py1

def url():
    # Collect every chapter URL gathered in step 1
    word_dict = py1.book_url()
    word = []
    for i in word_dict.values():
        word.append(i)
    return word

def book(url):
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy; there must be no spaces around the = sign,
    # i.e. not --proxy-server = http://202.20.16.82:10152
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds
    driver.get('http://www.xbiquge.la/' + url)
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # Strip the markup around the chapter body with a chain of substitutions
    a = str(soup.find_all('div', id='content'))
    a = re.sub(r'<div id="content">', '', a)
    a = re.sub(r'</p></div>', '', a)
    a = re.sub(r'\xa0', '', a)  # drop non-breaking spaces
    a = re.sub(r'<p><a href=', '', a)
    a = re.sub(r'target="_blank">', '', a)
    a = re.sub(r'</a>', '', a)
    line = a.split('<br/>')
    # The chapter title sits in <h1> inside <div class="bookname">
    name = re.findall('<h1>(.*)</h1', str(soup.find_all('div', class_='bookname')))
    name = re.sub("'", '', str(name))
    f = codecs.open('小说圣墟.txt', 'a', 'utf-8')
    # Drop entries that are only a newline character
    kong_list = []
    for j in line:
        if j == '\n':
            kong_list.append(j)
    for k in kong_list:
        line.remove(k)
    print(name, end='\n', file=f)
    for i in line:
        text = re.sub(r'\n', '', i)
        print(text, file=f)
    f.close()
    driver.quit()

if __name__ == '__main__':
    url_list = url()
    del url_list[200:]  # keep only the first 200 chapters
    for chapter in url_list:
        book(chapter)
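As an aside, the long chain of re.sub calls above can be replaced with BeautifulSoup's own text extraction. The sketch below is one possible alternative, assuming the chapter body still sits in <div id="content"> with <br/> line breaks; chapter_text is a hypothetical helper, not part of the original scripts.

from bs4 import BeautifulSoup

def chapter_text(page_source):
    # Alternative to the re.sub chain: let BeautifulSoup strip the tags.
    soup = BeautifulSoup(page_source, 'html.parser')
    content = soup.find('div', id='content')
    if content is None:
        return []  # layout changed or the chapter failed to load
    # get_text('\n') inserts a newline between text nodes (e.g. at each <br/>);
    # \xa0 padding is removed before splitting into lines
    lines = content.get_text('\n').replace('\xa0', '').split('\n')
    return [line.strip() for line in lines if line.strip()]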