Crawling Dynamic Web Pages with Python
Example: scraping the novel Sheng Xu (圣墟) from the Biquge (笔趣阁) site
1. Scrape the URLs of the novel's chapters
from bs4 import BeautifulSoup
from selenium import webdriver
import re

def book_url():
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy; note there must be no spaces around the = sign
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds for elements to appear
    driver.get('http://www.xbiquge.la/13/13959/')
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # The chapter list lives in <div id="list">; pull hrefs and titles out of its anchors
    chapter_html = str(soup.find_all('div', id='list'))
    url = re.findall('<a href="(.*)">', chapter_html)
    word = re.findall('<a.*>(.*)</a>', chapter_html)
    word_dict = dict(zip(word, url))  # chapter title -> chapter URL
    driver.quit()
    return word_dict
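To sanity-check step 1, a minimal usage sketch like the one below prints a few of the scraped title-to-URL pairs. It assumes the chapter list is still served inside <div id="list"> and that book_url() is defined as above.

# Minimal check of book_url(); assumes the site layout is unchanged.
if __name__ == '__main__':
    chapters = book_url()
    print(len(chapters), 'chapters found')
    # Show the first three title -> URL pairs
    for title, href in list(chapters.items())[:3]:
        print(title, '->', href)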
2. Scrape the first 200 chapters and write them to a txt file
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import codecs
import crawling.pro_2.py1 as py1

def url():
    # Collect every chapter URL gathered in step 1
    word_dict = py1.book_url()
    word = []
    for i in word_dict.values():
        word.append(i)
    return word

def book(url):
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy; there must be no spaces around the = sign,
    # i.e. not --proxy-server = http://202.20.16.82:10152
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds
    driver.get('http://www.xbiquge.la/' + url)
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # Strip the markup around the chapter body with a chain of substitutions
    a = str(soup.find_all('div', id='content'))
    a = re.sub(r'<div id="content">', '', a)
    a = re.sub(r'</p></div>', '', a)
    a = re.sub(r'\xa0', '', a)  # drop non-breaking spaces
    a = re.sub(r'<p><a href=', '', a)
    a = re.sub(r'target="_blank">', '', a)
    a = re.sub(r'</a>', '', a)
    line = a.split('<br/>')
    # The chapter title sits in <h1> inside <div class="bookname">
    name = re.findall('<h1>(.*)</h1', str(soup.find_all('div', class_='bookname')))
    name = re.sub("'", '', str(name))
    f = codecs.open('小说圣墟.txt', 'a', 'utf-8')
    # Drop entries that are only a newline character
    kong_list = []
    for j in line:
        if j == '\n':
            kong_list.append(j)
    for k in kong_list:
        line.remove(k)
    print(name, end='\n', file=f)
    for i in line:
        text = re.sub(r'\n', '', i)
        print(text, file=f)
    f.close()
    driver.quit()

if __name__ == '__main__':
    url_list = url()
    del url_list[200:]  # keep only the first 200 chapters
    for chapter in url_list:
        book(chapter)
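As an aside, the long chain of re.sub calls above can be replaced with BeautifulSoup's own text extraction. The sketch below is one possible alternative, assuming the chapter body still sits in <div id="content"> with <br/> line breaks; chapter_text is a hypothetical helper, not part of the original scripts.

from bs4 import BeautifulSoup

def chapter_text(page_source):
    # Alternative to the re.sub chain: let BeautifulSoup strip the tags.
    soup = BeautifulSoup(page_source, 'html.parser')
    content = soup.find('div', id='content')
    if content is None:
        return []  # layout changed or the chapter failed to load
    # get_text('\n') inserts a newline between text nodes (e.g. at each <br/>);
    # \xa0 padding is removed before splitting into lines
    lines = content.get_text('\n').replace('\xa0', '').split('\n')
    return [line.strip() for line in lines if line.strip()]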