A Novel Scraper Written in Python
1. I wrote a simple web crawler
Phase 1 (to be refined in later updates)
```python
# Novel scraper: search biqukan and parse the result list
import requests
import random
from bs4 import BeautifulSoup

baseurl = "https://www.biqukan.com"
# These headers disguise the request as coming from a normal browser; it is
# usually best to send one. Header values can be copied from a browser's
# dev tools or found online.
header = [
    {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'},
]

# Send a GET request to the target site with a randomly chosen User-Agent
req = requests.get(baseurl + '/s.php?ie=gbk&s=2758772450457967865&q=一念永恒',
                   headers=header[random.randint(0, 4)])
# The page source declares charset=gbk, so the bytes must be decoded as gbk,
# otherwise the Chinese text turns into mojibake.
result = req.content.decode('gbk')
# print(result)  # uncomment to inspect the raw HTML

req_parser = BeautifulSoup(result, "html.parser")
bookbox = req_parser.find_all('div', class_='bookbox')

# Iterate over the matches and pull out the fields we want
for box in bookbox:
    # re-parse each element so its attribute values can be read off a fresh tree
    resu = BeautifulSoup(str(box), "html.parser")
    book_image = resu.find_all('img')[0].get('src')                          # cover image URL
    book_name = resu.h4.a.string                                             # book title
    book_author = resu.find('div', class_='author').string                   # author
    book_href = resu.h4.a.get('href')                                        # link to the book's page
    book_update_name = resu.find('div', class_='update').a.string            # title of the latest chapter
    book_update_name_href = resu.find('div', class_='update').a.get('href')  # link to the latest chapter
```
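A side note on the decoding step: requests can also do the gbk decoding itself if you set the response encoding before reading `.text`. Below is a minimal sketch of that variant, reusing the `baseurl` and `header` list from above; it is an alternative, not the code the rest of this post builds on.

```python
import random
import requests

# Same search request; instead of decoding req.content by hand,
# tell requests the page's encoding and let .text do the decoding.
req = requests.get(baseurl + '/s.php?ie=gbk&s=2758772450457967865&q=一念永恒',
                   headers=random.choice(header))  # random.choice reads a bit clearer than randint indexing
req.encoding = 'gbk'   # the page declares charset=gbk
result = req.text      # equivalent to req.content.decode('gbk')
```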
2. Phase 2 (the actual chapter content will be added later; for now this produces a search result list of books). If there is no book.txt file in the current directory it is created automatically; if one exists it is overwritten.
```python
# Novel scraper: fetch the full list of books matching a title or author search
import requests
import random
from bs4 import BeautifulSoup

# list that will hold the scraped books
book_list = []

# simple container object for one book's data
class Book:
    book_image = ''
    book_name = ''
    book_author = ''
    book_href = ''
    book_update_name = ''
    book_update_name_href = ''

    def tostring(self):
        return """
image URL=%s
title=%s
author=%s
book link=%s
latest chapter=%s
latest chapter link=%s""" % (self.book_image, self.book_name, self.book_author,
                             self.book_href, self.book_update_name,
                             self.book_update_name_href)

baseurl = "https://www.biqukan.com"
input_book_name = input("Enter a book title==>")
# These headers disguise the request as a browser; it is usually best to send one.
header = [
    {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'},
]

# Send a GET request with the search keyword and a random User-Agent
req = requests.get(baseurl + '/s.php?ie=gbk&s=2758772450457967865&q=' + input_book_name,
                   headers=header[random.randint(0, 4)])
# The page declares charset=gbk, so decode as gbk to avoid mojibake
result = req.content.decode('gbk')
# print(result)  # uncomment to inspect the raw HTML

req_parser = BeautifulSoup(result, "html.parser")
bookbox = req_parser.find_all('div', class_='bookbox')

# Iterate over the matches, fill a Book object for each, and collect them
for box in bookbox:
    resu = BeautifulSoup(str(box), "html.parser")
    book_image = resu.find_all('img')[0].get('src')                          # cover image URL
    book_name = resu.h4.a.string                                             # book title
    book_author = resu.find('div', class_='author').string                   # author
    book_href = resu.h4.a.get('href')                                        # link to the book's page
    book_update_name = resu.find('div', class_='update').a.string            # title of the latest chapter
    book_update_name_href = resu.find('div', class_='update').a.get('href')  # link to the latest chapter

    book = Book()
    book.book_image = baseurl + book_image
    book.book_name = book_name
    book.book_author = book_author
    book.book_href = baseurl + book_href
    book.book_update_name = book_update_name
    book.book_update_name_href = baseurl + book_update_name_href
    book_list.append(book)

# w+ creates book.txt if it is missing and truncates (overwrites) it if it exists
with open('./book.txt', 'w+') as fw:
    for i in book_list:
        fw.write("\n")
        fw.write("***********************************************************************************************")
        fw.write("\n")
        fw.write(i.tostring())
```
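As a preview of the planned next phase (downloading actual chapter text), the same request-and-parse pattern extends to a chapter page. The sketch below is only an outline: `showtxt` is an assumption about the class name of biqukan's chapter-content `div`, and must be checked against the real page markup before relying on it. It reuses the `header` list defined above.

```python
import random
import requests
from bs4 import BeautifulSoup

def fetch_chapter(chapter_url):
    """Download one chapter page and return its plain text (sketch only)."""
    req = requests.get(chapter_url, headers=random.choice(header))
    html = req.content.decode('gbk')  # the site serves gbk-encoded pages
    parser = BeautifulSoup(html, "html.parser")
    # ASSUMPTION: the chapter text lives in <div class="showtxt">;
    # verify this selector against the real site before use
    content_div = parser.find('div', class_='showtxt')
    if content_div is None:
        return ''  # selector did not match; the markup assumption was wrong
    return content_div.get_text('\n', strip=True)  # tag text, one line per text node

# usage: pass one of the links collected above, e.g.
# print(fetch_chapter(book_list[0].book_update_name_href))
```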
Copyright notice: this is an original article by chengyangyang, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.