Writing a Simple Web Crawler in Python: Example 3

by:授客 QQ1033553122

Experiment environment

Python version: 3.3.5 (the script raises errors under Python 2.7)
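The 2.7 failures are most likely just the imports: urllib.request and html.parser are Python 3 module paths that do not exist in Python 2, where the same classes live in urllib2 and HTMLParser. Roughly (the Python 2 lines are shown only for comparison and are not used anywhere in this article):

# Python 3 (used by this script)               # Python 2.7 rough equivalent
from urllib.request import Request, urlopen    # from urllib2 import Request, urlopen
from html.parser import HTMLParser             # from HTMLParser import HTMLParser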

 

Experiment goal

Extract a specific kind of url from the target site "http://bbs.51testing.com/forum.php". Analysis shows the following relationship between the target url and the other urls on the site:

(Figure: relationship between the target url and the other urls; the original diagram is not reproduced here)

The target urls sit inside the posts on the sub-pages, scattered at random; the crawler's job is to find them.
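Concretely, the links we are after look like treasure.php?trenum=NNNNN links buried in the post bodies. A minimal sketch of the matching step, using the same regular expression the script applies later (the sample_html string below is made up purely for illustration):

import re

# a made-up fragment of a post page, just to show what the pattern picks out
sample_html = '<a href="http://bbs.51testing.com/treasure/treasure.php?trenum=12345">treasure</a>'

pattern = "http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]{5}"
print(re.findall(pattern, sample_html))
# expected: ['http://bbs.51testing.com/treasure/treasure.php?trenum=12345']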

 

Python script

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from urllib.request import Request, urlopen
import gzip, re
from io import BytesIO
from html.parser import HTMLParser


# crawler class
class Reptile:
    """to download web pages"""

    def __init__(self):
        self.url_set = set()  # urls of pages that have already been downloaded
        self.data = ""

    # download a page
    def get_page(self, url, headers):
        request = Request(url, headers=headers)
        request.add_header('Accept-encoding', 'gzip')  # ask for a gzip-compressed page to cut network traffic

        try:
            response = urlopen(request)  # send the request
            if response.code == 200:  # request succeeded
                page = response.read()  # read the (possibly compressed) page

                if response.info().get("Content-Encoding") == "gzip":
                    page_data = BytesIO(page)
                    gzipper = gzip.GzipFile(fileobj=page_data)
                    self.data = gzipper.read()
                else:
                    print("gzip unused")
                    self.data = page  # page was not gzip-compressed, use it as is
        except Exception:
            pass

        self.url_set.add(url)

        return self.data

 

   
    # get the urls of the target forum boards
    def get_forum_url(self, url_set, home, include):
        forum_url_set = set()  # holds the board urls
        while len(url_set) > 0:
            url = url_set.pop()
            if re.findall(include, url):
                # board urls usually look like forum-53-1.html
                url = home + url
                forum_url_set.add(url)
        return forum_url_set

    # get the thread urls under a board url
    def get_title_url(self, url_set, home, include):
        title_url_set = set()  # holds the thread urls
        while len(url_set) > 0:
            url = url_set.pop()
            if re.findall(include, url):
                # thread urls usually look like thread-1044711-1-1.html
                url = home + url
                title_url_set.add(url)
        return title_url_set

 

# parser class
class MyHtmlParser(HTMLParser):
    def reset(self):
        HTMLParser.reset(self)  # note: the parent reset must run first
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        url_list = [value for key, value in attrs if "href" == key]
        if url_list:
            for url in url_list:
                self.url_set.add(url)
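# A minimal sanity check of the parser, left commented out and purely illustrative:
# MyHtmlParser simply collects every href attribute value into url_set.
#     p = MyHtmlParser(strict=False)
#     p.feed('<a href="forum-53-1.html">board</a>')
#     print(p.url_set)   # expected: {'forum-53-1.html'}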

         

   

############## test ################
# add a User-Agent header to impersonate a browser, since some sites reject obvious crawlers
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0"}

init_url = "http://bbs.51testing.com/forum.php"

# build the parser
parser = MyHtmlParser(strict=False)

# download the entry page
page_number = 1
print("program is downloading the first url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)

print("processing the %dth url page" % page_number)
# parse the page (collect its urls)
parser.feed(str(page))

 

# get the urls of the category boards
home = "http://bbs.51testing.com/"
# the pattern is split across several variables only to keep the lines short
pattern1 = "forum-122-[1-9]|forum-243-[1-9]|forum-40-[1-9]|forum-63-[1-9]"
pattern2 = "|forum-42-[1-9]|forum-53-[1-9]|forum-275-[1-9]|forum-140-[1-9]"
pattern3 = "|forum-138-[1-9]|forum-139-[1-9]|forum-141-[1-9]"
pattern = pattern1 + pattern2 + pattern3
include = re.compile(pattern)

forum_url_set = reptile.get_forum_url(parser.url_set, home, include)

 

# loop over the category boards and collect the urls of sub-pages 1-10 under each (the pagination pages)
result_url_set = set()
forum_index = 1
for forum_url in forum_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(str(page))

    print("getting the board urls in the %dth forum page" % forum_index)
    tmp_url_set = reptile.get_forum_url(parser.url_set, home, include)
    forum_index = forum_index + 1

    result_url_set = result_url_set ^ tmp_url_set
 

title_url_set = set()
forum_index = 1
title_index = 1
for forum_url in result_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(str(page))

    # get the thread urls under this board
    pattern1 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{1}[.]html|"
    pattern2 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{1}[.]html|"
    pattern3 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{2}[.]html|"
    pattern4 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{2}[.]html"
    pattern = pattern1 + pattern2 + pattern3 + pattern4
    include = re.compile(pattern)

    print("getting all title urls in the %dth forum board" % forum_index)
    tmp_url_set = reptile.get_title_url(parser.url_set, home, include)
    forum_index = forum_index + 1

    title_url_set = title_url_set ^ tmp_url_set
 

  

# extract the target urls
target_index = 1
title_index = 1
filepath = "d:/url.txt"
for title_url in title_url_set:
    print("processing the %dth title url" % title_index)
    page = reptile.get_page(title_url, headers)
    parser.feed(str(page))

    # save the target urls
    with open(filepath, "a") as f:
        while len(parser.url_set) > 0:
            url = parser.url_set.pop()
            pattern = "http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]{5}"
            include = re.compile(pattern)
            flag = re.findall(include, url)
            if flag:
                print("find target! saving the %dth target url in the %dth title page" % (target_index, title_index))
                f.write("the %dth url: %s" % (target_index, url))
                target_index = target_index + 1
                f.write("\n")
    title_index = title_index + 1

print("complete")

 


Result:

 

(Screenshots of the console output and of the saved d:/url.txt are not reproduced here.)
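One small design note: Reptile.url_set records every url that get_page() has already fetched, but the script never consults it before downloading, so a page reachable from several boards can be fetched more than once. A minimal sketch of how that bookkeeping could be used to skip repeat downloads (the helper name get_page_once is made up for illustration; it assumes the reptile and headers objects from the script above):

def get_page_once(reptile, url, headers):
    # skip urls that have already been downloaded; get_page() itself records new ones in url_set
    if url in reptile.url_set:
        return None
    return reptile.get_page(url, headers)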


Disclaimer: for learning and research use only; do not use it for any unlawful purpose.

 

Copyright notice: this is an original article by shouke, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/shouke/p/10157942.html