【Python爬虫基础】抓取知乎页面所有图片
抓取指定地址页面中的所有图片
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Download every answer image from one Zhihu question into ./images/.

Pages through Zhihu's QuestionAnswerListV2 endpoint `page_size` answers
at a time and saves each full-size image found in the answer HTML.
Python 2 script (urllib2 / urlparse).
"""
from urlparse import urlsplit
from os.path import basename
import urllib2
import re
import requests
import os
import json

url = 'https://www.zhihu.com/question/37787176'

if not os.path.exists('images'):
    os.mkdir("images")

print("start>>>>>>>")

page_size = 50
offset = 0
url_content = urllib2.urlopen(url).read()
# The total answer count is embedded in an <h3 data-num="..."> attribute.
answers = re.findall('h3 data-num="(.*?)"', url_content)
limits = int(answers[0])

while offset < limits:
    post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
    params = json.dumps({
        'url_token': 37787176,
        'pagesize': page_size,
        'offset': offset
    })
    data = {
        '_xsrf': '',  # NOTE(review): empty _xsrf token may be rejected; fill from cookies if needed
        'method': 'next',
        'params': params
    }
    header = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': url
    }
    response = requests.post(post_url, data=data, headers=header)
    answer_list = response.json()["msg"]
    # "_b" in the URL marks the big (full-size) variant of each image.
    img_urls = re.findall('img .*?src="(.*?_b.*?)"', ''.join(answer_list))
    for img_url in img_urls:
        try:
            img_data = urllib2.urlopen(img_url).read()
            file_name = basename(urlsplit(img_url)[2])
            print(file_name)
            # with-statement guarantees the handle is closed even if write fails
            with open('images/' + file_name, 'wb') as output:
                output.write(img_data)
        except Exception as e:
            # One bad image must not abort the crawl, but the old bare
            # "except: pass" hid every failure -- log it instead.
            print("failed to fetch %s: %s" % (img_url, e))
    offset += page_size

print("end>>>>>>>")
正则抓取网页title
- #!/usr/bin/python
- # coding:utf-8
- import httplib2
- import urllib2
- import re #正则表达式模块
- class PageClass:
- #获取指定url的网页内容
- def get_page(self,url,headers):
- http=httplib2.Http()
- response,content=http.request(url,\'GET\',headers=headers)
- return content.decode(\'utf-8\')
- def main():
- headers={"cookie":\'your cookie\'}
- url = \'http://v.ktgj.com\'
- #print headers
- page = PageClass()
- content = page.get_page(url,headers)
- return content
- if __name__ == "__main__":
- htmltext = main()
- pattern = re.compile(r\'<title>(.*?)</title>\')
- match = pattern.match(htmltext)
- if match:
- print match.group()
- print htmltext
下载网页图片
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Download every <img> referenced by a fixed web page into ./images/.

Python 2 script (urllib2 / urlparse); fetches the page with requests,
extracts img src attributes with a regex, and saves each image with a
timestamped progress line.
"""
from urlparse import urlsplit
from os.path import basename
import urllib2
import re
import requests
import os
import json
import datetime

if not os.path.exists('images'):
    os.mkdir("images")

print("start>>>>>>>>>>>>>>>>>>>>>>>")

url = "http://www.ssff66.com/se/jingpintaotu/519271.html"
response = requests.get(url)
img_urls = re.findall('img .*?src="(.*?)"', response.text)

for img_url in img_urls:
    try:
        # 5s timeout so one dead image host cannot hang the whole run
        img_data = urllib2.urlopen(img_url, timeout=5).read()
        file_name = basename(urlsplit(img_url)[2])
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " " + file_name)
        # with-statement guarantees the handle is closed even if write fails
        with open('images/' + file_name, 'wb') as output:
            output.write(img_data)
    except Exception as e:
        # BUG FIX: e.message is deprecated and empty for many exception
        # types; str(e) always yields the message. Keep going on errors.
        print("error : " + str(e))

print("end>>>>>>>>>>>>>>>>>>>>>>>")
版权声明:本文为jhli原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。