Scrape all images from a URL

    #! /usr/bin/env python
    # -*- coding: utf-8 -*-
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import json

    url = 'https://www.zhihu.com/question/37787176'

    if not os.path.exists('images'):
        os.mkdir('images')

    print("start>>>>>>>")

    page_size = 50
    offset = 0
    url_content = urllib2.urlopen(url).read()
    # The total answer count is embedded in an h3 tag as data-num="..."
    answers = re.findall('h3 data-num="(.*?)"', url_content)
    limits = int(answers[0])

    while offset < limits:
        post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
        params = json.dumps({
            'url_token': 37787176,
            'pagesize': page_size,
            'offset': offset
        })
        data = {
            '_xsrf': '',
            'method': 'next',
            'params': params
        }
        header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
            'Host': "www.zhihu.com",
            'Referer': url
        }
        response = requests.post(post_url, data=data, headers=header)
        answer_list = response.json()["msg"]
        # Keep only the large-size images (their file names contain "_b")
        img_urls = re.findall('img .*?src="(.*?_b.*?)"', ''.join(answer_list))
        for img_url in img_urls:
            try:
                img_data = urllib2.urlopen(img_url).read()
                file_name = basename(urlsplit(img_url)[2])
                print(file_name)
                output = open('images/' + file_name, 'wb')
                output.write(img_data)
                output.close()
            except:
                # Skip images that fail to download
                pass
        offset += page_size

    print("end>>>>>>>")

Extract a page title with a regular expression

    #!/usr/bin/python
    # coding:utf-8
    import httplib2
    import re  # regular expression module

    class PageClass:
        # Fetch the content of the given URL
        def get_page(self, url, headers):
            http = httplib2.Http()
            response, content = http.request(url, 'GET', headers=headers)
            return content.decode('utf-8')

    def main():
        headers = {"cookie": 'your cookie'}
        url = 'http://v.ktgj.com'
        page = PageClass()
        content = page.get_page(url, headers)
        return content

    if __name__ == "__main__":
        htmltext = main()
        pattern = re.compile(r'<title>(.*?)</title>')
        # search() scans the whole document; match() would only succeed if
        # the <title> tag appeared at the very start of the string.
        match = pattern.search(htmltext)
        if match:
            print(match.group(1))
        print(htmltext)
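A note on the regex: `pattern.match()` only matches at the beginning of the string, so it would never find a `<title>` tag buried in the middle of the HTML; that is why the script uses `search()`. As a small variant, here is a hedged sketch that does the same title extraction with requests instead of httplib2. The URL and cookie header are placeholders carried over from the script above, and the IGNORECASE/DOTALL flags are an extra assumption to tolerate uppercase tags or multi-line titles.

    import re
    import requests

    def get_title(url, headers=None):
        # Fetch the page and pull the <title> text out with a regex.
        html = requests.get(url, headers=headers, timeout=5).text
        match = re.search(r'<title>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
        return match.group(1).strip() if match else None

    if __name__ == "__main__":
        # 'your cookie' is a placeholder, as in the script above.
        print(get_title('http://v.ktgj.com', headers={'cookie': 'your cookie'}))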

Download images from a web page

    #! /usr/bin/env python
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import datetime

    if not os.path.exists('images'):
        os.mkdir('images')

    print("start>>>>>>>>>>>>>>>>>>>>>>>")

    url = "http://www.ssff66.com/se/jingpintaotu/519271.html"
    response = requests.get(url)
    # Collect the src attribute of every img tag on the page
    img_urls = re.findall('img .*?src="(.*?)"', response.text)

    for img_url in img_urls:
        try:
            img_data = urllib2.urlopen(img_url, timeout=5).read()
            file_name = basename(urlsplit(img_url)[2])
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " " + file_name)
            output = open('images/' + file_name, 'wb')
            output.write(img_data)
            output.close()
        except Exception, e:
            # Log the failure and continue with the next image
            print("error : " + str(e))

    print("end>>>>>>>>>>>>>>>>>>>>>>>")


Copyright notice: this is an original article by jhli, released under the CC 4.0 BY-SA license. When reposting, please include the original source link and this notice.
Original link: https://www.cnblogs.com/jhli/p/5915329.html