# coding:utf-8
# 爬取58同城二手电脑信息
# 进入http://bj.58.com/pbdn/0/pn2/页面
# 爬取列表中除转转、推广商品外的正常商品
  5. from bs4 import BeautifulSoup
  6. import requests
  7. import time
  8. def get_links_from(who_sells): # 爬取列表中除转转、推广商品外的正常商品爬取列表中除转转、推广商品外的正常商品的连接
  9. urls = []
  10. list_view = \'http://bj.58.com/pbdn/{}/pn2/\'.format(str(who_sells))
  11. wb_data = requests.get(list_view)
  12. soup = BeautifulSoup(wb_data.text, \'lxml\')
  13. # 通过对页面分析 发现商品链接在 tr > td.t > a.t 中
  14. for link in soup.select(\'tr td.t a.t\'):
  15. if len(link.get(\'href\').split(\'?\')[0]) == 53: # 因为转转商品也符合 tr > td.t > a.t,要排除,观察发现正常商品链接
  16. # 的长度为53, 可通过字符串长度筛选去正常的连接
  17. urls.append(link.get(\'href\').split(\'?\')[0])
  18. return urls
  19. def get_views(url):
  20. id = url.split(\'/\')[-1].strip(\'x.shtml\')
  21. api = \'http://jst1.58.com/counter?infoid={}\'.format(id)
  22. js = requests.get(api)
  23. views = js.text.split(\'=\')[-1]
  24. return views
  25. def get_item_info(who_sells=0): #
  26. urls = get_links_from(who_sells)
  27. for url in urls:
  28. time.sleep(2)
  29. web_data = requests.get(url)
  30. soup = BeautifulSoup(web_data.text, \'lxml\')
  31. data = {
  32. \'title\': soup.title.text,
  33. \'price\': soup.find_all(\'span\', \'price c_f50\')[0].text,
  34. \'area\': list(soup.select(\'.c_25d\')[0].stripped_strings) if soup.find_all(\'span\',\'c_25d\') else None,
  35. \'date\': soup.select(\'.time\')[0].text,
  36. \'cate\': \'个人\' if who_sells == 0 else \'商家\',
  37. \'views\': get_views(url)
  38. }
  39. print(data)
  40. get_item_info()

版权声明:本文为november1943原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/november1943/p/5242490.html