Python Simple Crawler Examples
import requests url = "https://www.sogou.com/web" # 封装参数 wd = input(\'enter a word\') param = { \'query\':wd } response = requests.get(url=url,params=param) page_text = response.content fileName = wd+\'.html\' with open(fileName,\'wb\') as fp: fp.write(page_text) print(\'over\')
需求:爬去搜狗指定词条搜索后的页面数据
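The request above sends no User-Agent header, so Sogou may answer with an anti-crawler page instead of real results. A minimal sketch that identifies the client and fails loudly on HTTP errors (the UA string is just an example value):

import requests

headers = {
    # example UA string; any common browser User-Agent works
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}
response = requests.get('https://www.sogou.com/web',
                        params={'query': 'python'},
                        headers=headers)
response.raise_for_status()  # raise on 4xx/5xx instead of silently saving an error page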
import requests url = "https://fanyi.baidu.com/sug" wd = input(\'enter aword\') data = { \'kw\':wd } response = requests.post(url=url,data=data) print(response.json())
需求:抓取百度翻译
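The sug endpoint answers with JSON. A sketch that pulls the suggestion pairs out of the response, assuming the commonly observed shape {'errno': 0, 'data': [{'k': term, 'v': translation}, ...]} (verify against a live response before relying on it):

import requests

response = requests.post('https://fanyi.baidu.com/sug', data={'kw': 'dog'})
result = response.json()
# 'data', 'k' and 'v' are assumed key names from observed responses
for entry in result.get('data', []):
    print(entry['k'], '->', entry['v'])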
import requests url = "https://movie.douban.com/j/chart/top_list" param = { "type": "5", "interval_id": "100:90", "action": "", "start": "40", "limit": "100", } movie_data = requests.get(url=url,params=param).json() print(movie_data)
需求:抓取豆瓣电影分类https://movie.douban.com/排行榜中的电影详情数据
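The endpoint returns a JSON array of movie records. A sketch that prints one line per movie instead of dumping the whole payload; 'title' and 'score' are assumed key names, so inspect one record before relying on them:

import requests

url = "https://movie.douban.com/j/chart/top_list"
param = {"type": "5", "interval_id": "100:90", "action": "", "start": "0", "limit": "20"}
for movie in requests.get(url=url, params=param).json():
    print(movie.get('title'), movie.get('score'))  # assumed key names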
Requirement: query the KFC store locator at http://www.kfc.com.cn/kfccda/index.aspx for restaurants at a given location.

import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx'
wd = input('enter a keyword: ')
data = {
    "cname": '',
    "pid": '',
    "keyword": wd,
    "pageIndex": "1",
    "pageSize": "10",
}
response = requests.post(url=url, data=data).json()
print(response)
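The pageIndex/pageSize fields indicate the endpoint is paginated, so a single request only covers the first ten stores. A sketch that walks several pages (the page range and keyword are arbitrary examples):

import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx'
for page in range(1, 4):  # first three pages, 10 stores each
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',  # example keyword
        'pageIndex': str(page),
        'pageSize': '10',
    }
    print(requests.post(url=url, data=data).json())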
Requirement: crawl cosmetics production licence data from the National Medical Products Administration at http://125.35.6.84:81/xk/.

import requests

url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}

# collect the licence IDs from the first ten list pages
id_list = []
for page in range(1, 11):
    data = {
        "on": "true",
        "page": str(page),
        "pageSize": "15",
        "productName": '',
        "conditionType": "1",
        "applyname": '',
        "applysn": '',
    }
    json_data = requests.post(url=url, data=data, headers=headers).json()
    for dic in json_data["list"]:
        id_list.append(dic["ID"])

# fetch the detail record for each collected ID
detail_url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"
for _id in id_list:
    detail_data = {
        "id": _id
    }
    detail_json = requests.post(url=detail_url, data=detail_data, headers=headers).json()
    print(detail_json)
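Printing each detail record is fine for a demo, but the data is easier to reuse if it lands in a file. A sketch that collects the records and dumps them as one JSON document (id_list stands in for the IDs gathered by the listing loop above):

import json
import requests

detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
headers = {'User-Agent': 'Mozilla/5.0'}  # example UA string
id_list = []  # fill with the IDs collected by the listing loop

details = [
    requests.post(url=detail_url, data={'id': _id}, headers=headers).json()
    for _id in id_list
]
# ensure_ascii=False keeps the Chinese field values human-readable
with open('licences.json', 'w', encoding='utf-8') as fp:
    json.dump(details, fp, ensure_ascii=False, indent=2)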
Requirement: download and save every image from Qiushibaike's picture section.

import os
import re
import urllib.request

import requests

url = 'https://www.qiushibaike.com/pic/page/%d/?s=5170552'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')

start_page = int(input('enter a start pageNum: '))
end_page = int(input('enter an end pageNum: '))

for page in range(start_page, end_page + 1):
    new_url = url % page
    page_text = requests.get(url=new_url, headers=headers).text
    # pull every image URL out of the thumbnail divs
    img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', page_text, re.S)
    for img_url in img_url_list:
        img_url = 'https:' + img_url
        imgName = img_url.split('/')[-1]
        imgPath = 'qiutu/' + imgName
        urllib.request.urlretrieve(url=img_url, filename=imgPath)
        print(imgPath, 'downloaded!')
print('over!!!')
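urllib.request.urlretrieve sends its own default headers, so the image requests can get blocked even when the page request with the custom User-Agent succeeds. A sketch of the same download done through requests so the headers are reused (URL and filename are placeholders):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # reuse the same UA as the page request
img_url = 'https://example.com/some.jpg'  # placeholder for a scraped image URL
img_data = requests.get(img_url, headers=headers).content
with open('qiutu/some.jpg', 'wb') as fp:
    fp.write(img_data)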
import requests url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1564643415&di=423648f96f24460811fc7a39e23d29f8&imgtype=jpg&er=1&src=http%3A%2F%2Fimg1.replays.net%2Flol.replays.net%2Fuploads%2Fbody%2F2017%2F06%2F1496734520iBi.jpg" headers = { \'User-Agent\':"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" } img_data = requests.get(url=url,headers=headers).content with open(\'./kapai.jpg\',\'wb\') as fp: fp.write(img_data)
需求:爬取卡牌大师4k照片
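Reading response.content buffers the whole image in memory, which is fine for one picture but wasteful for large files. A minimal sketch of a streamed download with requests (the URL is a placeholder):

import requests

url = 'https://example.com/large.jpg'  # placeholder URL
with requests.get(url, stream=True) as response:
    response.raise_for_status()
    with open('./large.jpg', 'wb') as fp:
        # write 8 KB chunks as they arrive instead of buffering the whole body
        for chunk in response.iter_content(chunk_size=8192):
            fp.write(chunk)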