python爬虫学习之爬取全国各省市县级城市邮政编码
实例需求:运用python语言在http://www.ip138.com/post/网站爬取全国各个省市县级城市的邮政编码,并且保存在excel文件中
实例环境:python3.7
requests库(第三方库,并非python内置库,需要通过pip install requests手动安装)
xlwt库(需要自己手动安装)
实例网站:第一步,在http://www.ip138.com/post/网站查看网页源代码,可以找到各个省份的链接
第二步,点击链接,即可看到所点击省份的城市的邮政编码
实例代码:
import requests
import xlwt

# Site root; the province path scraped from the image map is appended to it.
BASE_URL = 'http://www.ip138.com/'
# Seconds to wait on the server before a request is abandoned, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 10


def _fetch_html(url):
    """Download ``url`` and return the response body as text."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Decode with the encoding guessed from the body rather than the headers.
    response.encoding = response.apparent_encoding
    return response.text


def _cell_text(cell, open_marker):
    """Return the text of ``cell`` between ``open_marker`` and the first '</'."""
    start = cell.find(open_marker) + len(open_marker)
    return cell[start:cell.find('</')]


def _record(cells, first, city_marker='>'):
    """Build ``[city, post_code, area_code]`` from three consecutive cells.

    ``first`` is the index of the city cell; the post code and area code are
    read from the two cells that follow it.
    """
    return [
        _cell_text(cells[first], city_marker),
        _cell_text(cells[first + 1], '">'),
        _cell_text(cells[first + 2], '">'),
    ]


def getProvinceCode(url):
    """Return a dict mapping each province name to that province's URL.

    The links are read from the ``<map name="map_86" id="map_86">`` element
    of the page at ``url``. Every (name, url) pair is also printed. An empty
    dict is returned when the page does not contain that map.
    """
    content = _fetch_html(url)

    map_open = '<map name="map_86" id="map_86">'
    start = content.find(map_open)
    if start == -1:
        return {}
    start += len(map_open)
    # Search for the closing tag after the opening one, not from the top.
    end = content.find('</map>', start)
    if end == -1:
        end = len(content)

    href_marker = 'href="/'
    title_marker = 'title="'
    dict_prov_url = {}
    for line in content[start:end].splitlines():
        href_at = line.find(href_marker)
        title_at = line.find(title_marker)
        if href_at == -1 or title_at == -1:
            # Blank or unrelated line inside the map: nothing to extract.
            continue
        code_start = href_at + len(href_marker)
        code_end = line.find('/"', code_start)
        title_start = title_at + len(title_marker)
        title_end = line.find('"', title_start)
        if code_end == -1 or title_end == -1:
            continue
        dict_prov_url[line[title_start:title_end]] = (
            BASE_URL + line[code_start:code_end]
        )

    # Show each province name with its URL.
    for item in dict_prov_url.items():
        print(item)
    return dict_prov_url


def getPostCode(url):
    """Return ``[city, post_code, area_code]`` lists scraped from ``url``.

    The table following the '长途区号' header cell is split into rows on
    ``<tr bgcolor="#ffffff">`` and each row into cells on ``<td``. A row
    yields one record when ``cells[4]`` contains 'nbsp', otherwise two
    (from ``cells[1..3]`` and ``cells[4..6]``). The result is also printed
    through ``showPost``. Rows with too few cells are skipped instead of
    raising ``IndexError``.

    NOTE(review): the original listing had lost its indentation; the branch
    that emits two records is paired with the ``cells[4]`` check because it
    is the only branch that reads ``cells[4..6]`` -- confirm against the
    author's source if available.
    """
    content = _fetch_html(url)

    header_end = '长途区号</b></td></tr>'
    code_list = []
    start = content.find(header_end)
    if start != -1:
        start += len(header_end)
        end = content.find('</table>', start)
        if end == -1:
            end = len(content)
        rows = content[start:end].strip().split('<tr bgcolor="#ffffff">')

        for row in rows:
            if not row:
                continue
            # cells[0] is whatever precedes the first '<td' of the row.
            cells = row.strip().split('<td')
            if len(cells) < 5:
                # The original only checked for 2 parts before reading
                # cells[4]; such rows carry no complete record.
                continue
            if 'nbsp' in cells[4]:
                if len(cells) >= 6:
                    # When cells[5] is not blank either, the city name is
                    # taken from after a '<b>' tag instead of after '>'.
                    marker = '>' if 'nbsp' in cells[5] else '<b>'
                    code_list.append(_record(cells, 1, marker))
            else:
                code_list.append(_record(cells, 1))
                if len(cells) >= 7:
                    code_list.append(_record(cells, 4))

    showPost(code_list)
    return code_list


def showPost(code_list):
    """Print every record of the list returned by ``getPostCode``."""
    for record in code_list:
        print(record)


def write_excel(path):
    """Scrape every province and save the results to the .xls file ``path``.

    One sheet is created per province, named after it, with a header row
    followed by one row per record.

    Raises:
        RuntimeError: if no province links could be found on the index page.
    """
    provinces = getProvinceCode('http://www.ip138.com/post/')
    if not provinces:
        # Nothing to write; fail loudly rather than let the caller report
        # success for a file with no data.
        raise RuntimeError('no province links found on the index page')

    workbook = xlwt.Workbook(encoding='utf-8')
    header = ['城市名称', '邮政编码', '长途区号']  # first row of every sheet
    for title, url in provinces.items():
        data_sheet = workbook.add_sheet(title)
        for col, text in enumerate(header):
            data_sheet.write(0, col, text)
        # Data rows start at 1 because row 0 holds the header.
        for row_idx, record in enumerate(getPostCode(url), start=1):
            for col, value in enumerate(record):
                data_sheet.write(row_idx, col, value)
    workbook.save(path)


if __name__ == '__main__':
    path = './postcode.xls'
    write_excel(path)
    print('写入postcode.xls文件成功')
实例结果:
终端显示:
excel文件: