用 Python 爬取博客园首页的内容信息，写入 Excel 表格，最后导入 MySQL

MySQL 和 SQLyog 的安装方法见我的另一篇博客：https://www.cnblogs.com/ljy1227476113/p/11947066.html

爬虫的思路基本沿用我之前的一篇博客：https://www.cnblogs.com/ljy1227476113/p/10913508.html

Python 中处理 Excel（.xls 格式）有两个基本的库：

读取 xlrd

写入 xlwt

话不多说直接上代码

 1 import xlrd#读取excel
 2 import xlwt#写入excel
 3 import MySQLdb
 4 import requests
 5 import linecache
 6 import random
 7 from bs4 import BeautifulSoup
 8 
 9 if __name__=="__main__":
10     f = xlwt.Workbook(encoding='utf-8') #创建工作簿
11     sheet1 = f.add_sheet(u'sheet1') #创建sheet
12     row0 = [u'ID',u'name',u'tref',u'comment_num']
13     #生成第一行
14     for i in range(0,len(row0)):
15         sheet1.write(0,i,row0[i])
16 
17     n=0#ID编号
18     target='https://www.cnblogs.com/'#博客园首页
19     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
20     headers = {'User-Agent':user_agent}
21 
22     req=requests.get(url=target)
23     html=req.text
24     html=html.replace('<br>',' ').replace('<br/>',' ').replace('/>','>')
25     bf=BeautifulSoup(html,"html.parser")   
26     texts=bf.find_all('div',class_='post_item_body')
27     #texts_div=texts.find_all('div',class_='wz_content')
28     for item in texts:
29         n=n+1
30         item_name=item.find('a').text#标题
31         item_href=item.find('a')['href']#链接
32         item_refer2=item.find('span',class_='article_comment').text#评论数
33         print('{} {} {}\n'.format(item_name,item_href,item_refer2))
34         mid=[n,item_name,item_href,item_refer2]
35         for i in range(4):#写入excel
36             sheet1.write(n,i,mid[i])
37     print("Done!")
38     f.save('demo1.xls') #保存文件
39 
40     book = xlrd.open_workbook("demo1.xls")#打开excel
41     sheet = book.sheet_by_name("sheet1")
42 #建立一个MySQL连接
43     database = MySQLdb.connect (host="localhost", user = "root", passwd = "111111", db = "mysql")
44 # 获得游标对象, 用于逐行遍历数据库数据
45     cursor = database.cursor()
46 # 创建插入SQL语句
47     query = """INSERT INTO mypython VALUES (%s, %s, %s, %s)"""
48 # 创建一个for循环迭代读取xls文件每行数据的, 从第二行开始是要跳过标题
49     for r in range(1, sheet.nrows):
50         product      = sheet.cell(r,0).value
51         customer = sheet.cell(r,1).value.encode()#python3中string格式要encode()
52         rep          = sheet.cell(r,2).value.encode()
53         date     = sheet.cell(r,3).value.encode()
54 
55         values = (0, customer, rep, date)
56       # 执行sql语句
57         cursor.execute(query, values)
58 # 关闭游标
59     cursor.close()
60 # 提交
61     database.commit()
62 # 关闭数据库连接
63     database.close()
64 # 打印结果
65     print("All Done! ")