用Python编写爬取股票信息的代码

北京理工大学嵩天老师编写的小课件,很不完善,放进来只是为了方便自己用手机查看

 1 import requests
 2 import re
 3 import bs4
 4 import traceback
 5 
 6 def getHTMLText(url, code = "utf-8"):
 7     # 获得股票页面
 8     try:
 9         r = requests.get(url)
10         r.raise_for_status()
11         r.encoding = code
12         # r.encoding = r.apparent_encoding
13         # 直接用"utf-8"编码节省时间
14         return r.text
15     except:
16         return ""
17 
def getStockList(lst, stockURL):
    """Collect the stock codes linked from the listing page into *lst*.

    Args:
        lst: List that is extended in place with one code per matching
            link, in page order. Duplicates are kept.
        stockURL: URL of the page that lists all stocks.

    A code is the first substring of a link's ``href`` that looks like
    ``sh`` or ``sz`` followed by six digits. Links without an ``href`` or
    without such a substring are ignored. If the listing page cannot be
    downloaded, *lst* is left unchanged.
    """
    # The listing page is fetched with the GB2312 encoding, as the
    # original author notes the site uses it.
    html = getHTMLText(stockURL, "GB2312")
    soup = bs4.BeautifulSoup(html, "html.parser")

    # Compiled once, outside the loop over every anchor on the page.
    code_pattern = re.compile(r"[s][hz]\d{6}")

    for anchor in soup.find_all("a"):
        href = anchor.attrs.get("href")
        if not isinstance(href, str):
            # Anchor without a usable href: nothing to extract.
            continue
        match = code_pattern.search(href)
        if match:
            lst.append(match.group(0))
30 
def getStockInfo(lst, stockURL, fpath):
    """Fetch the detail page of every stock in *lst* and append it to a file.

    Args:
        lst: Stock codes; each is turned into ``stockURL + code + ".html"``.
        stockURL: Base URL of the per-stock detail pages.
        fpath: Path of the output file. One ``str(dict)`` line is appended
            per successfully parsed stock, encoded as UTF-8.

    Returns:
        Always the empty string (kept for compatibility with callers).

    Each record holds the stock name under the key "股票名称" plus one entry
    per ``<dt>``/``<dd>`` pair found inside the ``stock-bets`` block. When
    the two tag lists differ in length, the unmatched trailing tags are
    dropped. A stock whose page is empty, has unexpected markup, or cannot
    be written is skipped. Progress is printed after every stock, including
    skipped ones, so that it reaches 100%.
    """
    total = len(lst)

    for count, stock in enumerate(lst, start=1):
        html = getHTMLText(stockURL + stock + ".html")

        if html:
            try:
                soup = bs4.BeautifulSoup(html, "html.parser")
                # Container tag that holds all data for the stock.
                stockInfo = soup.find("div", attrs={"class": "stock-bets"})

                name = stockInfo.find_all(attrs={"class": "bets-name"})[0]
                infoDict = {"股票名称": name.text.split()[0]}

                # <dt> tags carry the field names, <dd> tags the values.
                keyList = stockInfo.find_all("dt")
                valueList = stockInfo.find_all("dd")
                for key_tag, value_tag in zip(keyList, valueList):
                    infoDict[key_tag.text] = value_tag.text

                with open(fpath, "a", encoding="utf-8") as f:
                    f.write(str(infoDict) + "\n")
            except Exception:
                # Best effort: skip this stock and keep crawling. Unlike a
                # bare ``except``, this lets Ctrl-C through. Call
                # traceback.print_exc() here when diagnosing failures.
                pass

        # "\r" returns to the start of the line so the percentage is
        # overwritten in place instead of scrolling.
        print("\r当前进度:{:.2f}%".format(count * 100 / total), end="")

    return ""
82 
if __name__ == "__main__":
    # Page that links to every listed stock; the codes are scraped from it.
    stock_list_url = "http://quote.eastmoney.com/stocklist.html"
    # Base URL of the per-stock detail pages; "<code>.html" is appended.
    stock_info_url = "https://gupiao.baidu.com/stock/"
    # File that receives one line per stock (appended, never truncated).
    output_file = "C:\\Users\\W419L\\Desktop\\股票爬取.txt"
    # Stock codes collected from the listing page.
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

 

版权声明:本文为csrw原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/csrw/p/10094154.html