# Extracting text from HTML with BeautifulSoup.
from bs4 import BeautifulSoup

# Sample document. Adjacent string literals are concatenated by the parser,
# so the markup is one single line with no newline characters in it.
html_sample = (
    '<html> '
    '<body> '
    '<h1 id = "title">Hello world</h1>'
    '<a href = "#www.baidu.com" class = "link"> This is link1</a>'
    '<a href = "#link2" class = "link"> This is link2</a> '
    '</body> '
    '</html>'
)

soup = BeautifulSoup(html_sample, 'html.parser')

# All text of the document with the tags stripped.
print(soup.text)

# select() takes a CSS selector and returns a list of the matching tags,
# so a single element is reached by indexing into that list.
headings = soup.select('h1')
print(headings[0].text)

# Query the anchors once and reuse the result below.
alinks = soup.select('a')
print(alinks[0].text)
print(alinks[1].text)

for alink in alinks:
    print(alink.text)

# '#title' selects by id, '.link' selects by class.
print(soup.select('#title')[0].text)
print(soup.select('.link')[0].text)

# Tag attributes are read with dictionary-style indexing.
for link in alinks:
    print(link['href'])

demo2:

 1 import requests
 2 from bs4 import BeautifulSoup
 3 res = requests.get(\'http://news.qq.com/\')
 4 soup = BeautifulSoup(res.text,\'html.parser\')
 5 newsary = []
 6 for news in soup.select(\'.Q-tpWrap .text\'):
 7     newsary.append({\'title\':news.select(\'a\')[0].text, \'url\':news.select(\'a\')[0][\'href\']})
 8 
 9 import pandas 
10 newsdf = pandas.DataFrame(newsary)
11 newsdf.to_excel(\'news.xlsx\')

 推荐使用:Jupyter Notebook 做练习,很方便。

版权声明:本文为hujianglang原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/hujianglang/p/9650329.html