Common BeautifulSoup Usage
I. Installing BeautifulSoup
pip install beautifulsoup4
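The examples below parse with features="lxml", so the lxml parser should be installed as well (otherwise BeautifulSoup cannot find a tree builder for "lxml"); the standard-library "html.parser" is a drop-in fallback if you prefer not to install it:
pip install lxml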
II. Usage Examples
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
    <div class="story">Once upon a time there were three little sisters; and their names were
        <a class="sister0" id="link1">Els<span>f</span>ie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</div>
    ad<br/>sf
    <p class="story">...</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, features="lxml")
1. name, the tag's name
tag1 = soup.find('a')
print(tag1)          # print the first <a> tag
name = tag1.name     # get the tag name
print(name)
tag1.name = 'span'   # rename the tag to <span>
print(soup)          # print the modified document
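As a quick check (a sketch, not part of the original example), the renamed element keeps its attributes and can now be located by its new name:
renamed = soup.find('span', id='link1')   # the former <a id="link1"> tag
print(renamed.name)    # 'span'
print(renamed.attrs)   # attributes are unchanged by the rename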
2. attrs, tag attributes
tag2 = soup.find('a')
attrs = tag2.attrs                                  # get all attributes of the first <a> tag
print(attrs)
link1 = soup.find_all('a', attrs={'id': 'link1'})   # all <a> tags whose attributes include id="link1"
print(link1)
tag2.attrs = {'ik': 123}                            # replace the whole attrs dict
print(tag2.attrs)
tag2.attrs['id'] = 'xxxx'                           # set a single attribute
print(tag2.attrs)
tag2.attrs['id'] = 'qqq'                            # overwrite it again
print(tag2.attrs)
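Attributes can also be read one at a time; a small sketch (re-parsing html_doc so the edits above don't interfere) using dict-style indexing and the safer .get():
fresh = BeautifulSoup(html_doc, features="lxml")
a = fresh.find('a', id='link2')
print(a['href'])       # dict-style access, raises KeyError if the attribute is missing
print(a.get('href'))   # 'http://example.com/lacie'
print(a.get('rel'))    # None, because the attribute does not exist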
3. The difference between find and find_all
# find returns only the first matching tag
tag3 = soup.find('a')
print(tag3)
# find_all returns every matching tag
tag4 = soup.find_all('a')
print(tag4)
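One more difference worth knowing, shown as a short sketch: when nothing matches, find returns None while find_all returns an empty list.
fresh = BeautifulSoup(html_doc, features="lxml")
print(fresh.find('table'))       # None
print(fresh.find_all('table'))   # []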
4. clear, empty out everything inside a tag (the tag itself is kept)
tag5 = soup.find('body')
tag5.clear()
print(soup)
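A smaller sketch of the same idea on a single tag, using a fresh parse: after clear() the tag is still in the tree, just empty.
fresh = BeautifulSoup(html_doc, features="lxml")
a = fresh.find('a', id='link2')
a.clear()
print(a)   # an empty <a> tag, attributes intact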
5. has_attr, check whether a tag has a given attribute
tag6 = soup.find('a')
v = tag6.has_attr('id')
print(v)
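has_attr is useful before indexing, since tag['missing'] raises KeyError; a short sketch on a fresh parse:
fresh = BeautifulSoup(html_doc, features="lxml")
a = fresh.find('a', id='link2')
if a.has_attr('href'):
    print(a['href'])        # safe: we checked first
print(a.has_attr('rel'))    # False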
6. get_text, get the text inside a tag
tag7 = soup.find('a')
v = tag7.get_text()   # get_text() takes an optional separator string, not an attribute name
print(v)
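get_text() also accepts a separator and a strip flag; a small sketch on a fresh parse:
fresh = BeautifulSoup(html_doc, features="lxml")
story = fresh.find('div', class_='story')
print(story.get_text(' ', strip=True))   # text pieces joined with spaces, whitespace stripped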
7. decompose, recursively remove the tag and all of its contents from the tree and destroy them
body = soup.find('body')
body.decompose()
print(soup)
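A sketch of decompose() on a single tag, on a fresh parse: the tag disappears from the document completely.
fresh = BeautifulSoup(html_doc, features="lxml")
fresh.find('div', class_='title').decompose()
print(fresh.find('div', class_='title'))   # None, the tag is gone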
8. extract, recursively remove the tag and all of its contents from the tree, and return the removed tag
body = soup.find('body')
v = body.extract()   # extract() returns the removed tag
print(soup)
print(v)
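Unlike decompose, extract hands the removed subtree back, so it can still be inspected or re-inserted elsewhere; a sketch on a fresh parse:
fresh = BeautifulSoup(html_doc, features="lxml")
removed = fresh.find('p', class_='story').extract()
print(removed)                   # the detached <p> tag
print(fresh.find('p') is None)   # True, it is no longer in the document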
9. decode, convert to a string (including the current tag); decode_contents (excluding the current tag)
body = soup.find('body')
# v = body.decode()
v = body.decode_contents()
print(v)
10. encode, convert to bytes (including the current tag); encode_contents (excluding the current tag)
body = soup.find('body')
# v = body.encode()
v = body.encode_contents()
print(v)
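The decode/encode pair mirror each other; a sketch on a fresh parse showing the return types:
fresh = BeautifulSoup(html_doc, features="lxml")
body = fresh.find('body')
print(type(body.decode()))   # <class 'str'>
print(type(body.encode()))   # <class 'bytes'>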
11. The content of a tag
tag8 = soup.find('span')
print(tag8.string)            # get the text content
print(tag8)
tag8.string = 'new content'   # set new content
print(tag8)
tag9 = soup.find('body')
v = tag9.stripped_strings     # generator over all text inside the tag, recursively, with whitespace stripped
print(v)
print(next(v))
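Note that .string only works when a tag has a single child; for mixed content it returns None, and get_text() is the usual alternative. A sketch on a fresh parse:
fresh = BeautifulSoup(html_doc, features="lxml")
div = fresh.find('div', class_='story')
print(div.string)      # None: the div mixes text and several <a> tags
print(div.get_text())  # concatenated text of everything inside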
12. children, all direct child nodes
body = soup.find('body')
v = body.children   # iterator over the direct children (tags and text nodes)
print(list(v))
13. descendants, all descendants (children, grandchildren, and so on)
body = soup.find('body')
v = body.descendants   # iterator over every nested tag and text node
print(list(v))
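To see the difference between the two, a sketch counting nodes on a fresh parse: children covers only the first level, descendants the whole subtree.
fresh = BeautifulSoup(html_doc, features="lxml")
body = fresh.find('body')
print(len(list(body.children)))      # direct children only
print(len(list(body.descendants)))   # every nested tag and text node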
For anything not covered here, see the official documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html