BeautifulSoup的基本用法
# Basic BeautifulSoup usage: parse an HTML string, then reach tags through
# attribute access, find_all() and find().
from bs4 import BeautifulSoup

html = """ <html><head><title>haha,The Dormouse\'s story</title></head> <body> <p class="title" name="dromouse"><b>The Dormouse\'s story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """

soup = BeautifulSoup(html, 'lxml')
# print(soup.prettify())  # pretty-print the parsed document

# Recorded output from the original post is shown under each call.
print(soup.title)
# <title>haha,The Dormouse's story</title>
print(soup.title.name)
# title
print(soup.title.string)
# haha,The Dormouse's story
print(soup.title.parent.name)
# head
print(soup.p)  # the <p> tag
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
print(soup.p["class"])
# ['title']
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
print(soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find(id='link3'))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
View Code
# Read the text content of a tag through its .string attribute.
from bs4 import BeautifulSoup

# The first <p> originally carried a misspelled attribute (clss="title");
# it is corrected to class="title" so the fixture matches the other examples.
html = """ <html><head><title>The Dormouse\'s story</title></head> <body> <p class="title" name="dromouse"><b>The Dormouse\'s story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """

soup = BeautifulSoup(html, 'lxml')

print(soup.p.string)
# Recorded output from the original post:
# The Dormouse's story
View Code
# Chain tag attribute access to walk down the tree: head -> title -> text.
from bs4 import BeautifulSoup

html = """ <html><head><title>The Dormouse\'s story</title></head> <body> <p class="title" name="dromouse"><b>The Dormouse\'s story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """

soup = BeautifulSoup(html, 'lxml')

print(soup.head.title.string)
# Recorded output from the original post:
# The Dormouse's story
View Code
# Inspect the children of a tag through its .contents attribute.
from bs4 import BeautifulSoup

# Example 1: a <p> that mixes text and several child tags.
html = """ <html> <head> <title>The Dormouse\'s story</title> </head> <body> <p class="story"> Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>Elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """

soup = BeautifulSoup(html, 'lxml')
print(soup.p.contents)
# (output omitted in the original post)

# Example 2: a <p> with a single <b> child.
# The variable is named `snippet` rather than `string` so it does not shadow
# the standard-library module of that name.
snippet = '''<p class="title" name="dromouse"><b>The Dormouse\'s story</b></p>'''

soup = BeautifulSoup(snippet, 'lxml')
print(soup.p.contents)
# Recorded output from the original post:
# [<b>The Dormouse's story</b>]
View Code
# Navigate upwards from a tag: .parent and .parents.
from bs4 import BeautifulSoup

html = """ <html> <head> <title>The Dormouse\'s story</title> </head> <body> <p class="story"> Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>Elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """

# Both examples in the original post parse the same document, so it is
# parsed once here and shared.
soup = BeautifulSoup(html, 'lxml')

# Parent node of the first <a>.
print(soup.a.parent)
# (output omitted in the original post)

# All parent nodes of the first <a>, numbered with enumerate().
print(list(enumerate(soup.a.parents)))
# (output omitted in the original post)
View Code
# Navigate sideways from a tag: .next_siblings and .previous_siblings.
from bs4 import BeautifulSoup

html = """ <html> <head> <title>The Dormouse\'s story</title> </head> <body> <p class="story"> Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>Elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> and they lived at the bottom of a well. </p> <p class="story">...</p> """

soup = BeautifulSoup(html, 'lxml')

# Both attributes are iterables; list(enumerate(...)) materializes them so
# they can be printed with their positions.
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))
# (output omitted in the original post)
View Code
find_all(name, attrs, recursive, string, limit, **kwargs)(旧版本中 string 参数名为 text)
# find_all() by tag name, and nested find_all() on each result.
from bs4 import BeautifulSoup

html = ''' <div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div> </div> '''

# Both examples in the original post parse the same document, so it is
# parsed once here and shared.
soup = BeautifulSoup(html, 'lxml')

# Example 1: every <ul> in the document; each result is a Tag object.
# Recorded output from the original post is shown under each call.
print(soup.find_all('ul'))
# [<ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li></ul>,
#  <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li></ul>]
print(type(soup.find_all('ul')[0]))
# <class 'bs4.element.Tag'>

# Example 2: because each result is a Tag, find_all() can be called on it
# again to search only inside that <ul>.
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>]
View Code
# find_all() by attribute: the attrs dict form and the keyword-argument form.
from bs4 import BeautifulSoup

# Example 1: attrs={...}. The first <ul> carries name="elements" here.
html_with_name = ''' <div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1" name="elements"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div> </div> '''

soup = BeautifulSoup(html_with_name, 'lxml')
# Recorded output from the original post is shown under each call
# (whitespace between tags may differ when run on this one-line fixture).
print(soup.find_all(attrs={'id': 'list-1'}))
# [<ul class="list" id="list-1" name="elements"><li class="element">Foo</li><li class="element">Bar</li><li class="element">Jay</li></ul>]
print(soup.find_all(attrs={'name': 'elements'}))
# [<ul class="list" id="list-1" name="elements"><li class="element">Foo</li><li class="element">Bar</li><li class="element">Jay</li></ul>]

# Example 2: keyword arguments. `class` is a Python keyword, so the
# argument is spelled class_.
html = ''' <div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div> </div> '''

soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(id='list-1'))
# [<ul class="list" id="list-1"><li class="element">Foo</li><li class="element">Bar</li><li class="element">Jay</li></ul>]
print(soup.find_all(class_='element'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>,
#  <li class="element">Foo</li>, <li class="element">Bar</li>]
View Code
# find_all() by text content.
from bs4 import BeautifulSoup

html = ''' <div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div> </div> '''

soup = BeautifulSoup(html, 'lxml')

# `string=` is the current name of the argument that older Beautiful Soup
# releases called `text=`; the old spelling is deprecated.
print(soup.find_all(string='Foo'))
# Recorded output from the original post:
# ['Foo', 'Foo']
View Code
# CSS selectors with select(), and nested select() on each result.
from bs4 import BeautifulSoup

html = ''' <div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div> </div> '''

# Both examples in the original post parse the same document, so it is
# parsed once here and shared.
soup = BeautifulSoup(html, 'lxml')

# Example 1: class, tag and id selectors; each result is a Tag object.
# Recorded output from the original post is shown under each call
# (whitespace between tags may differ when run on this one-line fixture).
print(soup.select('.panel .panel-heading'))
# [<div class="panel-heading"><h4>Hello</h4></div>]
print(soup.select('ul li'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>,
#  <li class="element">Foo</li>, <li class="element">Bar</li>]
print(soup.select('#list-2 .element'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>]
print(type(soup.select('ul')[0]))
# <class 'bs4.element.Tag'>

# Example 2: select() can be called again on each result to search only
# inside that <ul>.
for ul in soup.select('ul'):
    print(ul.select('li'))
# [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
# [<li class="element">Foo</li>, <li class="element">Bar</li>]
View Code
# Read an attribute from a selected tag: tag['id'] versus tag.attrs['id'].
from bs4 import BeautifulSoup

html = ''' <div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div> </div> '''

soup = BeautifulSoup(html, 'lxml')

for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
# Recorded output from the original post:
# list-1
# list-1
# list-2
# list-2
# Both ways of reading the attribute give the same result.
View Code
# Read the text of each selected tag with get_text().
from bs4 import BeautifulSoup

html = ''' <div class="panel"> <div class="panel-heading"> <h4>Hello</h4> </div> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ul class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ul> </div> </div> '''

soup = BeautifulSoup(html, 'lxml')

for li in soup.select('li'):
    print(li.get_text())
# Recorded output from the original post:
# Foo
# Bar
# Jay
# Foo
# Bar
View Code
-
推荐使用lxml解析库,必要时使用html.parser
-
标签选择(如 soup.p)筛选功能弱,但是速度快;需要更强的筛选时,建议使用find()、find_all()
-
find()用于查询匹配的单个结果,find_all()用于查询匹配的多个结果;如果对CSS选择器熟悉,建议使用select()
-
记住常用的获取属性和文本值的方法