> pip install requests > pip install beautifulsoup4
BeautifulSoup | The BeautifulSoup object represents the document to be parsed as a nested data structure. |
BeautifulStoneSoup | |
builder | A module |
builder_registry | A bs4.builder.TreeBuilderRegistry class |
CData | |
Comment | |
Counter | |
dammit | A module |
Declaration | |
DEFAULT_OUTPUT_ENCODING | A string |
Doctype | |
element | A module |
FeatureNotFound | |
formatter | A module |
GuessedAtParserWarning | |
MarkupResemblesLocatorWarning | |
NavigableString | |
os | A module |
PageElement | |
ParserRejectedMarkup | |
ProcessingInstruction | |
PYTHON_SPECIFIC_ENCODINGS | A set |
re | A module (possibly the re module?) |
ResultSet | |
_s | |
Script | |
_soup | |
SoupStrainer | |
StopParsing | |
Stylesheet | |
sys | A module (possibly the sys module?) |
Tag | |
TemplateString | |
traceback | A module |
UnicodeDammit | |
warnings | A module |
XMLParsedAsHTMLWarning | |
A BeautifulSoup object represents the document to be parsed as a nested data structure.
BeautifulSoup inherits from Tag, which inherits from PageElement, which inherits from object.
Many of the members of BeautifulSoup are inherited from those two classes (Tag and PageElement).
PageElement contains the navigational information for some part of the page (i.e. its current location in the parse tree).
Tag represents an HTML or XML tag that is part of a parse tree, along with its attributes and contents.
Because Tag implements __getattr__() (which calls find() under the hood), it's possible to get a reference to a tag like so:
doc = BeautifulSoup(htmlTxt, 'html.parser')
print(doc.title)
The members of BeautifulSoup are:
_all_strings() | |
append() | |
ASCII_SPACES | |
childGenerator() | |
children | |
clear() | |
decode() | |
decode_contents() | |
_decode_markup | Class method? |
decompose() | |
decomposed | |
default | |
DEFAULT_BUILDER_FEATURES | |
DEFAULT_INTERESTING_STRING_TYPES | |
descendants | |
encode() | |
encode_contents() | |
endData() | |
extend() | |
extract() | |
_feed() | |
fetchNextSiblings() | |
fetchParents() | |
fetchPrevious() | |
fetchPreviousSiblings() | |
find() | |
_find_all() | |
findAll() | |
find_all() | |
findAllNext() | |
find_all_next() | |
findAllPrevious() | |
find_all_previous() | |
findChild() | |
findChildren() | |
findNext() | |
find_next() | |
findNextSibling() | |
find_next_sibling() | |
findNextSiblings() | |
find_next_siblings() | |
_find_one() | |
findParent() | |
find_parent() | |
findParents() | |
find_parents() | |
findPrevious() | |
find_previous() | |
findPreviousSibling() | |
find_previous_sibling() | |
findPreviousSiblings() | |
find_previous_siblings() | |
format_string() | |
formatter_for_name() | |
get() | |
get_attribute_list() | |
getText() | |
get_text() | |
handle_data() | |
handle_endtag() | |
handle_starttag() | |
has_attr() | |
has_key() | |
index() | |
insert() | |
insert_after() | |
insert_before() | |
is_empty_element | |
isSelfClosing | |
_is_xml | |
_last_descendant() | |
_lastRecursiveChild() | |
_linkage_fixer() | |
_markup_is_url | Class method? |
_markup_resembles_filename | Class method? |
new_string() | |
new_tag() | |
next | |
next_elements | |
nextGenerator() | |
nextSibling | |
nextSiblingGenerator() | |
next_siblings | |
NO_PARSER_SPECIFIED_WARNING | |
object_was_parsed() | |
parentGenerator() | |
parents | |
parserClass | |
popTag() | |
_popToTag() | |
prettify() | |
previous | |
previous_elements | |
previousGenerator() | |
previousSibling | |
previousSiblingGenerator() | |
previous_siblings | |
pushTag() | |
recursiveChildGenerator() | |
renderContents() | |
replaceWith() | |
replace_with() | |
replaceWithChildren() | |
replace_with_children() | |
reset() | |
ROOT_TAG_NAME | |
select() | |
select_one() | |
setup() | |
_should_pretty_print() | |
smooth() | |
string | |
string_container() | |
strings | |
stripped_strings | |
text | |
unwrap() | |
wrap() | |
# Example: download a web page, then print its <title> and every link on it.
from bs4 import BeautifulSoup
import requests

# A timeout keeps the script from blocking forever if the server never answers
# (requests applies no timeout unless one is given).
html_text = requests.get(
    'https://github.com/ReneNyffenegger/about-python/tree/master/libraries/BeautifulSoup/script.py',
    timeout=10,
).text

# Name the parser explicitly: without it, bs4 picks whichever parser happens to
# be installed and emits a GuessedAtParserWarning, so results can differ
# between machines.
soup = BeautifulSoup(html_text, 'html.parser')

# soup.title is the first <title> tag (attribute access on a Tag calls find()).
print("Title: ", soup.title)
print(" .name: ", soup.title.name)
print(" .string ", soup.title.string)
print(" .parent.name: ", soup.title.parent.name)

print()
print("Links:")
for a in soup.find_all('a'):
    # a.string is the link text, a.get('href') the link target
    # (None when the <a> tag has no href attribute).
    print(" %-30s: %s" % (a.string, a.get('href')))
# Example: walk a parsed document recursively and print its tree structure.
from bs4 import BeautifulSoup

# Name the parser explicitly: without it, bs4 picks whichever parser happens to
# be installed (and warns about it); some parsers wrap a fragment like this in
# extra <html>/<body> tags, which would change the printed tree.
soup = BeautifulSoup(
    """<foo><c>text one<sub>ttt</sub>text two<sub>uuu</sub>text three</c></foo>""",
    'html.parser',
)


def descend(node, level):
    """Print the tree below *node*, one line per opening tag, closing tag and text.

    node  -- element whose ``.contents`` are walked (the soup itself or a tag).
    level -- nesting depth; every printed line is prefixed with ``level`` spaces.
    """
    for child in node.contents:
        if child.name is not None:
            # Child has a tag name: print it and recurse into its own children.
            print(" " * level, "<" + child.name + ">")
            descend(child, level + 1)
            print(" " * level, "</" + child.name + ">")
        else:
            # Child has no tag name (e.g. a text node): print its text.
            print(" " * level, " " + child.string)


descend(soup, 0)
>>> import beautifulsoup4 Traceback (most recent call last): File "<stdin>", line 1, in <module> ModuleNotFoundError: No module named 'beautifulsoup4'
>>> import bs4