Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API
npx @tessl/cli install tessl/pypi-beautifulsoup4@4.3.0
Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work by providing a Pythonic API for parsing documents with malformed markup.
pip install beautifulsoup4
html.parser (included with Python)
pip install lxml (faster, supports XML)
pip install html5lib (pure Python, handles HTML5)
from bs4 import BeautifulSoup
Additional classes for advanced usage:
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4 import CData, ProcessingInstruction, Doctype
from bs4 import SoupStrainer, ResultSet
Diagnostic and configuration imports:
from bs4.diagnose import diagnose, lxml_trace, htmlparser_trace, benchmark_parsers, profile
from bs4.builder import builder_registry, TreeBuilder, HTMLTreeBuilder
from bs4.dammit import UnicodeDammit, EntitySubstitution
from bs4 import BeautifulSoup
# Parse HTML content
html = '<html><head><title>Sample Page</title></head><body><p class="content">Hello, world!</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')
# Navigate the parse tree
title = soup.title.string
print(title) # "Sample Page"
# Find elements by tag
paragraph = soup.find('p')
print(paragraph.get_text()) # "Hello, world!"
# Find elements by CSS class
content = soup.find('p', class_='content')
print(content['class']) # ['content']
# Use CSS selectors
content = soup.select('p.content')[0]
print(content.get_text()) # "Hello, world!"
# Modify the tree
new_tag = soup.new_tag('span', id='highlight')
new_tag.string = 'Important!'
paragraph.append(new_tag)
# Output modified HTML
print(soup.prettify())
Beautiful Soup uses a layered architecture that separates parsing from tree manipulation.
This design enables Beautiful Soup to handle malformed markup gracefully while providing an intuitive Pythonic API for web scraping, document processing, and HTML/XML manipulation tasks.
Primary BeautifulSoup class for parsing HTML and XML documents with configurable parser backends and encoding detection.
class BeautifulSoup(Tag):
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs): ...
def new_tag(self, name, namespace=None, nsprefix=None, **attrs): ...
def new_string(self, s, subclass=NavigableString): ...
Navigate through the parse tree using parent-child relationships, sibling traversal, and document-order iteration with both property access and generator-based approaches.
# Navigation properties
@property
def parent(self): ...
@property
def next_sibling(self): ...
@property
def previous_sibling(self): ...
@property
def next_element(self): ...
@property
def previous_element(self): ...
# Navigation generators
@property
def parents(self): ...
@property
def next_siblings(self): ...
@property
def previous_siblings(self): ...
@property
def next_elements(self): ...
@property
def previous_elements(self): ...
Find elements using tag names, attributes, text content, CSS selectors, and custom matching functions with both single and multiple result options.
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): ...
def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): ...
def select(self, selector): ...
def select_one(self, selector): ...
# Directional search
def find_next(self, name=None, attrs={}, text=None, **kwargs): ...
def find_previous(self, name=None, attrs={}, text=None, **kwargs): ...
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): ...
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): ...
def find_parent(self, name=None, attrs={}, **kwargs): ...
Modify the parse tree by inserting, removing, replacing elements and their attributes with automatic relationship maintenance.
def extract(self): ...
def decompose(self): ...
def replace_with(self, *args): ...
def wrap(self, wrap_inside): ...
def unwrap(self): ...
def insert(self, position, new_child): ...
def insert_before(self, *args): ...
def insert_after(self, *args): ...
def append(self, tag): ...
def clear(self, decompose=False): ...
Extract text content, attribute values, and formatted output from parse tree elements with flexible filtering and formatting options.
def get_text(self, separator="", strip=False, types=(NavigableString,)): ...
def get(self, key, default=None): ...
def has_attr(self, key): ...
@property
def string(self): ...
@property
def strings(self): ...
@property
def stripped_strings(self): ...
@property
def text(self): ...
Render parse tree elements as formatted HTML/XML with encoding control, pretty-printing, and entity substitution options.
def encode(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"): ...
def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): ...
def prettify(self, encoding=None, formatter="minimal"): ...
def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): ...
def encode_contents(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"): ...
class PageElement:
"""Base class for all parse tree elements"""
class NavigableString(str, PageElement):
"""Text content within tags"""
class PreformattedString(NavigableString):
"""Text that should preserve original formatting"""
class Tag(PageElement):
"""HTML/XML elements with attributes and children"""
name: str
attrs: dict
contents: list
class Comment(NavigableString):
"""HTML/XML comments"""
class CData(NavigableString):
"""CDATA sections"""
class ProcessingInstruction(NavigableString):
"""XML processing instructions"""
class Doctype(NavigableString):
"""DOCTYPE declarations"""
class SoupStrainer:
"""Search criteria for filtering elements"""
def __init__(self, name=None, attrs={}, text=None, **kwargs): ...
class ResultSet(list):
"""List of search results with source tracking"""
class FeatureNotFound(ValueError):
"""Raised when requested parser features are not available"""
class StopParsing(Exception):
"""Exception to stop parsing early"""
class ParserRejectedMarkup(Exception):
"""Raised when parser cannot handle the provided markup"""