Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Primary BeautifulSoup class for parsing HTML and XML documents with configurable parser backends and automatic encoding detection. Handles malformed markup gracefully while providing access to the complete parse tree.
The main parsing class that converts HTML/XML markup into a navigable parse tree using pluggable parser backends.
class BeautifulSoup(Tag):
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""
Parse HTML/XML markup into a navigable tree structure.
Parameters:
- markup: str, bytes, or file-like object containing HTML/XML
- features: str or list, parser features ('html.parser', 'lxml', 'html5lib', 'xml')
- builder: TreeBuilder instance (alternative to features)
- parse_only: SoupStrainer to parse only matching elements
- from_encoding: str, character encoding to assume for markup
- **kwargs: deprecated arguments from BeautifulSoup 3.x
Examples:
- BeautifulSoup(html_string, 'html.parser')
- BeautifulSoup(xml_string, 'lxml-xml')
- BeautifulSoup(markup, 'html5lib')
"""Usage Examples:
# Parse HTML with different parsers
from bs4 import BeautifulSoup
html = '<html><body><p>Hello</p></body></html>'
# Built-in HTML parser (slower but always available)
soup = BeautifulSoup(html, 'html.parser')
# lxml parser (faster, requires lxml package)
soup = BeautifulSoup(html, 'lxml')
# html5lib parser (most lenient, handles HTML5)
soup = BeautifulSoup(html, 'html5lib')
# XML parsing with lxml
xml = '<?xml version="1.0"?><root><item>data</item></root>'
soup = BeautifulSoup(xml, 'xml') # or 'lxml-xml'
# Parse from file
with open('document.html', 'r') as f:
soup = BeautifulSoup(f, 'html.parser')
# Parse with encoding specification
soup = BeautifulSoup(markup_bytes, 'html.parser', from_encoding='utf-8')

Create new tags and strings that are associated with the soup object and can be inserted into the parse tree.
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""
Create a new Tag associated with this soup.
Parameters:
- name: str, tag name
- namespace: str, XML namespace URI
- nsprefix: str, XML namespace prefix
- **attrs: tag attributes as keyword arguments
Returns:
Tag instance ready for insertion into parse tree
"""
def new_string(self, s, subclass=NavigableString):
"""
Create a new NavigableString associated with this soup.
Parameters:
- s: str, string content
- subclass: NavigableString subclass (Comment, CData, etc.)
Returns:
NavigableString instance ready for insertion
"""
def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
"""
Render the entire soup as Unicode string.
Parameters:
- pretty_print: bool - format with indentation (default: False)
- eventual_encoding: str - encoding for XML declaration if XML (default: "utf-8")
- formatter: str or function - entity formatting ("minimal", "html", "xml")
Returns:
str - Complete document as Unicode string
Note: BeautifulSoup.decode() differs from Tag.decode() in first parameter
"""Usage Examples:
from bs4 import BeautifulSoup, Comment
soup = BeautifulSoup('<html><body></body></html>', 'html.parser')
# Create new tag with attributes
new_div = soup.new_tag('div', id='main', **{'class': 'container'})  # note: new_tag does not translate class_ to class; pass 'class' explicitly
new_div.string = 'Content here'
# Create with namespace (XML)
new_item = soup.new_tag('item', namespace='http://example.com/ns')
# Create navigable string
new_text = soup.new_string('Some text content')
# Create comment
new_comment = soup.new_string('This is a comment', Comment)
# Insert into tree
soup.body.append(new_div)
soup.body.append(new_comment)

Control parsing behavior with features, filters, and encoding options.
# Parser features (can be combined)
features = [
'html.parser', # Built-in Python HTML parser
'lxml', # lxml HTML parser (fast)
'lxml-xml', # lxml XML parser
'xml', # Alias for lxml-xml; selects XML parsing mode
'html5lib', # html5lib parser (lenient)
'html', # HTML parsing mode
'fast', # Prefer faster parsers
'permissive' # Handle malformed markup
]
# Parse only specific elements
from bs4 import SoupStrainer
# Only parse div tags with class 'content'
parse_only = SoupStrainer('div', class_='content')
soup = BeautifulSoup(markup, 'html.parser', parse_only=parse_only)
# Only parse links
parse_only = SoupStrainer('a')
soup = BeautifulSoup(markup, 'html.parser', parse_only=parse_only)

Access information about the parser used and document characteristics.
# Parser properties
soup.builder # TreeBuilder instance used
soup.is_xml # Boolean, True if XML parser was used
soup.original_encoding # Detected encoding of source markup
soup.declared_html_encoding # Encoding declared in HTML meta tags
soup.contains_replacement_characters # Whether encoding conversion lost data

Handle parsing errors and invalid markup gracefully.
class FeatureNotFound(ValueError):
"""Raised when requested parser features are not available"""
class ParserRejectedMarkup(Exception):
"""Raised when parser cannot handle the provided markup"""Usage Examples:
from bs4 import BeautifulSoup, FeatureNotFound
try:
# This will fail if lxml is not installed
soup = BeautifulSoup(markup, 'lxml')
except FeatureNotFound:
# Fall back to built-in parser
soup = BeautifulSoup(markup, 'html.parser')
# Handle malformed markup
malformed_html = '<html><body><p>Unclosed paragraph<div>Mixed nesting</body></html>'
soup = BeautifulSoup(malformed_html, 'html.parser') # Parses successfully

Debug parsing issues and compare parser performance with diagnostic utilities.
def diagnose(data):
"""
Comprehensive diagnostic suite for troubleshooting parsing issues.
Tests multiple parsers on the same data and shows results and errors.
Useful for tech support and debugging parser selection problems.
Parameters:
- data: str, bytes, file-like object, or filename to parse
Prints diagnostic information including:
- Beautiful Soup version and Python version
- Available parsers and their versions
- Parse results from each parser
- Exception traces for failed parsers
"""
def lxml_trace(data, html=True, **kwargs):
"""
Print lxml parsing events to see raw parser behavior.
Shows the underlying lxml events during parsing without Beautiful Soup.
Parameters:
- data: str - markup to parse
- html: bool - use HTML parser mode (default: True)
- **kwargs: additional lxml parser options
Prints events in format: "event, tag, text"
"""
def htmlparser_trace(data):
"""
Print HTMLParser events to see raw parser behavior.
Shows the underlying HTMLParser events during parsing without Beautiful Soup.
Parameters:
- data: str - markup to parse
Prints events like: "TAG START", "DATA", "TAG END"
"""
def benchmark_parsers(num_elements=100000):
"""
Basic performance benchmark comparing available parsers.
Generates a large invalid HTML document and times parsing with
different parser backends to compare performance.
Parameters:
- num_elements: int - size of generated test document
Prints timing results for each available parser
"""
def profile(num_elements=100000, parser="lxml"):
"""
Profile Beautiful Soup parsing performance in detail.
Uses cProfile to analyze where time is spent during parsing.
Parameters:
- num_elements: int - size of generated test document
- parser: str - parser to profile ("lxml", "html.parser", etc.)
Returns profile statistics for analysis
"""Usage Examples:
from bs4.diagnose import diagnose, lxml_trace, htmlparser_trace, benchmark_parsers
# Debug parsing problems
problematic_html = '<html><body><p>Malformed HTML...'
diagnose(problematic_html)
# Compare parser performance
benchmark_parsers(50000)
# See raw parser events
lxml_trace('<p>Hello <b>world</b></p>')
htmlparser_trace('<p>Hello <em>world</em></p>')
# Profile for performance optimization
from bs4.diagnose import profile
profile(100000, 'lxml')

Advanced parser configuration and tree builder architecture for customizing parsing behavior.
class TreeBuilder:
"""
Base class for parser backends that convert markup into Beautiful Soup trees.
Used internally by BeautifulSoup to abstract different parser implementations.
"""
features = [] # List of supported feature strings
is_xml = False # Whether this parser handles XML
preserve_whitespace_tags = set() # Tags that preserve whitespace
empty_element_tags = None # Tags that can be self-closing
cdata_list_attributes = {} # Attributes containing space-separated lists
class HTMLTreeBuilder(TreeBuilder):
"""
Base class for HTML-specific tree builders.
Defines HTML-specific parsing behavior and tag characteristics.
"""
preserve_whitespace_tags = {'pre', 'textarea'}
empty_element_tags = {'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'}
class TreeBuilderRegistry:
"""
Registry for managing available parser backends.
Automatically selects appropriate parsers based on requested features.
"""
def register(self, treebuilder_class): ...
def lookup(self, *features): ...
# Parser feature constants
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
# Global parser registry
builder_registry = TreeBuilderRegistry()

Handle character encoding detection and entity processing.
class UnicodeDammit:
"""
Automatic character encoding detection and conversion to Unicode.
Handles encoding detection from HTML meta tags, XML declarations,
byte order marks, and statistical analysis of byte patterns.
"""
def __init__(self, markup, override_encodings=[], smart_quotes_to="xml",
is_html=True, exclude_encodings=[]): ...
@property
def unicode_markup(self): ... # Converted Unicode string
@property
def original_encoding(self): ... # Detected source encoding
class EntitySubstitution:
"""
HTML and XML entity encoding and decoding utilities.
Handles conversion between Unicode characters and HTML/XML entities.
"""
@classmethod
def substitute_html(cls, s): ... # Convert to HTML entities
@classmethod
def substitute_xml(cls, s): ... # Convert to XML entities
@classmethod
def quoted_attribute_value(cls, value): ... # Quote attribute values
class HTMLAwareEntitySubstitution(EntitySubstitution):
"""
Entity substitution that preserves script and style tag contents.
Avoids entity conversion in script and style tags where it would
break JavaScript or CSS code.
"""
cdata_containing_tags = {'script', 'style'}
preformatted_tags = {'pre'}

Usage Examples:
from bs4.builder import builder_registry, FAST, PERMISSIVE
from bs4.dammit import UnicodeDammit, EntitySubstitution
# Check available parsers
available_parsers = []
for builder in builder_registry.builders:
available_parsers.append(builder.features)
print("Available parsers:", available_parsers)
# Manual encoding detection
raw_data = b'<html><meta charset="latin1"><body>Caf\xe9</body></html>'
dammit = UnicodeDammit(raw_data)
print("Detected encoding:", dammit.original_encoding)
print("Unicode markup:", dammit.unicode_markup)
# Entity handling
text_with_entities = "R&D <division> & \"innovation\""
html_entities = EntitySubstitution.substitute_html(text_with_entities)
xml_entities = EntitySubstitution.substitute_xml(text_with_entities)
print("HTML entities:", html_entities)
print("XML entities:", xml_entities)
# Parser feature lookup
fast_parser = builder_registry.lookup(FAST)
permissive_html_parser = builder_registry.lookup(PERMISSIVE, 'html')

Install with Tessl CLI
npx tessl i tessl/pypi-beautifulsoup4