Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Primary BeautifulSoup class for parsing HTML and XML documents with configurable parser backends and automatic encoding detection. Handles malformed markup gracefully while providing access to the complete parse tree.
The main parsing class that converts HTML/XML markup into a navigable parse tree using pluggable parser backends.
class BeautifulSoup(Tag):
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""
Parse HTML/XML markup into a navigable tree structure.
Parameters:
- markup: str, bytes, or file-like object containing HTML/XML
- features: str or list, parser features ('html.parser', 'lxml', 'html5lib', 'xml')
- builder: TreeBuilder instance (alternative to features)
- parse_only: SoupStrainer to parse only matching elements
- from_encoding: str, character encoding to assume for markup
- **kwargs: deprecated arguments from BeautifulSoup 3.x
Examples:
- BeautifulSoup(html_string, 'html.parser')
- BeautifulSoup(xml_string, 'lxml-xml')
- BeautifulSoup(markup, 'html5lib')
"""Usage Examples:
# Parse HTML with different parsers
from bs4 import BeautifulSoup
html = '<html><body><p>Hello</p></body></html>'
# Built-in HTML parser (slower but always available)
soup = BeautifulSoup(html, 'html.parser')
# lxml parser (faster, requires lxml package)
soup = BeautifulSoup(html, 'lxml')
# html5lib parser (most lenient, handles HTML5)
soup = BeautifulSoup(html, 'html5lib')
# XML parsing with lxml
xml = '<?xml version="1.0"?><root><item>data</item></root>'
soup = BeautifulSoup(xml, 'xml') # or 'lxml-xml'
# Parse from file
with open('document.html', 'r') as f:
soup = BeautifulSoup(f, 'html.parser')
# Parse with encoding specification
soup = BeautifulSoup(markup_bytes, 'html.parser', from_encoding='utf-8')

Create new tags and strings that are associated with the soup object and can be inserted into the parse tree.
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""
Create a new Tag associated with this soup.
Parameters:
- name: str, tag name
- namespace: str, XML namespace URI
- nsprefix: str, XML namespace prefix
- **attrs: tag attributes as keyword arguments
Returns:
Tag instance ready for insertion into parse tree
"""
def new_string(self, s, subclass=NavigableString):
"""
Create a new NavigableString associated with this soup.
Parameters:
- s: str, string content
- subclass: NavigableString subclass (Comment, CData, etc.)
Returns:
NavigableString instance ready for insertion
"""
def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
"""
Render the entire soup as Unicode string.
Parameters:
- pretty_print: bool - format with indentation (default: False)
- eventual_encoding: str - encoding for XML declaration if XML (default: "utf-8")
- formatter: str or function - entity formatting ("minimal", "html", "xml")
Returns:
str - Complete document as Unicode string
Note: BeautifulSoup.decode() differs from Tag.decode() in first parameter
"""Usage Examples:
from bs4 import BeautifulSoup, Comment
soup = BeautifulSoup('<html><body></body></html>', 'html.parser')
# Create new tag with attributes
new_div = soup.new_tag('div', id='main', **{'class': 'container'})  # note: new_tag does not translate class_ to class; pass 'class' explicitly
new_div.string = 'Content here'
# Create with namespace (XML)
new_item = soup.new_tag('item', namespace='http://example.com/ns')
# Create navigable string
new_text = soup.new_string('Some text content')
# Create comment
new_comment = soup.new_string('This is a comment', Comment)
# Insert into tree
soup.body.append(new_div)
soup.body.append(new_comment)

Control parsing behavior with features, filters, and encoding options.
# Parser features (can be combined)
features = [
'html.parser', # Built-in Python HTML parser
'lxml', # lxml HTML parser (fast)
'lxml-xml', # lxml XML parser
'xml', # Alias for lxml-xml; selects XML parsing mode
'html5lib', # html5lib parser (lenient)
'html', # HTML parsing mode
'fast', # Prefer faster parsers
'permissive' # Handle malformed markup
]
# Parse only specific elements
from bs4 import SoupStrainer
# Only parse div tags with class 'content'
parse_only = SoupStrainer('div', class_='content')
soup = BeautifulSoup(markup, 'html.parser', parse_only=parse_only)
# Only parse links
parse_only = SoupStrainer('a')
soup = BeautifulSoup(markup, 'html.parser', parse_only=parse_only)

Access information about the parser used and document characteristics.
# Parser properties
soup.builder # TreeBuilder instance used
soup.is_xml # Boolean, True if XML parser was used
soup.original_encoding # Detected encoding of source markup
soup.declared_html_encoding # Encoding declared in HTML meta tags
soup.contains_replacement_characters # Whether encoding conversion lost data

Handle parsing errors and invalid markup gracefully.
class FeatureNotFound(ValueError):
"""Raised when requested parser features are not available"""
class ParserRejectedMarkup(Exception):
"""Raised when parser cannot handle the provided markup"""Usage Examples:
from bs4 import BeautifulSoup, FeatureNotFound
try:
# This will fail if lxml is not installed
soup = BeautifulSoup(markup, 'lxml')
except FeatureNotFound:
# Fall back to built-in parser
soup = BeautifulSoup(markup, 'html.parser')
# Handle malformed markup
malformed_html = '<html><body><p>Unclosed paragraph<div>Mixed nesting</body></html>'
soup = BeautifulSoup(malformed_html, 'html.parser') # Parses successfully

Debug parsing issues and compare parser performance with diagnostic utilities.
def diagnose(data):
"""
Comprehensive diagnostic suite for troubleshooting parsing issues.
Tests multiple parsers on the same data and shows results and errors.
Useful for tech support and debugging parser selection problems.
Parameters:
- data: str, bytes, file-like object, or filename to parse
Prints diagnostic information including:
- Beautiful Soup version and Python version
- Available parsers and their versions
- Parse results from each parser
- Exception traces for failed parsers
"""
def lxml_trace(data, html=True, **kwargs):
"""
Print lxml parsing events to see raw parser behavior.
Shows the underlying lxml events during parsing without Beautiful Soup.
Parameters:
- data: str - markup to parse
- html: bool - use HTML parser mode (default: True)
- **kwargs: additional lxml parser options
Prints events in format: "event, tag, text"
"""
def htmlparser_trace(data):
"""
Print HTMLParser events to see raw parser behavior.
Shows the underlying HTMLParser events during parsing without Beautiful Soup.
Parameters:
- data: str - markup to parse
Prints events like: "TAG START", "DATA", "TAG END"
"""
def benchmark_parsers(num_elements=100000):
"""
Basic performance benchmark comparing available parsers.
Generates a large invalid HTML document and times parsing with
different parser backends to compare performance.
Parameters:
- num_elements: int - size of generated test document
Prints timing results for each available parser
"""
def profile(num_elements=100000, parser="lxml"):
"""
Profile Beautiful Soup parsing performance in detail.
Uses cProfile to analyze where time is spent during parsing.
Parameters:
- num_elements: int - size of generated test document
- parser: str - parser to profile ("lxml", "html.parser", etc.)
Returns profile statistics for analysis
"""Usage Examples:
from bs4.diagnose import diagnose, lxml_trace, htmlparser_trace, benchmark_parsers
# Debug parsing problems
problematic_html = '<html><body><p>Malformed HTML...'
diagnose(problematic_html)
# Compare parser performance
benchmark_parsers(50000)
# See raw parser events
lxml_trace('<p>Hello <b>world</b></p>')
htmlparser_trace('<p>Hello <em>world</em></p>')
# Profile for performance optimization
from bs4.diagnose import profile
profile(100000, 'lxml')

Advanced parser configuration and tree builder architecture for customizing parsing behavior.
class TreeBuilder:
"""
Base class for parser backends that convert markup into Beautiful Soup trees.
Used internally by BeautifulSoup to abstract different parser implementations.
"""
features = [] # List of supported feature strings
is_xml = False # Whether this parser handles XML
preserve_whitespace_tags = set() # Tags that preserve whitespace
empty_element_tags = None # Tags that can be self-closing
cdata_list_attributes = {} # Attributes containing space-separated lists
class HTMLTreeBuilder(TreeBuilder):
"""
Base class for HTML-specific tree builders.
Defines HTML-specific parsing behavior and tag characteristics.
"""
preserve_whitespace_tags = {'pre', 'textarea'}
empty_element_tags = {'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'}
class TreeBuilderRegistry:
"""
Registry for managing available parser backends.
Automatically selects appropriate parsers based on requested features.
"""
def register(self, treebuilder_class): ...
def lookup(self, *features): ...
# Parser feature constants
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
# Global parser registry
builder_registry = TreeBuilderRegistry()

Handle character encoding detection and entity processing.
class UnicodeDammit:
"""
Automatic character encoding detection and conversion to Unicode.
Handles encoding detection from HTML meta tags, XML declarations,
byte order marks, and statistical analysis of byte patterns.
"""
def __init__(self, markup, override_encodings=[], smart_quotes_to="xml",
is_html=True, exclude_encodings=[]): ...
@property
def unicode_markup(self): ... # Converted Unicode string
@property
def original_encoding(self): ... # Detected source encoding
class EntitySubstitution:
"""
HTML and XML entity encoding and decoding utilities.
Handles conversion between Unicode characters and HTML/XML entities.
"""
@classmethod
def substitute_html(cls, s): ... # Convert to HTML entities
@classmethod
def substitute_xml(cls, s): ... # Convert to XML entities
@classmethod
def quoted_attribute_value(cls, value): ... # Quote attribute values
class HTMLAwareEntitySubstitution(EntitySubstitution):
"""
Entity substitution that preserves script and style tag contents.
Avoids entity conversion in script and style tags where it would
break JavaScript or CSS code.
"""
cdata_containing_tags = {'script', 'style'}
preformatted_tags = {'pre'}

Usage Examples:
from bs4.builder import builder_registry, FAST, PERMISSIVE
from bs4.dammit import UnicodeDammit, EntitySubstitution
# Check available parsers
available_parsers = []
for builder in builder_registry.builders:
available_parsers.append(builder.features)
print("Available parsers:", available_parsers)
# Manual encoding detection
raw_data = b'<html><meta charset="latin1"><body>Caf\xe9</body></html>'
dammit = UnicodeDammit(raw_data)
print("Detected encoding:", dammit.original_encoding)
print("Unicode markup:", dammit.unicode_markup)
# Entity handling
text_with_entities = "R&D <division> & \"innovation\""
html_entities = EntitySubstitution.substitute_html(text_with_entities)
xml_entities = EntitySubstitution.substitute_xml(text_with_entities)
print("HTML entities:", html_entities)
print("XML entities:", xml_entities)
# Parser feature lookup
fast_parser = builder_registry.lookup(FAST)
permissive_html_parser = builder_registry.lookup(PERMISSIVE, 'html')

Install with Tessl CLI
npx tessl i tessl/pypi-beautifulsoup4