tessl/pypi-beautifulsoup4

Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Output and Serialization

Name: tessl/pypi-beautifulsoup4
Author: tessl

Render parse tree elements as formatted HTML/XML with encoding control, pretty-printing, and entity substitution options. Beautiful Soup provides flexible output methods for converting parse trees back to markup strings with various formatting and encoding options.

Capabilities

Basic Output Methods

Convert elements to string representations with different encoding and formatting options.

def __str__(self):
    """
    Default string representation of the element's markup.

    The result is a text string (str), not encoded bytes; call encode()
    when bytes in a specific encoding such as UTF-8 are needed.

    Returns:
    str - HTML/XML markup
    """

def __unicode__(self):
    """
    Unicode string representation (Python 2 compatibility).

    Returns:
    unicode - HTML/XML markup (``unicode`` is the Python 2 text type;
    on Python 3 the text type is ``str``)
    """

def encode(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"):
    """
    Render element to bytes with specified encoding.

    Parameters:
    - encoding: str - character encoding (default: "utf-8")
    - indent_level: int or None - indentation level for pretty printing
    - formatter: str, None or function - entity formatting; the standard
      names are "minimal", "html" and "html5", None disables entity
      substitution, and a function is used as a custom substitution
    - errors: str - encoding error handling ("xmlcharrefreplace", "strict", etc.)

    Returns:
    bytes - encoded markup
    """

def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
    """
    Render element to Unicode string.

    The result is always a str; eventual_encoding does not encode the
    output, it only names the encoding written into the XML declaration.

    Parameters:
    - indent_level: int or None - indentation level for pretty printing
    - eventual_encoding: str - encoding for XML declaration (XML only)
    - formatter: str or function - entity formatting

    Returns:
    str - Unicode markup
    """

Usage Examples:

from bs4 import BeautifulSoup

html = '<div><p>Hello <em>world</em>!</p></div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')

# Basic string conversion
print(str(div))  # <div><p>Hello <em>world</em>!</p></div>

# Encode to bytes
utf8_bytes = div.encode('utf-8')
print(type(utf8_bytes))  # <class 'bytes'>

latin1_bytes = div.encode('latin-1')
# errors='xmlcharrefreplace' is already the default; it is spelled out here for clarity
ascii_bytes = div.encode('ascii', errors='xmlcharrefreplace')

# Decode to Unicode string
unicode_str = div.decode()
print(type(unicode_str))  # <class 'str'>

# With different encodings in XML
xml = '<?xml version="1.0"?><root><item>content</item></root>'
xml_soup = BeautifulSoup(xml, 'xml')
# eventual_encoding only sets the name in the declaration; the result is still a str
xml_output = xml_soup.decode(eventual_encoding='iso-8859-1')
print(xml_output)  # Includes encoding declaration

Pretty Printing

Format output with indentation and line breaks for human readability.

def prettify(self, encoding=None, formatter="minimal"):
    """
    Render with pretty formatting (indentation and line breaks).

    Parameters:
    - encoding: str or None - if specified, return bytes; if None, return str
    - formatter: str, None or function - entity formatting (same values
      as the formatter argument of encode() and decode())

    Returns:
    str or bytes - formatted markup (bytes only when encoding is given)
    """

# Pretty printing uses these rules:
# - Each tag gets its own line
# - Child elements are indented
# - Each text string is placed on its own line, indented under its parent tag (text is not re-wrapped)
# - Empty tags use minimal formatting

Usage Examples:

html = '<html><head><title>Page</title></head><body><div class="content"><p>Paragraph 1</p><p>Paragraph 2</p></div></body></html>'
soup = BeautifulSoup(html, 'html.parser')

# Pretty print as string
pretty_str = soup.prettify()
print(pretty_str)
# Output (each nesting level is indented by one space):
# <html>
#  <head>
#   <title>
#    Page
#   </title>
#  </head>
#  <body>
#   <div class="content">
#    <p>
#     Paragraph 1
#    </p>
#    <p>
#     Paragraph 2
#    </p>
#   </div>
#  </body>
# </html>

# Pretty print as bytes
pretty_bytes = soup.prettify(encoding='utf-8')
print(type(pretty_bytes))  # <class 'bytes'>

# Pretty print specific elements (indentation restarts at the chosen element)
div = soup.find('div')
print(div.prettify())
# <div class="content">
#  <p>
#   Paragraph 1
#  </p>
#  <p>
#   Paragraph 2
#  </p>
# </div>

Content-Only Output

Render just the contents of elements without the container tags.

def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
    """
    Render only the contents (children) as Unicode string.

    The element's own opening and closing tags are left out of the result.

    Parameters:
    - indent_level: int or None - indentation level
    - eventual_encoding: str - encoding for XML declaration
    - formatter: str or function - entity formatting

    Returns:
    str - contents as Unicode markup
    """

def encode_contents(self, indent_level=None, encoding="utf-8", formatter="minimal"):
    """
    Render only the contents (children) as bytes.

    Note the parameter order: unlike encode(), the first positional
    parameter is indent_level, not encoding, and there is no errors
    parameter. Pass the encoding by keyword to avoid mistakes, e.g.
    encode_contents(encoding="latin-1").

    Parameters:
    - indent_level: int or None - indentation level
    - encoding: str - character encoding (default: "utf-8")
    - formatter: str, None or function - entity formatting

    Returns:
    bytes - contents as encoded markup
    """

Usage Examples:

html = '<div class="wrapper"><p>Content 1</p><p>Content 2</p></div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')

# Full element output
print(div.decode())
# <div class="wrapper"><p>Content 1</p><p>Content 2</p></div>

# Contents only (without wrapper div)
print(div.decode_contents())
# <p>Content 1</p><p>Content 2</p>

# Contents as bytes.
# Pass the encoding by keyword: the first positional parameter of
# encode_contents() is indent_level, so encode_contents('utf-8') would
# hand the encoding name to the wrong parameter.
contents_bytes = div.encode_contents(encoding='utf-8')
print(contents_bytes.decode('utf-8'))
# <p>Content 1</p><p>Content 2</p>

# Useful for template replacement
template = '<html><body>{content}</body></html>'
content = div.decode_contents()
final_html = template.format(content=content)

Entity Formatting

Control how special characters and entities are handled in output.

# Formatter options
# Values accepted by the formatter argument. The string keys are the
# standard formatter names; there is no formatter named "xml".
formatters = {
    "minimal": "Escape only <, >, & and quotes in attributes",
    "html": "Use HTML entity names where possible",
    "html5": "Like \"html\", but void tags omit the closing slash (<br> instead of <br/>)",
    None: "No entity substitution",
    callable: "Custom formatter function",
}

# Custom formatter signature
def custom_formatter(string):
    """
    Custom entity substitution function.

    Pass a function with this signature as the formatter argument of an
    output method (for example encode(formatter=custom_formatter)); it
    receives a piece of text and returns the text to write in its place.

    Parameters:
    - string: str - string to format

    Returns:
    str - formatted string
    """

Usage Examples:

from bs4 import BeautifulSoup
from bs4.dammit import EntitySubstitution

html = '<div title="Ben & Jerry\'s">Price: $5 < $10</div>'
soup = BeautifulSoup(html, 'html.parser')
div = soup.find('div')

# Minimal formatting (default)
print(div.encode(formatter="minimal").decode())
# <div title="Ben &amp; Jerry's">Price: $5 &lt; $10</div>

# HTML entity formatting
print(div.encode(formatter="html").decode())
# Uses HTML entity names where available

# HTML5 formatting: like "html", but void tags such as <br> are written
# without a closing slash. Note there is no built-in formatter named
# "xml"; passing an unregistered name is an error rather than a fallback.
print(div.encode(formatter="html5").decode())
# <div title="Ben &amp; Jerry's">Price: $5 &lt; $10</div>

# No entity substitution
print(div.encode(formatter=None).decode())
# <div title="Ben & Jerry's">Price: $5 < $10</div>

# Custom formatter
def quote_formatter(s):
    return s.replace('"', '&quot;').replace("'", '&#x27;')

print(div.encode(formatter=quote_formatter).decode())

# Using EntitySubstitution directly
formatted = EntitySubstitution.substitute_html('Ben & Jerry\'s <script>')
print(formatted)  # Ben &amp; Jerry's &lt;script&gt;

Encoding Handling

Control character encoding in output with proper error handling.

# Encoding options
encoding_options = [
    "utf-8",      # Unicode encoding (default)
    "ascii",      # ASCII; unencodable characters fall back to character references under the default errors mode
    "latin-1",    # ISO 8859-1
    "cp1252",     # Windows encoding
    None          # For prettify(encoding=None): return a str instead of bytes
]

# Error handling modes (the errors argument of encode())
error_modes = [
    "xmlcharrefreplace",  # Replace with XML character references such as &#233; (default)
    "strict",             # Raise exception on encoding errors
    "ignore",             # Skip unencodable characters
    "replace"             # Replace with ? character
]

Usage Examples:

html = '<div>Unicode: café, naïve, résumé</div>'
soup = BeautifulSoup(html, 'html.parser')
div = soup.find('div')

# UTF-8 encoding (handles all Unicode)
utf8 = div.encode('utf-8')
print(utf8.decode('utf-8'))  # café, naïve, résumé

# ASCII with XML character references
ascii_xml = div.encode('ascii', errors='xmlcharrefreplace')
print(ascii_xml.decode('ascii'))  # caf&#233;, na&#239;ve, r&#233;sum&#233;

# Latin-1 (handles some accented characters)
# NOTE: with the default errors="xmlcharrefreplace", unencodable characters
# are replaced by character references instead of raising, so the except
# branch below is only reachable when errors="strict" is passed.
try:
    latin1 = div.encode('latin-1')
    print(latin1.decode('latin-1'))  # café, naïve, résumé
except UnicodeEncodeError:
    print("Some characters not encodable in Latin-1")

# Handle encoding errors
ascii_ignore = div.encode('ascii', errors='ignore')
print(ascii_ignore.decode('ascii'))  # caf, nave, rsum

ascii_replace = div.encode('ascii', errors='replace')  
print(ascii_replace.decode('ascii'))  # caf?, na?ve, r?sum?

XML Declaration Handling

Control XML declaration output for XML documents.

# XML-specific output features
def decode(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING):
    """
    For XML documents, includes <?xml version="1.0" encoding="..."?> declaration.

    The declaration only names eventual_encoding; the returned value is
    still a text string, not bytes in that encoding.

    Parameters:
    - eventual_encoding: str - encoding to declare in XML header

    Returns:
    str - markup, preceded by the XML declaration for XML documents
    """

# XML declaration is automatically added for:
# - BeautifulSoup objects parsed with XML parser
# - When is_xml property is True

Usage Examples:

xml = '<root><item>content</item></root>'

# Parse as XML
xml_soup = BeautifulSoup(xml, 'xml')
print(xml_soup.decode())
# <?xml version="1.0" encoding="utf-8"?>
# <root><item>content</item></root>

# Specify encoding in declaration (only the declared name changes; the result is still a str)
print(xml_soup.decode(eventual_encoding='iso-8859-1'))
# <?xml version="1.0" encoding="iso-8859-1"?>
# <root><item>content</item></root>

# Parse as HTML (no XML declaration)
html_soup = BeautifulSoup(xml, 'html.parser')
print(html_soup.decode())
# <root><item>content</item></root>

Output Utilities

Helper functions and patterns for common output scenarios.

# Common output patterns

def save_to_file(soup, filename, encoding='utf-8'):
    """Save soup to file with proper encoding.

    The markup is rendered with soup.encode(encoding) and written in
    binary mode. Letting the soup do the encoding means characters the
    target encoding cannot represent go through encode()'s own error
    handling instead of making the file write fail with
    UnicodeEncodeError, and the encoding used for rendering is the same
    one that ends up on disk.

    Parameters:
    - soup: object providing encode(encoding) -> bytes
    - filename: str - path of the file to create or overwrite
    - encoding: str - character encoding of the written file (default: 'utf-8')
    """
    with open(filename, 'wb') as f:
        f.write(soup.encode(encoding))

def get_text_content(element, separator=' '):
    """Return the element's whitespace-stripped strings joined by separator.

    Parameters:
    - element: object exposing a stripped_strings iterable of str
    - separator: str - text placed between consecutive strings (default: one space)

    Returns:
    str - the joined text
    """
    fragments = element.stripped_strings
    return separator.join(fragments)

def minify_html(soup):
    """Remove extra whitespace from HTML.

    Every run of whitespace (spaces, tabs, newlines) in str(soup) is
    collapsed to a single space and leading/trailing whitespace is
    dropped. A newline is replaced by a space rather than deleted, so
    words separated only by a line break are not glued together, and
    runs of any length are collapsed.

    This is a purely textual transformation: it also collapses
    whitespace inside whitespace-sensitive content such as <pre> or
    <textarea>, so do not use it on documents that rely on that.

    Parameters:
    - soup: object whose str() is the markup to minify

    Returns:
    str - markup with whitespace runs collapsed
    """
    # str.split() with no arguments splits on any whitespace run and
    # discards empty pieces, which collapses and trims in one step.
    return ' '.join(str(soup).split())

Usage Examples:

import os

html = '''
<html>
  <head>
    <title>Sample Page</title>
  </head>
  <body>
    <h1>Main Title</h1>
    <p>Content paragraph with <em>emphasis</em>.</p>
  </body>
</html>
'''

soup = BeautifulSoup(html, 'html.parser')

# Track every file this example creates so the clean-up step at the end
# removes all of them, including the per-encoding outputs.
written_files = []

# Save formatted HTML to file
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())
written_files.append('output.html')

# Save minified HTML: collapse every whitespace run to one space.
# (Deleting newlines outright would glue together words that are
# separated only by a line break.)
minified = ' '.join(str(soup).split())
with open('minified.html', 'w', encoding='utf-8') as f:
    f.write(minified)
written_files.append('minified.html')

# Extract and save text content only
text_content = soup.get_text('\n', strip=True)
with open('content.txt', 'w', encoding='utf-8') as f:
    f.write(text_content)
written_files.append('content.txt')

# Convert to different encodings
for encoding in ['utf-8', 'latin-1', 'ascii']:
    filename = f'output_{encoding}.html'
    try:
        with open(filename, 'wb') as f:
            f.write(soup.encode(encoding))
        print(f"Saved {filename}")
    except UnicodeEncodeError as e:
        print(f"Cannot encode as {encoding}: {e}")
    # Recorded even on failure: open() may already have created the file
    written_files.append(filename)

# Clean up files
for path in written_files:
    if os.path.exists(path):
        os.remove(path)

Install with Tessl CLI