CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-beautifulsoup4

Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/modification.md

Tree Modification

Modify the parse tree by inserting, removing, or replacing elements and their attributes, with automatic relationship maintenance. Beautiful Soup ensures tree integrity when making changes and provides memory management through explicit cleanup methods.

Capabilities

Element Removal

Remove elements from the parse tree with optional memory cleanup.

def extract(self):
    """
    Remove this element from the tree and return it.

    The element keeps its own subtree and can be reinserted elsewhere.
    All parent/sibling relationships in the surrounding tree are
    updated automatically so the tree stays consistent.

    Returns:
    The extracted element (self)
    """

def decompose(self):
    """
    Recursively destroy this element and its children to free memory.

    Use when you're done with an element and want to reclaim memory.
    The element and its entire subtree become unusable after
    decomposition; do not access them again.

    Returns:
    None
    """

def clear(self, decompose=False):
    """
    Remove all children from this element.

    The element itself stays in the tree; only its contents are removed.

    Parameters:
    - decompose: bool - if True, decompose children to free memory
      instead of merely detaching them (default False)

    Returns:
    None
    """

Usage Examples:

from bs4 import BeautifulSoup

html = '''
<div class="container">
  <p>Keep this paragraph</p>
  <div class="unwanted">Remove this div</div>
  <p>Keep this paragraph too</p>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Extract element for reuse elsewhere
unwanted = soup.find('div', class_='unwanted')
extracted = unwanted.extract()  # Removed from tree but still usable

# Can insert extracted element elsewhere
new_location = soup.new_tag('section')
new_location.append(extracted)

# Decompose to free memory permanently
ad_elements = soup.find_all('div', class_='advertisement')
for ad in ad_elements:
    ad.decompose()  # Memory freed, element unusable

# Clear all children
container = soup.find('div', class_='container')
container.clear()  # Now empty div

# Clear with memory cleanup. The container is already empty after the
# clear() above, so give it a child again before demonstrating
# decompose=True (otherwise the call would be a no-op).
temp_child = soup.new_tag('p')
temp_child.string = 'Temporary'
container.append(temp_child)
container.clear(decompose=True)  # Children decomposed as they are removed

Element Replacement

Replace elements in the parse tree with new content.

def replace_with(self, *args):
    """
    Replace this element with one or more new elements.

    This element is removed from the tree and the replacements take
    its position; it must currently have a parent.

    Parameters:
    - *args: PageElement instances or strings to replace with

    Returns:
    The replaced element (self)
    """

def wrap(self, wrap_inside):
    """
    Wrap this element inside another element.

    The wrapper takes this element's place in the tree, and this
    element becomes the wrapper's child.

    Parameters:
    - wrap_inside: PageElement (usually Tag) to wrap this element in

    Returns:
    The wrapping element
    """

def unwrap(self):
    """
    Replace this element with its children.

    Useful for removing a wrapper tag but keeping its contents.
    Only works on Tag elements with children.

    Returns:
    The unwrapped element (self)
    """

Usage Examples:

html = '''
<div>
  <p>Old paragraph</p>
  <span>Text to wrap</span>
  <em>Remove emphasis but keep text</em>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Replace element with new content
old_p = soup.find('p')
new_p = soup.new_tag('p', class_='updated')
new_p.string = 'New paragraph content'
old_p.replace_with(new_p)

# Replace with multiple elements
span = soup.find('span')
new_strong = soup.new_tag('strong')
new_strong.string = 'Bold text'
new_text = soup.new_string(' and plain text')
span.replace_with(new_strong, new_text)

# Wrap element in new tag. Note that the original <span> was replaced
# above, so soup.find('span') would return None at this point; wrap the
# <strong> element that took its place instead.
wrapper_div = soup.new_tag('div', class_='wrapper')
new_strong.wrap(wrapper_div)

# Unwrap element (remove tag but keep contents)
em = soup.find('em')
if em:
    em.unwrap()  # <em>Remove emphasis but keep text</em> becomes just the text

Element Insertion

Insert new elements at specific positions in the parse tree.

def insert(self, position, new_child):
    """
    Insert a new child at the specified position.

    Parameters:
    - position: int - index position among this element's children
      (0 = first child)
    - new_child: PageElement or string to insert

    Returns:
    None
    """

def insert_before(self, *args):
    """
    Insert one or more elements immediately before this element.

    This element must have a parent for the insertion point to exist.

    Parameters:
    - *args: PageElement instances or strings to insert

    Returns:
    None
    """

def insert_after(self, *args):
    """
    Insert one or more elements immediately after this element.

    This element must have a parent for the insertion point to exist.

    Parameters:
    - *args: PageElement instances or strings to insert

    Returns:
    None
    """

def append(self, tag):
    """
    Add an element as the last child of this element.

    Parameters:
    - tag: PageElement or string to append

    Returns:
    None
    """

def extend(self, tags):
    """
    Add multiple elements as children of this element.

    Each item is appended in order, mirroring list.extend().

    Parameters:
    - tags: iterable of PageElement instances or strings

    Returns:
    None
    """

def index(self, element):
    """
    Find the index of a child element by identity.

    Avoids issues with tag.contents.index(element) when there are
    equal elements, using identity comparison instead of value.

    Parameters:
    - element: PageElement - child element to find

    Returns:
    int - index of element in contents list

    Raises:
    ValueError if element is not a child of this element
    """

Usage Examples:

# Build a minimal tree to demonstrate the insertion APIs.
html = '<div><p>Existing paragraph</p></div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')
existing_p = soup.find('p')

# Insert at specific position
new_h1 = soup.new_tag('h1')
new_h1.string = 'Title'
div.insert(0, new_h1)  # Insert as first child

# Insert before/after existing elements
before_p = soup.new_tag('p', class_='intro')
before_p.string = 'Introduction'
existing_p.insert_before(before_p)

after_p = soup.new_tag('p', class_='conclusion')
after_p.string = 'Conclusion'
existing_p.insert_after(after_p)

# Append to end
footer = soup.new_tag('footer')
footer.string = 'Footer content'
div.append(footer)

# Extend with multiple elements
new_elements = []
for i in range(3):
    item = soup.new_tag('span', class_='item')
    item.string = f'Item {i+1}'
    new_elements.append(item)

div.extend(new_elements)

# Insert text content (plain strings become NavigableString nodes)
div.insert(1, 'Some plain text')
existing_p.insert_after('Text after paragraph')

# Find element index
p_index = div.index(existing_p)
print(f"Paragraph is at index {p_index}")

# Safer than contents.index() for duplicate elements
h1_index = div.index(new_h1)  # Uses identity, not equality

Attribute Modification

Modify element attributes using dictionary-like operations.

def __getitem__(self, key):
    """Get attribute value like a dictionary (raises KeyError if absent)"""

def __setitem__(self, key, value):
    """Set attribute value like a dictionary"""

def __delitem__(self, key):
    """Delete attribute like a dictionary"""

def get(self, key, default=None):
    """
    Get attribute value with optional default.

    Unlike __getitem__, never raises for a missing attribute.

    Parameters:
    - key: str - attribute name
    - default: value to return if attribute doesn't exist

    Returns:
    Attribute value or default
    """

def has_attr(self, key):
    """
    Check if element has the specified attribute.

    Parameters:
    - key: str - attribute name

    Returns:
    bool
    """

@property
def attrs(self):
    """
    Dictionary of all attributes.

    Returns:
    dict - can be modified directly; changes are reflected in the tag
    """

Usage Examples:

html = '<div class="container" id="main">Content</div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')

# Get attributes
print(div['class'])  # ['container'] - class is multi-valued, parsed as a list
print(div['id'])     # 'main'
print(div.get('data-value', 'default'))  # 'default'

# Set attributes
div['class'] = ['container', 'updated']
div['data-value'] = '123'
div['title'] = 'Tooltip text'

# Delete attributes
del div['id']

# Check attribute existence
if div.has_attr('class'):
    print('Has class attribute')

# Modify attrs dictionary directly
div.attrs['style'] = 'color: red;'
div.attrs.update({'data-count': '5', 'role': 'main'})

# Class attribute: list values are produced when *parsing* markup.
# NOTE(review): assignment via __setitem__ stores the value as given
# (a plain string stays a string) - confirm against the bs4 version in use.
div['class'] = 'single-class'    # Stored as the string 'single-class'
div['class'] = ['a', 'b', 'c']   # Stays as list

Content Modification

Modify the text content and children of elements.

@property
def string(self):
    """
    Get/set the string content of this element.

    Get: Returns single NavigableString if element has only one string child,
         otherwise None. (NOTE(review): bs4 also recurses into a sole child
         tag that itself has a .string - confirm for the version in use.)
    Set: Replaces all children with a single NavigableString

    Returns:
    NavigableString or None
    """

@string.setter
def string(self, value):
    """Replace all children with a single string"""

def get_text(self, separator="", strip=False, types=(NavigableString,)):
    """
    Extract all text content from this element and its descendants.

    Parameters:
    - separator: str - join multiple text pieces with this separator
    - strip: bool - strip whitespace from each text piece
    - types: tuple - which NavigableString types to include

    Returns:
    str - concatenated text content
    """

Usage Examples:

html = '''
<div>
  <p>Original text</p>
  <span>More <em>emphasis</em> text</span>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Modify string content
p = soup.find('p')
p.string = 'Updated text'  # Replaces all children

# Get text content
span = soup.find('span')
print(span.get_text())  # 'More emphasis text'
print(span.get_text(' | '))  # 'More  | emphasis |  text' - whitespace in the pieces is kept
print(span.get_text(strip=True))  # Strips whitespace from each piece before joining

# Work with mixed content
div = soup.find('div')
all_text = div.get_text(' ')
print(all_text)  # All text from div and descendants

# Preserve only certain text types
from bs4 import NavigableString, Comment
text_only = div.get_text(types=(NavigableString,))  # Intended to exclude comments; verify bs4's type filtering

Memory Management

Control memory usage when working with large documents.

def decompose(self):
    """
    Recursively destroy this element and free memory.

    After decomposition, the element and its children become
    unusable. Use when processing large documents to prevent
    memory accumulation.

    Returns:
    None
    """

# Context manager for automatic cleanup
class SoupProcessor:
    """Parse markup on construction and decompose the whole tree on exit."""

    def __init__(self, markup, parser):
        # Parse eagerly so the tree is ready when the with-block starts.
        self.soup = BeautifulSoup(markup, parser)
    
    def __enter__(self):
        # Hand the parsed soup to the with-block.
        return self.soup
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always free the tree; returning None lets any exception propagate.
        self.soup.decompose()

Usage Examples:

# Manual memory management
# Manual memory management.
# (get_large_html_document, process_section, extract_data and html_content
# are placeholders for application code - not part of Beautiful Soup.)
large_html = get_large_html_document()
soup = BeautifulSoup(large_html, 'lxml')

# Process elements and clean up as you go
# (find_all returns a list, so decomposing inside the loop is safe)
for section in soup.find_all('section', class_='processed'):
    process_section(section)
    section.decompose()  # Free memory immediately

# Clean up entire soup when done
soup.decompose()

# Context manager pattern for automatic cleanup
with SoupProcessor(html_content, 'html.parser') as soup:
    results = extract_data(soup)
    # soup automatically decomposed when exiting context

Install with Tessl CLI

npx tessl i tessl/pypi-beautifulsoup4

docs

content.md

index.md

modification.md

navigation.md

output.md

parsing.md

search.md

tile.json