tessl/pypi-beautifulsoup4

Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Tree Navigation

Name: tessl/pypi-beautifulsoup4
Author: tessl

Navigate through the parse tree using parent-child relationships, sibling traversal, and document-order iteration. Beautiful Soup provides both direct property access and generator-based iteration for memory-efficient traversal of large documents.

Capabilities

Parent-Child Navigation

Navigate up and down the parse tree hierarchy using parent and children relationships.

@property
def parent(self):
    """
    The parent element of this element, or None if this is the root.

    Returns:
    PageElement or None
    """

@property  
def contents(self):
    """
    List of direct children of this element.

    Text between tags counts as a child, so whitespace-only strings from
    indented markup appear in the list alongside the child tags.

    Returns:
    list of PageElement instances
    """

@property
def children(self):
    """
    Generator yielding direct children of this element.

    Like contents, this yields text nodes as well as tags; filter on
    the child's name to keep only tags.

    Yields:
    PageElement instances
    """

@property
def descendants(self):
    """
    Generator yielding all descendant elements in document order.

    The element itself is not yielded, only what is nested inside it.

    Yields:
    PageElement instances (tags and strings)
    """

@property
def parents(self):
    """
    Generator yielding all parent elements up to the document root.

    Ancestors are produced nearest first (immediate parent, then its
    parent, and so on).

    Yields:
    PageElement instances
    """

Usage Examples:

from bs4 import BeautifulSoup

html = '''
<html>
  <body>
    <div class="container">
      <p>First paragraph</p>
      <p>Second paragraph</p>
    </div>
  </body>
</html>
'''

soup = BeautifulSoup(html, 'html.parser')
div = soup.find('div')
first_p = soup.find('p')

# Parent access
print(div.parent.name)  # 'body'
print(first_p.parent.name)  # 'div'

# Children access
print(len(div.contents))  # 5 (includes whitespace text nodes)
print([child.name for child in div.children if child.name])  # ['p', 'p']

# Descendants - everything nested below the div (the div itself is excluded)
for element in div.descendants:
    if hasattr(element, 'name') and element.name:
        print(element.name)  # p, p

# Parents - nearest ancestor first, ending with the BeautifulSoup object,
# whose name is '[document]'
for parent in first_p.parents:
    if parent.name:
        print(parent.name)  # div, body, html, [document]

Sibling Navigation

Navigate horizontally through elements at the same level in the parse tree.

@property
def next_sibling(self):
    """
    The next sibling element, or None if this is the last child.

    The sibling may be a text node rather than a tag when the markup
    contains text or whitespace between tags.

    Returns:
    PageElement or None
    """

@property
def previous_sibling(self):
    """
    The previous sibling element, or None if this is the first child.

    The sibling may be a text node rather than a tag when the markup
    contains text or whitespace between tags.

    Returns:
    PageElement or None
    """

@property
def next_siblings(self):
    """
    Generator yielding all following sibling elements.

    Siblings are produced in document order, starting with the one
    immediately after this element.

    Yields:
    PageElement instances
    """

@property
def previous_siblings(self):
    """
    Generator yielding all preceding sibling elements in reverse order.

    The closest preceding sibling is produced first.

    Yields:
    PageElement instances
    """

Usage Examples:

html = '<div><p>One</p><p>Two</p><p>Three</p></div>'
soup = BeautifulSoup(html, 'html.parser')

first_p = soup.find('p')
# This markup has no whitespace between tags, so adjacent <p> tags are
# immediate siblings. Pretty-printed markup would place a whitespace text
# node between them, requiring an extra .next_sibling hop.
second_p = first_p.next_sibling
third_p = soup.find_all('p')[2]

# Direct sibling access
print(second_p.previous_sibling.string)  # 'One'
print(second_p.next_sibling.string)  # 'Three'

# Iterate through siblings
for sibling in first_p.next_siblings:
    if hasattr(sibling, 'name') and sibling.name == 'p':
        print(sibling.string)  # 'Two', 'Three'

# previous_siblings walks backwards, closest sibling first
for sibling in third_p.previous_siblings:
    if hasattr(sibling, 'name') and sibling.name == 'p':
        print(sibling.string)  # 'Two', 'One'

Document-Order Navigation

Navigate through elements in the order they appear in the source document.

@property
def next_element(self):
    """
    The next element in document order, or None if this is the last.

    Document order descends into an element before moving past it, so
    for a tag with contents the next element is its first child.

    Returns:
    PageElement or None
    """

@property
def previous_element(self):
    """
    The previous element in document order, or None if this is the first.

    Returns:
    PageElement or None
    """

@property
def next_elements(self):
    """
    Generator yielding all following elements in document order.

    Iteration begins at this element's next_element, so a tag's own
    contents are yielded before anything that follows the tag.

    Yields:
    PageElement instances (tags and strings)
    """

@property
def previous_elements(self):
    """
    Generator yielding all preceding elements in reverse document order.

    Yields:
    PageElement instances (tags and strings)
    """

Usage Examples:

html = '<div><p>Para <em>emphasis</em> text</p><span>After</span></div>'
soup = BeautifulSoup(html, 'html.parser')

p_tag = soup.find('p')
em_tag = soup.find('em')

# Document order navigation: walk from <p> to the end of the document
current = p_tag
while current:
    if hasattr(current, 'name') and current.name:
        print(f"Tag: {current.name}")
    elif isinstance(current, str) and current.strip():
        print(f"Text: {current.strip()}")
    current = current.next_element

# Output: Tag: p, Text: Para, Tag: em, Text: emphasis, Text: text, Tag: span, Text: After

# Find all text in document order from a starting point.
# next_elements starts with the <em>'s own string, and the loop stops when
# it reaches <span>, before the span's text is visited.
text_content = []
for element in em_tag.next_elements:
    if isinstance(element, str) and element.strip():
        text_content.append(element.strip())
    if hasattr(element, 'name') and element.name == 'span':
        break
print(text_content)  # ['emphasis', 'text']

Navigation Utilities

Helper methods for common navigation patterns.

def index(self, element):
    """
    Get the index of a child element.

    The index is the element's position in this tag's contents list,
    so text nodes between tags are counted too.

    Parameters:
    - element: PageElement to find

    Returns:
    int, index of element in contents list

    Raises:
    ValueError if element is not a child
    """

@property
def is_empty_element(self):
    """
    True if this tag has no contents and can be rendered as self-closing.

    Both conditions must hold: a tag with no contents that cannot be
    rendered as self-closing is not an empty element.

    Returns:
    bool
    """

Usage Examples:

html = '<ul><li>First</li><li>Second</li><li>Third</li></ul>'
soup = BeautifulSoup(html, 'html.parser')

ul = soup.find('ul')
second_li = soup.find_all('li')[1]

# Get child index (no whitespace between tags, so <li> tags are the only children)
print(ul.index(second_li))  # 1

# Check if element is empty.
# is_empty_element needs BOTH no contents AND a tag the parser treats as
# void (self-closing), such as <br> in HTML.
empty_br = soup.new_tag('br')
print(empty_br.is_empty_element)  # True

# A <div> is not a void element in HTML, so even without contents it is
# rendered as <div></div> and is not an "empty element".
empty_div = soup.new_tag('div')
print(empty_div.is_empty_element)  # False

div_with_content = soup.new_tag('div')
div_with_content.string = 'Content'
print(div_with_content.is_empty_element)  # False

Backward Compatibility

Legacy camelCase navigation names from BeautifulSoup 3.x remain available as aliases for compatibility; prefer the snake_case names in new code.

# BeautifulSoup 3.x compatibility aliases
@property
def nextSibling(self):  # Use next_sibling instead
    """Deprecated: use next_sibling"""

@property  
def previousSibling(self):  # Use previous_sibling instead
    """Deprecated: use previous_sibling"""

# The find* aliases are methods, not properties: they must be called,
# and they accept the same arguments as their snake_case replacements.
def findNextSibling(self, *args, **kwargs):  # Use find_next_sibling instead
    """
    Deprecated: use find_next_sibling

    Parameters:
    - *args, **kwargs: same arguments as find_next_sibling

    Returns:
    Same result as find_next_sibling
    """

def findPreviousSibling(self, *args, **kwargs):  # Use find_previous_sibling instead
    """
    Deprecated: use find_previous_sibling

    Parameters:
    - *args, **kwargs: same arguments as find_previous_sibling

    Returns:
    Same result as find_previous_sibling
    """

Install with Tessl CLI