tessl/pypi-beautifulsoup4

Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Element Search

Name: tessl/pypi-beautifulsoup4
Author: tessl

Find elements using tag names, attributes, text content, CSS selectors, and custom matching functions. Beautiful Soup provides flexible search capabilities with both single and multiple result options, supporting various criteria types for precise element selection.

Capabilities

Basic Search Methods

Find elements in the parse tree using tag names, attributes, and text content.

def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs):
    """
    Find the first element matching the given criteria.
    
    Parameters:
    - name: str, list, regex, callable, or True - tag name filter
    - attrs: dict - attribute filters
    - recursive: bool - search descendants (True) or direct children only (False)
    - text: str, list, regex, callable, or True - text content filter
    - **kwargs: attribute filters as keyword arguments
    
    Returns:
    PageElement or None if no match found
    """

def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs):
    """
    Find all elements matching the given criteria.
    
    Parameters:
    - name: str, list, regex, callable, or True - tag name filter
    - attrs: dict - attribute filters  
    - recursive: bool - search descendants (True) or direct children only (False)
    - text: str, list, regex, callable, or True - text content filter
    - limit: int - maximum number of results to return
    - **kwargs: attribute filters as keyword arguments
    
    Returns:
    ResultSet (list-like) of matching PageElement instances
    """

# Convenience method - equivalent to find_all
def __call__(self, *args, **kwargs):
    """Equivalent to find_all(*args, **kwargs)"""

Usage Examples:

from bs4 import BeautifulSoup
import re

html = '''
<html>
  <body>
    <div class="container">
      <p id="intro">Introduction text</p>
      <p class="content">Main content</p>
      <a href="http://example.com">External link</a>
      <a href="/internal">Internal link</a>
    </div>
  </body>  
</html>
'''

soup = BeautifulSoup(html, 'html.parser')

# Find by tag name
first_p = soup.find('p')
all_ps = soup.find_all('p')

# Find by attributes
intro = soup.find('p', id='intro')
content = soup.find('p', class_='content')
external_link = soup.find('a', href='http://example.com')

# Find with attribute dictionary
intro = soup.find('p', attrs={'id': 'intro'})

# Find by multiple attributes
# (no results in this example, but shows syntax)
result = soup.find('p', {'class': 'content', 'id': 'special'})

# Find with regex patterns
external_links = soup.find_all('a', href=re.compile(r'^http'))
internal_links = soup.find_all('a', href=re.compile(r'^/'))

# Find with callable
def has_class(tag):
    return tag.has_attr('class')

elements_with_class = soup.find_all(has_class)

# Limit results
first_two_links = soup.find_all('a', limit=2)

# Search direct children only
container = soup.find('div', class_='container')
direct_children = container.find_all('p', recursive=False)

CSS Selector Search

Use CSS selector syntax for complex element selection.

def select(self, selector):
    """
    Find all elements matching a CSS selector.
    
    Parameters:
    - selector: str - CSS selector string
    
    Returns:
    ResultSet of matching elements
    
    Supported selectors:
    - Tag names: 'p', 'div'
    - IDs: '#myid' 
    - Classes: '.myclass'
    - Attributes: '[href]', '[href="value"]'
    - Pseudo-classes: ':first-child', ':nth-of-type(n)'
    - Combinators: 'div > p', 'div p', 'div + p', 'div ~ p'
    """

# Note: select_one() method was added in later versions of Beautiful Soup
# For version 4.3.2, use select(selector)[0] or select(selector)[:1] for first match

Usage Examples:

html = '''
<div class="container">
  <h1 id="title">Page Title</h1>
  <div class="content">
    <p class="intro">Introduction</p>
    <p>Regular paragraph</p>
    <ul>
      <li>First item</li>
      <li class="special">Second item</li>
    </ul>
  </div>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Basic selectors
title_list = soup.select('#title')  # Returns list, use [0] for first element
title = title_list[0] if title_list else None
intro_list = soup.select('.intro')
intro = intro_list[0] if intro_list else None
all_paragraphs = soup.select('p')  # All p tags

# Attribute selectors
elements_with_class = soup.select('[class]')  # Has class attribute
special_items = soup.select('[class="special"]')  # Specific class value

# Descendant combinators
content_paragraphs = soup.select('div.content p')  # p descendants of div.content
direct_children = soup.select('div.content > p')  # p direct children of div.content

# Sibling combinators  
after_intro = soup.select('p.intro + p')  # p immediately after p.intro
all_after_intro = soup.select('p.intro ~ p')  # all p siblings after p.intro

# Pseudo-classes
first_li = soup.select('li:first-child')
second_li = soup.select('li:nth-of-type(2)')
last_p = soup.select('p:last-of-type')

# Complex selectors
special_in_content = soup.select('div.content .special')
nested_selection = soup.select('div.container > div > ul > li.special')

Directional Search Methods

Search in specific directions from the current element.

def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the next element in document order matching criteria.
    
    Returns:
    PageElement or None
    """

def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all following elements in document order matching criteria.
    
    Returns:
    ResultSet of matching elements
    """

def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the previous element in document order matching criteria.
    
    Returns:
    PageElement or None
    """

def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all preceding elements in document order matching criteria.
    
    Returns:
    ResultSet of matching elements
    """

def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the next sibling element matching criteria.
    
    Returns:
    PageElement or None
    """

def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all following sibling elements matching criteria.
    
    Returns:
    ResultSet of matching elements
    """

def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the previous sibling element matching criteria.
    
    Returns:
    PageElement or None
    """

def find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all preceding sibling elements matching criteria.
    
    Returns:
    ResultSet of matching elements
    """

def find_parent(self, name=None, attrs={}, **kwargs):
    """
    Find the parent element matching criteria.
    
    Returns:
    PageElement or None
    """

def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """
    Find all ancestor elements matching criteria.
    
    Returns:
    ResultSet of matching elements
    """

Usage Examples:

html = '''
<article>
  <h1>Title</h1>
  <p>First paragraph</p>
  <div class="sidebar">Sidebar content</div>
  <p>Second paragraph</p>
  <footer>Footer</footer>
</article>
'''

soup = BeautifulSoup(html, 'html.parser')
first_p = soup.find('p')

# Find next elements
next_div = first_p.find_next('div')  # sidebar div
next_p = first_p.find_next('p')     # second paragraph
all_following = first_p.find_all_next()  # all elements after first p

# Find previous elements  
h1 = first_p.find_previous('h1')    # title
all_preceding = first_p.find_all_previous()  # h1 and title text

# Find siblings
next_sibling_div = first_p.find_next_sibling('div')  # sidebar
all_next_siblings = first_p.find_next_siblings()     # div, p, footer

# Find parents
article = first_p.find_parent('article')
all_parents = first_p.find_parents()  # article, then document root

Advanced Search Patterns

Complex search criteria using callables, regular expressions, and custom matching logic.

# Search criteria types
SearchCriteria = Union[
    str,           # Exact match
    list,          # Match any item in list
    re.Pattern,    # Regex pattern match
    callable,      # Custom function returning bool
    True,          # Match any (for text: any non-empty string)
    None           # No filter (match all)
]

Usage Examples:

import re
from bs4 import BeautifulSoup

html = '''
<div>
  <p class="intro summary">Introduction</p>
  <p class="content">Main content</p>  
  <a href="mailto:user@example.com">Email</a>
  <a href="http://example.com">Website</a>
  <span data-value="123">Data span</span>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# List matching - multiple values
paragraphs = soup.find_all('p', class_=['intro', 'content'])

# Regex matching
email_links = soup.find_all('a', href=re.compile(r'^mailto:'))
data_elements = soup.find_all(attrs={'data-value': re.compile(r'\d+')})

# Callable matching
def has_multiple_classes(tag):
    return tag.has_attr('class') and len(tag['class']) > 1

multi_class_elements = soup.find_all(has_multiple_classes)

def is_external_link(tag):
    return (tag.name == 'a' and 
            tag.has_attr('href') and 
            tag['href'].startswith('http'))

external_links = soup.find_all(is_external_link)

# Text content search
elements_with_text = soup.find_all(text=True)  # All text nodes
intro_text = soup.find_all(text=re.compile(r'Intro'))  # Text containing 'Intro'

# Complex combined criteria
def complex_criteria(tag):
    return (tag.name in ['p', 'div'] and
            tag.has_attr('class') and
            'content' in tag.get('class', []))

matching_elements = soup.find_all(complex_criteria)

Search Utilities

Helper classes and functions for search operations.

class SoupStrainer:
    """Encapsulates search criteria for filtering elements during parsing."""
    
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """
        Create search criteria for parsing or post-parse filtering.
        
        Parameters same as find() method
        """
    
    def search(self, markup):
        """Test if element matches criteria"""
    
    def search_tag(self, markup_name, markup_attrs):
        """Test if tag matches criteria"""

class ResultSet(list):
    """List subclass that tracks the search criteria used to generate results."""
    
    @property
    def source(self):
        """The SoupStrainer that generated these results"""

Usage Examples:

from bs4 import BeautifulSoup, SoupStrainer

# Use SoupStrainer to limit parsing
only_links = SoupStrainer('a')
soup = BeautifulSoup(html, 'html.parser', parse_only=only_links)

# ResultSet provides search context
results = soup.find_all('p')
print(type(results))  # <class 'bs4.element.ResultSet'>
print(results.source)  # Shows the SoupStrainer used

Backward Compatibility

Legacy search methods from BeautifulSoup 3.x.

# BeautifulSoup 3.x compatibility
def findAll(self, *args, **kwargs):  # Use find_all instead
    """Deprecated: use find_all"""

def findNext(self, *args, **kwargs):  # Use find_next instead  
    """Deprecated: use find_next"""

Install with Tessl CLI