A modern CSS selector implementation for Beautiful Soup.
npx @tessl/cli install tessl/pypi-soupsieve@1.9.0
A modern CSS selector implementation for Beautiful Soup 4, providing comprehensive CSS selector support from CSS Level 1 through CSS Level 4 drafts. Soupsieve serves as the default selector engine for Beautiful Soup 4.7.0+ and can be used independently for sophisticated CSS-based element selection from HTML/XML documents.
pip install soupsieve
import soupsieve
Alternative import for shorter syntax:
import soupsieve as sv
Specific functions and classes can be imported directly:
from soupsieve import compile, select, match, SoupSieve, SelectorSyntaxError
import soupsieve as sv
from bs4 import BeautifulSoup
# Create a soup object from HTML
html = """
<div class="container">
<p id="intro">Introduction paragraph</p>
<div class="content">
<p class="highlight">Important content</p>
<span>Additional info</span>
</div>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
# Basic selection - find all paragraphs
paragraphs = sv.select('p', soup)
print(f"Found {len(paragraphs)} paragraphs")
# Select with class
highlighted = sv.select('.highlight', soup)
if highlighted:
print(f"Highlighted text: {highlighted[0].get_text()}")
# Select first match only
first_p = sv.select_one('p', soup)
print(f"First paragraph: {first_p.get_text()}")
# Test if element matches selector
intro = soup.find(id='intro')
if sv.match('#intro', intro):
print("Element matches #intro selector")
# Compiled selectors for reuse
compiled = sv.compile('div.content > *')
children = compiled.select(soup)
print(f"Found {len(children)} direct children of .content")
Soupsieve's architecture centers around CSS parsing and matching.
The library automatically handles HTML vs XML differences and provides namespace support for XML documents.
Core functions for selecting elements using CSS selectors. These provide the primary interface for CSS-based element selection.
def select(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
"""
Select all matching elements under the specified tag.
Parameters:
- select: str, CSS selector string
- tag: BeautifulSoup Tag or document to search within
- namespaces: dict, optional namespace mappings for XML
- limit: int, maximum results to return (0 = unlimited)
- flags: int, selection flags for advanced options
- **kwargs: additional options including 'custom' selectors
Returns:
List of matching BeautifulSoup Tag objects
"""
def select_one(select, tag, namespaces=None, flags=0, **kwargs):
"""
Select the first matching element.
Parameters:
- select: str, CSS selector string
- tag: BeautifulSoup Tag or document to search within
- namespaces: dict, optional namespace mappings for XML
- flags: int, selection flags for advanced options
- **kwargs: additional options including 'custom' selectors
Returns:
First matching BeautifulSoup Tag object or None
"""
def iselect(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
"""
Iterate over matching elements (generator).
Parameters:
- select: str, CSS selector string
- tag: BeautifulSoup Tag or document to search within
- namespaces: dict, optional namespace mappings for XML
- limit: int, maximum results to yield (0 = unlimited)
- flags: int, selection flags for advanced options
- **kwargs: additional options including 'custom' selectors
Yields:
BeautifulSoup Tag objects that match the selector
"""
Functions for testing individual elements and filtering collections.
def match(select, tag, namespaces=None, flags=0, **kwargs):
"""
Test if a tag matches the CSS selector.
Parameters:
- select: str, CSS selector string
- tag: BeautifulSoup Tag to test
- namespaces: dict, optional namespace mappings for XML
- flags: int, matching flags for advanced options
- **kwargs: additional options including 'custom' selectors
Returns:
bool, True if tag matches selector, False otherwise
"""
def filter(select, iterable, namespaces=None, flags=0, **kwargs):
"""
Filter a collection of tags by CSS selector.
Parameters:
- select: str, CSS selector string
- iterable: collection of BeautifulSoup Tags to filter
- namespaces: dict, optional namespace mappings for XML
- flags: int, filtering flags for advanced options
- **kwargs: additional options including 'custom' selectors
Returns:
List of Tags from iterable that match the selector
"""
def closest(select, tag, namespaces=None, flags=0, **kwargs):
"""
Find the closest matching ancestor element.
Parameters:
- select: str, CSS selector string
- tag: BeautifulSoup Tag to start ancestor search from
- namespaces: dict, optional namespace mappings for XML
- flags: int, matching flags for advanced options
- **kwargs: additional options including 'custom' selectors
Returns:
Closest ancestor Tag that matches selector or None
"""
Functions for compiling selectors for reuse and managing the selector cache.
def compile(pattern, namespaces=None, flags=0, **kwargs):
"""
Compile CSS selector pattern into reusable SoupSieve object.
Parameters:
- pattern: str or SoupSieve, CSS selector string to compile
- namespaces: dict, optional namespace mappings for XML
- flags: int, compilation flags for advanced options
- **kwargs: additional options including 'custom' selectors
Returns:
SoupSieve compiled selector object
Raises:
ValueError: if flags/namespaces/custom provided with SoupSieve input
SelectorSyntaxError: for invalid CSS selector syntax
"""
def purge():
"""
Clear the internal compiled selector cache.
Returns:
None
"""
Helper functions for CSS identifier escaping.
def escape(ident):
"""
Escape CSS identifier for safe use in selectors.
Parameters:
- ident: str, identifier string to escape
Returns:
str, CSS-escaped identifier safe for use in selectors
"""
Functions for extracting comments (deprecated, will be removed in future versions).
def comments(tag, limit=0, flags=0, **kwargs):
"""
Extract comments from tag tree [DEPRECATED].
Parameters:
- tag: BeautifulSoup Tag to search for comments
- limit: int, maximum comments to return (0 = unlimited)
- flags: int, unused flags parameter
- **kwargs: additional unused options
Returns:
List of comment strings
Note: Deprecated - not related to CSS selectors, will be removed
"""
def icomments(tag, limit=0, flags=0, **kwargs):
"""
Iterate comments from tag tree [DEPRECATED].
Parameters:
- tag: BeautifulSoup Tag to search for comments
- limit: int, maximum comments to yield (0 = unlimited)
- flags: int, unused flags parameter
- **kwargs: additional unused options
Yields:
Comment strings
Note: Deprecated - not related to CSS selectors, will be removed
"""
The main compiled selector class providing reusable CSS selector functionality with caching benefits.
class SoupSieve:
"""
Compiled CSS selector object for efficient reuse.
Attributes:
- pattern: str, original CSS selector pattern
- selectors: internal parsed selector structure
- namespaces: namespace mappings used during compilation
- custom: custom selector definitions used during compilation
- flags: compilation flags used during compilation
"""
def match(self, tag):
"""
Test if tag matches this compiled selector.
Parameters:
- tag: BeautifulSoup Tag to test
Returns:
bool, True if tag matches, False otherwise
"""
def select(self, tag, limit=0):
"""
Select all matching elements under tag using this compiled selector.
Parameters:
- tag: BeautifulSoup Tag or document to search within
- limit: int, maximum results to return (0 = unlimited)
Returns:
List of matching BeautifulSoup Tag objects
"""
def select_one(self, tag):
"""
Select first matching element using this compiled selector.
Parameters:
- tag: BeautifulSoup Tag or document to search within
Returns:
First matching BeautifulSoup Tag object or None
"""
def iselect(self, tag, limit=0):
"""
Iterate matching elements using this compiled selector.
Parameters:
- tag: BeautifulSoup Tag or document to search within
- limit: int, maximum results to yield (0 = unlimited)
Yields:
BeautifulSoup Tag objects that match the selector
"""
def filter(self, iterable):
"""
Filter collection of tags using this compiled selector.
Parameters:
- iterable: collection of BeautifulSoup Tags to filter
Returns:
List of Tags from iterable that match this selector
"""
def closest(self, tag):
"""
Find closest matching ancestor using this compiled selector.
Parameters:
- tag: BeautifulSoup Tag to start ancestor search from
Returns:
Closest ancestor Tag that matches this selector or None
"""
def comments(self, tag, limit=0):
"""
Extract comments using this selector [DEPRECATED].
Parameters:
- tag: BeautifulSoup Tag to search for comments
- limit: int, maximum comments to return (0 = unlimited)
Returns:
List of comment strings
Note: Deprecated - will be removed in future versions
"""
def icomments(self, tag, limit=0):
"""
Iterate comments using this selector [DEPRECATED].
Parameters:
- tag: BeautifulSoup Tag to search for comments
- limit: int, maximum comments to yield (0 = unlimited)
Yields:
Comment strings
Note: Deprecated - will be removed in future versions
"""
Exception types raised by soupsieve for error conditions.
class SelectorSyntaxError(SyntaxError):
"""
Exception raised for invalid CSS selector syntax.
Attributes:
- line: int, line number of syntax error (if available)
- col: int, column number of syntax error (if available)
- context: str, pattern context showing error location (if available)
"""
def __init__(self, msg, pattern=None, index=None):
"""
Initialize syntax error with optional location information.
Parameters:
- msg: str, error message
- pattern: str, CSS pattern that caused error (optional)
- index: int, character index of error in pattern (optional)
"""
DEBUG = 0x00001  # Debug flag constant for development and testing
# Namespace dictionary for XML documents
Namespaces = dict[str, str]
# Example: {'html': 'http://www.w3.org/1999/xhtml', 'svg': 'http://www.w3.org/2000/svg'}
# Custom selector definitions
CustomSelectors = dict[str, str]
# Example: {'my-selector': 'div.custom-class', 'important': '.highlight.critical'}
import soupsieve as sv
from bs4 import BeautifulSoup
xml_content = '''
<root xmlns:html="http://www.w3.org/1999/xhtml">
<html:div class="content">
<html:p>Namespaced paragraph</html:p>
</html:div>
</root>
'''
soup = BeautifulSoup(xml_content, 'xml')
namespaces = {'html': 'http://www.w3.org/1999/xhtml'}
# Select namespaced elements
divs = sv.select('html|div', soup, namespaces=namespaces)
paragraphs = sv.select('html|p', soup, namespaces=namespaces)
import soupsieve as sv
from bs4 import BeautifulSoup
html = '<div class="important highlight">Content</div><p class="note">Note</p>'
soup = BeautifulSoup(html, 'html.parser')
# Define custom selectors
custom = {
'special': '.important.highlight',
'content': 'div, p'
}
# Use custom selectors
special_divs = sv.select(':special', soup, custom=custom)
content_elements = sv.select(':content', soup, custom=custom)
import soupsieve as sv
from bs4 import BeautifulSoup
# Compile once, use many times for better performance
complex_selector = sv.compile('div.container > p:nth-child(odd):not(.excluded)')
# Use compiled selector on multiple documents
for html_content in document_list:
soup = BeautifulSoup(html_content, 'html.parser')
matches = complex_selector.select(soup)
process_matches(matches)
# Clear cache when done with heavy selector use
sv.purge()
import soupsieve as sv
from soupsieve import SelectorSyntaxError
from bs4 import BeautifulSoup
soup = BeautifulSoup('<div>content</div>', 'html.parser')
try:
# This will raise SelectorSyntaxError due to invalid CSS
results = sv.select('div[invalid-syntax', soup)
except SelectorSyntaxError as e:
print(f"CSS selector error: {e}")
if e.line and e.col:
print(f"Error at line {e.line}, column {e.col}")
try:
# This will raise TypeError for invalid tag input
results = sv.select('div', "not a tag object")
except TypeError as e:
print(f"Invalid input type: {e}")