tessl/pypi-selectolax

Fast HTML5 parser with CSS selectors using Modest and Lexbor engines

—

Pending

Overview

Eval results

Files

HTML Parsing with Modest Engine

Name: tessl/pypi-selectolax
Author: tessl

The primary HTML5 parser using the Modest engine. Provides comprehensive parsing capabilities with automatic encoding detection, CSS selector support, and DOM manipulation methods for extracting and modifying HTML content.

Capabilities

HTMLParser Class

Main parser class that handles HTML document parsing with automatic encoding detection and provides access to the parsed DOM tree.

class HTMLParser:
    def __init__(
        self, 
        html: str | bytes, 
        detect_encoding: bool = True, 
        use_meta_tags: bool = True, 
        decode_errors: str = 'ignore'
    ):
        """
        Initialize HTML parser with content.
        
        Parameters:
        - html: HTML content as string or bytes
        - detect_encoding: Auto-detect encoding for bytes input
        - use_meta_tags: Use HTML meta tags for encoding detection
        - decode_errors: Error handling ('ignore', 'strict', 'replace')
        """

Usage Example:

from selectolax.parser import HTMLParser

# Parse from string
parser = HTMLParser('<div>Hello <strong>world</strong>!</div>')

# Parse from bytes with encoding detection
html_bytes = b'<div>Caf\xe9</div>'
parser = HTMLParser(html_bytes, detect_encoding=True)

# Parse with strict error handling
parser = HTMLParser(html_content, decode_errors='strict')

CSS Selector Methods

Query the DOM tree using CSS selectors to find matching elements.

def css(self, query: str) -> list[Node]:
    """
    Find all elements matching CSS selector.
    
    Parameters:
    - query: CSS selector string
    
    Returns:
    List of Node objects matching the selector
    """

def css_first(self, query: str, default=None, strict: bool = False) -> Node | None:
    """
    Find first element matching CSS selector.
    
    Parameters:
    - query: CSS selector string
    - default: Value to return if no match found
    - strict: If True, raise error when multiple matches exist
    
    Returns:
    First matching Node object or default value
    """

Usage Example:

# Find all paragraphs
paragraphs = parser.css('p')

# Find first heading with class
heading = parser.css_first('h1.title')

# Find with default value
nav = parser.css_first('nav', default=None)

# Strict mode - error if multiple matches
unique_element = parser.css_first('#unique-id', strict=True)

# Complex selectors
items = parser.css('div.content > ul li:nth-child(odd)')

Tag-Based Selection

Select elements by tag name for simple element retrieval.

def tags(self, name: str) -> list[Node]:
    """
    Find all elements with specified tag name.
    
    Parameters:
    - name: HTML tag name (e.g., 'div', 'p', 'a')
    
    Returns:
    List of Node objects with matching tag name
    """

Usage Example:

# Get all links
links = parser.tags('a')

# Get all images
images = parser.tags('img')

# Get all divs
divs = parser.tags('div')

Text Extraction

Extract text content from the parsed document.

def text(self, deep: bool = True, separator: str = '', strip: bool = False) -> str:
    """
    Extract text content from document body.
    
    Parameters:
    - deep: Include text from child elements
    - separator: String to join text from different elements
    - strip: Apply str.strip() to each text part
    
    Returns:
    Extracted text content as string
    """

Usage Example:

# Get all text content
all_text = parser.text()

# Get text with custom separator
spaced_text = parser.text(separator=' | ')

# Get cleaned text
clean_text = parser.text(strip=True)

# Get only direct text (no children)
direct_text = parser.text(deep=False)

DOM Tree Access

Access key parts of the HTML document structure.

@property
def root(self) -> Node | None:
    """Returns root HTML element node."""

@property  
def head(self) -> Node | None:
    """Returns HTML head element node."""

@property
def body(self) -> Node | None:
    """Returns HTML body element node."""

@property
def input_encoding(self) -> str:
    """Returns detected/used character encoding."""

@property
def raw_html(self) -> bytes:
    """Returns raw HTML bytes used for parsing."""

@property
def html(self) -> str | None:
    """Returns HTML representation of the entire document."""

Usage Example:

# Access document structure
root = parser.root
head = parser.head
body = parser.body

# Check encoding
encoding = parser.input_encoding  # e.g., 'UTF-8'

# Get original bytes
original = parser.raw_html

DOM Manipulation

Modify the HTML document structure by removing unwanted elements.

def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
    """
    Remove specified tags from document.
    
    Parameters:
    - tags: List of tag names to remove
    - recursive: Remove all child nodes with the tag
    """

def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
    """
    Remove tag wrappers while keeping content.
    
    Parameters:
    - tags: List of tag names to unwrap
    - delete_empty: Remove empty tags after unwrapping
    """

Usage Example:

# Remove script and style tags
parser.strip_tags(['script', 'style', 'noscript'])

# Remove tags recursively (including children)
parser.strip_tags(['iframe', 'object'], recursive=True)

# Unwrap formatting tags while keeping text
parser.unwrap_tags(['b', 'i', 'strong', 'em'])

# Clean up empty tags after unwrapping
parser.unwrap_tags(['span', 'div'], delete_empty=True)

Advanced Selection and Matching

Additional methods for advanced element selection and content matching.

def select(self, query: str = None) -> Selector:
    """
    Create advanced selector object with chaining support.
    
    Parameters:
    - query: Optional initial CSS selector
    
    Returns:
    Selector object supporting method chaining and filtering
    """

def any_css_matches(self, selectors: tuple[str, ...]) -> bool:
    """
    Check if any CSS selectors match elements in document.
    
    Parameters:
    - selectors: Tuple of CSS selector strings
    
    Returns:
    True if any selector matches elements, False otherwise
    """

def scripts_contain(self, query: str) -> bool:
    """
    Check if any script tag contains specified text.
    
    Caches script tags on first call for performance.
    
    Parameters:
    - query: Text to search for in script content
    
    Returns:
    True if any script contains the text, False otherwise
    """

def script_srcs_contain(self, queries: tuple[str, ...]) -> bool:
    """
    Check if any script src attribute contains specified text.
    
    Caches values on first call for performance.
    
    Parameters:
    - queries: Tuple of text strings to search for in src attributes
    
    Returns:
    True if any script src contains any query text, False otherwise
    """

Usage Example:

# Advanced selector with chaining
advanced_selector = parser.select('div.content')
# Further operations can be chained on the selector

# Check for CSS matches across document
important_selectors = ('.error', '.warning', '.critical')
has_important = parser.any_css_matches(important_selectors)

# Script content analysis
has_analytics = parser.scripts_contain('google-analytics')
has_tracking = parser.scripts_contain('facebook')

# Script source analysis
ad_scripts = ('ads.js', 'doubleclick', 'adsystem')
has_ads = parser.script_srcs_contain(ad_scripts)

# Content filtering based on scripts
if has_analytics or has_ads:
    print("Page contains tracking or ads")
    # Remove or flag for privacy

### Utility Functions

Additional utility functions for HTML element creation and parsing.

```python { .api }
def create_tag(tag: str) -> Node:
    """
    Create a new HTML element with specified tag name.
    
    Parameters:
    - tag: HTML tag name (e.g., 'div', 'p', 'img')
    
    Returns:
    New Node element with the specified tag
    """

def parse_fragment(html: str) -> list[Node]:
    """
    Parse HTML fragment into list of nodes without adding wrapper elements.
    
    Unlike HTMLParser which adds missing html/head/body tags, this function
    returns nodes exactly as specified in the HTML fragment.
    
    Parameters:
    - html: HTML fragment string to parse
    
    Returns:
    List of Node objects representing the parsed HTML fragment
    """

Usage Example:

from selectolax.parser import create_tag, parse_fragment

# Create new elements
div = create_tag('div')
paragraph = create_tag('p')
link = create_tag('a')

# Parse HTML fragments without wrappers
fragment_html = '<li>Item 1</li><li>Item 2</li><li>Item 3</li>'
list_items = parse_fragment(fragment_html)

# Use in DOM manipulation
container = create_tag('ul')
for item in list_items:
    container.insert_child(item)

print(container.html)  # <ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>

Document Cloning

Create independent copies of parsed documents.

def clone(self) -> HTMLParser:
    """
    Create a deep copy of the entire parsed document.
    
    Returns:
    New HTMLParser instance with identical content
    """

Usage Example:

# Clone document for safe manipulation
original = HTMLParser(html_content)
copy = original.clone()

# Modify copy without affecting original
copy.strip_tags(['script', 'style'])
clean_text = copy.text(strip=True)

# Original remains unchanged
original_text = original.text()

Text Processing

Advanced text manipulation methods for better text extraction.

def merge_text_nodes(self) -> None:
    """
    Merge adjacent text nodes to improve text extraction quality.
    
    Useful after removing HTML tags to eliminate extra spaces
    and fragmented text caused by tag removal.
    """

Usage Example:

# Clean up text after tag manipulation
parser = HTMLParser('<div><strong>Hello</strong> world!</div>')
content = parser.css_first('div')

# Remove formatting tags
parser.unwrap_tags(['strong'])
print(parser.text())  # May have extra spaces: "Hello  world!"

# Merge text nodes for cleaner output
parser.merge_text_nodes()
print(parser.text())  # Clean output: "Hello world!"

Install with Tessl CLI