tessl/pypi-selectolax

Fast HTML5 parser with CSS selectors using Modest and Lexbor engines

—

Pending

Overview

Eval results

Files

Enhanced Parsing with Lexbor Engine

Name: tessl/pypi-selectolax
Author: tessl

An alternative HTML5 parser built on the Lexbor engine. It offers enhanced CSS selector capabilities, including custom pseudo-classes for advanced text matching, improved performance characteristics, and selector support that extends beyond standard CSS.

Capabilities

LexborHTMLParser Class

Fast HTML parser with enhanced CSS selector support and custom pseudo-classes for advanced document querying.

class LexborHTMLParser:
    """Fast HTML parser with enhanced CSS selector support (Lexbor engine)."""

    def __init__(self, html: str | bytes):
        """
        Initialize the Lexbor HTML parser with the content to parse.

        Parameters:
        - html: HTML content, accepted either as a string or as bytes
        """

Usage Example:

from selectolax.lexbor import LexborHTMLParser

# Parse HTML content supplied as a string
html = '<div><p>Hello world</p><p class="special">Special content</p></div>'
parser = LexborHTMLParser(html)

# Parse HTML content supplied as bytes; this rebinds `parser` to a new instance
html_bytes = b'<html><body><h1>Title</h1></body></html>'
parser = LexborHTMLParser(html_bytes)

Enhanced CSS Selectors

Advanced CSS selector capabilities including custom pseudo-classes for text matching and extended selector support.

def css(self, query: str) -> list[LexborNode]:
    """
    Find all elements matching an enhanced CSS selector.

    Standard CSS selectors are supported, plus these Lexbor extensions:
    - :lexbor-contains("text") - case-sensitive text matching
    - :lexbor-contains("text" i) - case-insensitive text matching
      (the trailing `i` inside the parentheses is the flag)

    Parameters:
    - query: CSS selector, optionally using the Lexbor extensions above

    Returns:
    List of LexborNode objects matching the selector
    """

def css_first(self, query: str, default=None, strict: bool = False) -> LexborNode | None:
    """
    Find the first element matching an enhanced CSS selector.

    Parameters:
    - query: CSS selector string, optionally using Lexbor extensions
    - default: Value returned when nothing matches (None unless overridden)
    - strict: If True, error when the query matches more than one element

    Returns:
    The first matching LexborNode, or `default` when there is no match
    """

Usage Example:

# Standard CSS selectors: css() returns every match, css_first() only the first
paragraphs = parser.css('p.content')
first_div = parser.css_first('div')

# Lexbor custom pseudo-class - case-sensitive text match
awesome_nodes = parser.css('p:lexbor-contains("awesome")')

# Lexbor custom pseudo-class - case-insensitive text match (note the `i` flag)
case_insensitive = parser.css('p:lexbor-contains("HELLO" i)')

# Standard selectors and custom pseudo-classes can be combined in one query
specific = parser.css('div.content p:lexbor-contains("important" i)')

Tag-Based Selection

Select elements by tag name with improved performance over the Modest engine.

def tags(self, name: str) -> list[LexborNode]:
    """
    Find all elements that have the given tag name.

    Parameters:
    - name: HTML tag name to look for (e.g., 'div', 'p', 'a')

    Returns:
    List of LexborNode objects whose tag name matches `name`
    """

Usage Example:

# Get all links
links = parser.tags('a')

# Get all <h1> headings (tags() matches a single tag name, so h2-h6 are not included)
headings = parser.tags('h1')

# Get all list items
items = parser.tags('li')

Text Extraction

Extract text content with enhanced performance and consistent behavior.

def text(self, deep: bool = True, separator: str = '', strip: bool = False) -> str:
    """
    Extract text content from the document body.

    Parameters:
    - deep: Include text from child elements (default True)
    - separator: String used to join text from different elements
      (default is the empty string, i.e. plain concatenation)
    - strip: Apply str.strip() to each text part before joining

    Returns:
    Extracted text content as a single string
    """

Usage Example:

# Get all text content (parts are concatenated with the default empty separator)
all_text = parser.text()

# Get text with an explicit separator between parts
separated_text = parser.text(separator=' | ')

# Apply str.strip() to each text part to drop surrounding whitespace
clean_text = parser.text(strip=True)

# Exclude text that comes from child elements
direct_text = parser.text(deep=False)

DOM Tree Access

Access document structure with enhanced node types and consistent interface.

@property
def root(self) -> LexborNode | None:
    """Return the root HTML element node (the annotation allows None)."""

@property
def head(self) -> LexborNode | None:
    """Return the HTML head element node (the annotation allows None)."""

@property
def body(self) -> LexborNode | None:
    """Return the HTML body element node (the annotation allows None)."""

@property
def html(self) -> str | None:
    """Return the HTML representation of the document as a string."""

@property
def raw_html(self) -> bytes:
    """Return the raw HTML bytes that were used for parsing."""

@property
def selector(self) -> LexborCSSSelector:
    """Return the LexborCSSSelector instance used for advanced queries."""

Usage Example:

# Access document parts; each of these properties is annotated as possibly None
root = parser.root
head = parser.head  
body = parser.body

# Get HTML output as a string
html_output = parser.html

# Access the raw input as bytes
original = parser.raw_html

# Get the LexborCSSSelector for advanced operations
css_selector = parser.selector

Advanced CSS Selector Interface

Direct access to the underlying CSS selector engine for advanced use cases.

class LexborCSSSelector:
    """Direct interface to the CSS selector engine, obtained via `parser.selector`."""

    def find(self, query: str, node: LexborNode) -> list[LexborNode]:
        """
        Find elements matching a selector within the given node.

        Parameters:
        - query: CSS selector string
        - node: Root node to search within

        Returns:
        List of matching LexborNode objects
        """

    def any_matches(self, query: str, node: LexborNode) -> bool:
        """
        Check whether any element matches the selector, without returning nodes.

        Parameters:
        - query: CSS selector string
        - node: Root node to search within

        Returns:
        True if any matches exist, False otherwise
        """

Usage Example:

# Get selector instance
selector = parser.selector

# Search within a specific node. css_first() returns None by default when
# nothing matches, so `matches` is only bound when the container was found.
content_div = parser.css_first('div.content')
if content_div:
    matches = selector.find('p.important', content_div)

# Check for existence without retrieving the matching nodes
has_errors = selector.any_matches('.error', parser.root)

Utility Functions

Element Creation

Create new HTML elements programmatically and parse HTML fragments into nodes.

def create_tag(tag: str) -> LexborNode:
    """
    Create a new HTML element with the specified tag name.

    Parameters:
    - tag: HTML tag name of the element to create (e.g., 'div', 'p', 'img')

    Returns:
    New LexborNode element with the specified tag
    """

def parse_fragment(html: str) -> list[LexborNode]:
    """
    Parse an HTML fragment into a list of nodes without adding wrapper elements.

    LexborHTMLParser adds missing html/head/body tags; this function does not,
    and returns the nodes exactly as specified in the HTML fragment.

    Parameters:
    - html: HTML fragment string to parse

    Returns:
    List of LexborNode objects representing the parsed HTML fragment
    """

Usage Example:

from selectolax.lexbor import create_tag, parse_fragment

# Create simple elements from their tag names
div = create_tag('div')
paragraph = create_tag('p')
link = create_tag('a')

# Parse an HTML fragment; no html/head/body wrappers are added
fragment_html = '<span>Text 1</span><span>Text 2</span>'
spans = parse_fragment(fragment_html)

# Use the parsed nodes in DOM manipulation: add each one to a new container
container = create_tag('div')
for span in spans:
    container.insert_child(span)

print(container.html)  # <div><span>Text 1</span><span>Text 2</span></div>

Document Cloning

Create independent copies of parsed documents.

def clone(self) -> LexborHTMLParser:
    """
    Create a deep copy of the entire parsed document.

    Returns:
    New LexborHTMLParser instance with content identical to this one
    """

Usage Example:

# Sample document so the snippet runs on its own (any HTML string works here)
html_content = '<div><img src="logo.png"><p>Hello world</p></div>'

# Clone document for safe manipulation
original = LexborHTMLParser(html_content)
backup = original.clone()

# Modify original without affecting backup
original.strip_tags(['img'])
processed_text = original.text(strip=True)

# Backup remains unchanged
original_html = backup.html

Text Processing

Advanced text manipulation methods for better text extraction.

def merge_text_nodes(self) -> None:
    """
    Merge adjacent text nodes to improve text extraction quality.

    Useful after removing HTML tags: it eliminates the extra spaces and
    fragmented text that the tag removal leaves behind.

    Returns:
    None
    """

Usage Example:

# Clean up text after tag manipulation
parser = LexborHTMLParser('<div><em>Hello</em> <strong>world</strong>!</div>')

# Remove formatting tags; their text stays behind as separate adjacent text nodes
parser.unwrap_tags(['em', 'strong'])

# The fragmentation only shows up when parts are joined with a separator:
# with the default separator='' both calls would print the same string.
# Each text node (including the whitespace-only one) is stripped and joined.
print(parser.text(separator=' ', strip=True))  # Fragmented: "Hello  world !"

# Merge adjacent text nodes so the text is extracted as a single part
parser.merge_text_nodes()
print(parser.text(separator=' ', strip=True))  # Clean output: "Hello world!"

Install with Tessl CLI