Fast HTML5 parser with CSS selectors using Modest and Lexbor engines
npx @tessl/cli install tessl/pypi-selectolax@0.3.0

A high-performance HTML5 parser with CSS selector support, providing two parsing backends (Modest and Lexbor engines) for maximum compatibility and speed. selectolax enables efficient HTML document parsing and manipulation with a Python API that supports advanced CSS selectors, attribute access, text extraction, and DOM traversal operations.
pip install selectolax

Modest engine (default):
from selectolax.parser import HTMLParser

Lexbor engine (enhanced CSS selectors):
from selectolax.lexbor import LexborHTMLParser

Utility functions:
# Element creation and fragment parsing
from selectolax.parser import create_tag, parse_fragment
from selectolax.lexbor import create_tag, parse_fragment
# Exception handling
from selectolax.lexbor import SelectolaxError

from selectolax.parser import HTMLParser
# Sample HTML document used by the rest of this example
html = """
<html>
<head><title>Sample Page</title></head>
<body>
<div class="content">
<h1 id="title">Hello World</h1>
<p class="text">This is a paragraph.</p>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
</div>
</body>
</html>
"""
# Create parser instance from the HTML string
tree = HTMLParser(html)
# Extract text using CSS selectors: css_first() gives a single node for the query.
# NOTE: css_first() defaults to default=None, so chaining .text() assumes the
# selector matches something in the document.
title = tree.css_first('h1#title').text()
print(f"Title: {title}") # Output: Title: Hello World
# Get all list items: css() gives every node matching the selector
items = [node.text() for node in tree.css('li')]
print(f"Items: {items}") # Output: Items: ['Item 1', 'Item 2']
# Access attributes: node.attributes supports dict-style lookup by attribute name
title_id = tree.css_first('h1').attributes['id']
print(f"Title ID: {title_id}") # Output: Title ID: title
# Extract all text content of the document, passing strip=True to text()
all_text = tree.text(strip=True)
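print(f"All text: {all_text}")

selectolax provides two high-performance HTML parsing engines:
- Modest engine (selectolax.parser): the default backend
- Lexbor engine (selectolax.lexbor): enhanced CSS selectors, including custom pseudo-classes for text matching (e.g. :lexbor-contains)

Both engines expose similar APIs through their respective parser classes (HTMLParser and LexborHTMLParser) and node classes (Node and LexborNode), allowing easy switching between backends while maintaining compatibility.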
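The parsing workflow involves:
1. Creating a parser instance (HTMLParser or LexborHTMLParser) from the HTML content
2. Selecting nodes with CSS selectors (css, css_first) or by tag name (tags)
3. Extracting text and attributes from the selected nodes, or removing them from the tree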
The primary HTML5 parser using the Modest engine. Provides comprehensive parsing capabilities with automatic encoding detection, CSS selector support, and DOM manipulation methods.
class HTMLParser:
def __init__(self, html, detect_encoding=True, use_meta_tags=True, decode_errors='ignore'): ...
def css(self, query: str) -> list: ...
def css_first(self, query: str, default=None, strict=False): ...
def tags(self, name: str) -> list: ...
def text(self, deep=True, separator='', strip=False) -> str: ...

Alternative HTML5 parser using the Lexbor engine. Offers enhanced CSS selector capabilities including custom pseudo-classes for advanced text matching and improved performance characteristics.
class LexborHTMLParser:
def __init__(self, html): ...
def css(self, query: str) -> list: ...
def css_first(self, query: str, default=None, strict=False): ...
def tags(self, name: str) -> list: ...
def text(self, deep=True, separator='', strip=False) -> str: ...

Comprehensive node manipulation capabilities for traversing, modifying, and extracting data from parsed HTML documents. Includes text extraction, attribute access, and structural modifications.
class Node:
def css(self, query: str) -> list: ...
def css_first(self, query: str, default=None, strict=False): ...
def text(self, deep=True, separator='', strip=False) -> str: ...
def remove(self) -> None: ...
def decompose(self) -> None: ...

# HTML content input types
HtmlInput = str | bytes  # either text (str) or raw bytes
# CSS selector query type
CssQuery = str  # selector queries are plain strings
# Attribute dictionary interface
class AttributeDict:
def __getitem__(self, key: str) -> str | None: ...
def __setitem__(self, key: str, value: str) -> None: ...
def __contains__(self, key: str) -> bool: ...
def get(self, key: str, default=None) -> str | None: ...
def keys(self) -> Iterator[str]: ...
def values(self) -> Iterator[str | None]: ...
def items(self) -> Iterator[tuple[str, str | None]]: ...
# Exception classes
class SelectolaxError(Exception):
    """Base exception for selectolax-related errors."""