Fast HTML5 parser with CSS selectors using Modest and Lexbor engines
—
Alternative HTML5 parser using the Lexbor engine. Offers enhanced CSS selector capabilities including custom pseudo-classes for advanced text matching, improved performance characteristics, and extended selector support beyond standard CSS.
Fast HTML parser with enhanced CSS selector support and custom pseudo-classes for advanced document querying.
class LexborHTMLParser:
def __init__(self, html: str | bytes):
"""
Initialize Lexbor HTML parser.
Parameters:
- html: HTML content as string or bytes
"""

Usage Example:
from selectolax.lexbor import LexborHTMLParser
# Parse HTML content
html = '<div><p>Hello world</p><p class="special">Special content</p></div>'
parser = LexborHTMLParser(html)
# Parse from bytes
html_bytes = b'<html><body><h1>Title</h1></body></html>'
parser = LexborHTMLParser(html_bytes)

Advanced CSS selector capabilities including custom pseudo-classes for text matching and extended selector support.
def css(self, query: str) -> list[LexborNode]:
"""
Find elements using enhanced CSS selectors.
Supports standard CSS selectors plus Lexbor extensions:
- :lexbor-contains("text") - case-sensitive text matching
- :lexbor-contains("text" i) - case-insensitive text matching
Parameters:
- query: CSS selector with optional Lexbor extensions
Returns:
List of LexborNode objects matching the selector
"""
def css_first(self, query: str, default=None, strict: bool = False) -> LexborNode | None:
"""
Find first element with enhanced CSS selectors.
Parameters:
- query: CSS selector string with optional Lexbor extensions
- default: Value to return if no match found
- strict: If True, error when multiple matches exist
Returns:
First matching LexborNode object or default value
"""

Usage Example:
# Standard CSS selectors
paragraphs = parser.css('p.content')
first_div = parser.css_first('div')
# Lexbor custom pseudo-classes - case sensitive
awesome_nodes = parser.css('p:lexbor-contains("awesome")')
# Lexbor custom pseudo-classes - case insensitive
case_insensitive = parser.css('p:lexbor-contains("HELLO" i)')
# Complex selectors with custom pseudo-classes
specific = parser.css('div.content p:lexbor-contains("important" i)')

Select elements by tag name with improved performance over the Modest engine.
def tags(self, name: str) -> list[LexborNode]:
"""
Find all elements with specified tag name.
Parameters:
- name: HTML tag name (e.g., 'div', 'p', 'a')
Returns:
List of LexborNode objects with matching tag name
"""

Usage Example:
# Get all links
links = parser.tags('a')
# Get all headings
headings = parser.tags('h1')
# Get all list items
items = parser.tags('li')

Extract text content with enhanced performance and consistent behavior.
def text(self, deep: bool = True, separator: str = '', strip: bool = False) -> str:
"""
Extract text content from document body.
Parameters:
- deep: Include text from child elements
- separator: String to join text from different elements
- strip: Apply str.strip() to each text part
Returns:
Extracted text content as string
"""

Usage Example:
# Get all text content
all_text = parser.text()
# Get text with separators
separated_text = parser.text(separator=' | ')
# Get clean text without extra whitespace
clean_text = parser.text(strip=True)
# Get only direct text content
direct_text = parser.text(deep=False)

Access document structure with enhanced node types and consistent interface.
@property
def root(self) -> LexborNode | None:
"""Returns root HTML element node."""
@property
def head(self) -> LexborNode | None:
"""Returns HTML head element node."""
@property
def body(self) -> LexborNode | None:
"""Returns HTML body element node."""
@property
def html(self) -> str | None:
"""Returns HTML representation of the document."""
@property
def raw_html(self) -> bytes:
"""Returns raw HTML bytes used for parsing."""
@property
def selector(self) -> LexborCSSSelector:
"""Returns CSS selector instance for advanced queries."""

Usage Example:
# Access document parts
root = parser.root
head = parser.head
body = parser.body
# Get HTML output
html_output = parser.html
# Access raw input
original = parser.raw_html
# Get selector for advanced operations
css_selector = parser.selector

Direct access to the underlying CSS selector engine for advanced use cases.
class LexborCSSSelector:
def find(self, query: str, node: LexborNode) -> list[LexborNode]:
"""
Find elements matching selector within given node.
Parameters:
- query: CSS selector string
- node: Root node to search within
Returns:
List of matching LexborNode objects
"""
def any_matches(self, query: str, node: LexborNode) -> bool:
"""
Check if any elements match selector.
Parameters:
- query: CSS selector string
- node: Root node to search within
Returns:
True if any matches exist, False otherwise
"""

Usage Example:
# Get selector instance
selector = parser.selector
# Search within specific node
content_div = parser.css_first('div.content')
if content_div:
matches = selector.find('p.important', content_div)
# Check for existence without retrieving
has_errors = selector.any_matches('.error', parser.root)

Create new HTML elements programmatically.
def create_tag(tag: str) -> LexborNode:
"""
Create new HTML element with specified tag name.
Parameters:
- tag: HTML tag name (e.g., 'div', 'p', 'img')
Returns:
New LexborNode element with the specified tag
"""
def parse_fragment(html: str) -> list[LexborNode]:
"""
Parse HTML fragment into list of nodes without adding wrapper elements.
Unlike LexborHTMLParser which adds missing html/head/body tags, this function
returns nodes exactly as specified in the HTML fragment.
Parameters:
- html: HTML fragment string to parse
Returns:
List of LexborNode objects representing the parsed HTML fragment
"""

Usage Example:
from selectolax.lexbor import create_tag, parse_fragment
# Create simple elements
div = create_tag('div')
paragraph = create_tag('p')
link = create_tag('a')
# Parse HTML fragments without wrappers
fragment_html = '<span>Text 1</span><span>Text 2</span>'
spans = parse_fragment(fragment_html)
# Use in DOM manipulation
container = create_tag('div')
for span in spans:
container.insert_child(span)
print(container.html) # <div><span>Text 1</span><span>Text 2</span></div>

Create independent copies of parsed documents.
def clone(self) -> LexborHTMLParser:
"""
Create a deep copy of the entire parsed document.
Returns:
New LexborHTMLParser instance with identical content
"""

Usage Example:
# Clone document for safe manipulation
original = LexborHTMLParser(html_content)
backup = original.clone()
# Modify original without affecting backup
original.strip_tags(['img'])
processed_text = original.text(strip=True)
# Backup remains unchanged
original_html = backup.html

Advanced text manipulation methods for better text extraction.
def merge_text_nodes(self) -> None:
"""
Merge adjacent text nodes to improve text extraction quality.
Useful after removing HTML tags to eliminate extra spaces
and fragmented text caused by tag removal.
"""

Usage Example:
# Clean up text after tag manipulation
parser = LexborHTMLParser('<div><em>Hello</em> <strong>world</strong>!</div>')
# Remove formatting tags
parser.unwrap_tags(['em', 'strong'])
print(parser.text()) # May have extra spaces: "Hello world !"
# Merge text nodes for cleaner output
parser.merge_text_nodes()
print(parser.text()) # Clean output: "Hello world!"

Install with Tessl CLI
npx tessl i tessl/pypi-selectolax