Fast HTML5 parser with CSS selectors using Modest and Lexbor engines
---
Comprehensive node manipulation capabilities for traversing, modifying, and extracting data from parsed HTML documents. Includes text extraction, attribute access, structural navigation, and DOM modifications for both Node (Modest engine) and LexborNode (Lexbor engine) types.
HTML element representation with full DOM manipulation capabilities.
class Node:
"""HTML node using Modest engine."""
pass
class LexborNode:
"""HTML node using Lexbor engine."""
pass

Both classes provide identical interfaces with the same methods and properties.
Apply CSS selectors to specific nodes for scoped element searching.
def css(self, query: str) -> list[Node]:
"""
Find child elements matching CSS selector.
Parameters:
- query: CSS selector string
Returns:
List of Node objects matching selector within this node's subtree
"""
def css_first(self, query: str, default=None, strict: bool = False) -> Node | None:
"""
Find first child element matching CSS selector.
Parameters:
- query: CSS selector string
- default: Value to return if no match found
- strict: If True, error when multiple matches exist
Returns:
First matching Node object or default value
"""

Usage Example:
# Find within specific container
container = parser.css_first('div.content')
if container:
# Search only within container
links = container.css('a')
first_paragraph = container.css_first('p')
# Nested selection
important_items = container.css('ul.important li')

Extract text content from individual nodes with flexible formatting options.
def text(self, deep: bool = True, separator: str = '', strip: bool = False) -> str:
"""
Extract text content from this node.
Parameters:
- deep: Include text from child elements
- separator: String to join text from different child elements
- strip: Apply str.strip() to each text part
Returns:
Text content as string
"""

Usage Example:
# Get text from specific element
title = parser.css_first('h1').text()
# Get text with custom formatting
nav_text = nav_element.text(separator=' | ', strip=True)
# Get only direct text (no children)
button_text = button_element.text(deep=False)
# Extract from multiple elements
article_texts = [p.text(strip=True) for p in article.css('p')]

Access structural information and content of HTML nodes.
@property
def tag(self) -> str:
"""HTML tag name (e.g., 'div', 'p', 'a')."""
@property
def attributes(self) -> dict:
"""Read-only dictionary of element attributes."""
@property
def attrs(self) -> AttributeDict:
"""Mutable dictionary-like access to element attributes."""
@property
def parent(self) -> Node | None:
"""Parent node in DOM tree."""
@property
def next(self) -> Node | None:
"""Next sibling node."""
@property
def prev(self) -> Node | None:
"""Previous sibling node."""
@property
def child(self) -> Node | None:
"""First child node."""
@property
def last_child(self) -> Node | None:
"""Last child node."""
@property
def html(self) -> str:
"""HTML representation of this node and its children."""
@property
def id(self) -> str | None:
"""HTML id attribute value (Node only)."""
@property
def mem_id(self) -> int:
"""Memory address identifier for the node."""
@property
def tag_id(self) -> int:
"""Numeric tag identifier (LexborNode only)."""
@property
def first_child(self) -> Node | None:
"""First child node (alias for child in LexborNode)."""
@property
def raw_value(self) -> bytes:
"""Raw unparsed value of text node (Node only)."""
@property
def text_content(self) -> str | None:
"""Text content of this specific node only (not children)."""

Usage Example:
# Access node properties
element = parser.css_first('div.content')
tag_name = element.tag # 'div'
class_attr = element.attributes['class'] # 'content' (read-only)
parent_element = element.parent
next_sibling = element.next
# Navigate DOM tree
first_child = element.child
last_child = element.last_child
# Get HTML output
html_content = element.html
# Access additional properties
element_id = element.id # HTML id attribute (if exists)
memory_id = element.mem_id # Unique memory identifier
# Direct text content (no children)
text_node = parser.css_first('p').child # Get text node
if text_node and text_node.text_content:
direct_text = text_node.text_content # Text of this node only

Dictionary-like interface for accessing and modifying HTML attributes.
class AttributeDict:
def __getitem__(self, key: str) -> str | None:
"""Get attribute value by name."""
def __setitem__(self, key: str, value: str) -> None:
"""Set attribute value."""
def __delitem__(self, key: str) -> None:
"""Remove attribute."""
def __contains__(self, key: str) -> bool:
"""Check if attribute exists."""
def get(self, key: str, default=None) -> str | None:
"""Get attribute with default value."""
def sget(self, key: str, default: str = "") -> str:
"""Get attribute, return empty string for None values."""
def keys(self) -> Iterator[str]:
"""Iterator over attribute names."""
def values(self) -> Iterator[str | None]:
"""Iterator over attribute values."""
def items(self) -> Iterator[tuple[str, str | None]]:
"""Iterator over (name, value) pairs."""

Usage Example:
# Access attributes (read-only)
link = parser.css_first('a')
read_only_attrs = link.attributes # dict
href = read_only_attrs['href']
# Access mutable attributes
attrs = link.attrs # AttributeDict
# Get attributes with different methods
href = attrs['href']
title = attrs.get('title', 'No title')
class_name = attrs.sget('class', 'no-class') # Returns "" instead of None
# Set and modify attributes (only works with attrs, not attributes)
attrs['target'] = '_blank'
attrs['rel'] = 'noopener'
# Check existence
has_id = 'id' in attrs
# Remove attributes
del attrs['onclick']
# Iterate attributes
for name, value in attrs.items():
print(f"{name}: {value}")
# Read-only vs mutable comparison
print(link.attributes) # {'href': 'example.com', 'class': 'link'}
link.attrs['new-attr'] = 'value'
print(link.attributes) # {'href': 'example.com', 'class': 'link', 'new-attr': 'value'}

Modify document structure by adding, removing, and replacing elements.
def remove(self) -> None:
"""Remove this node from DOM tree."""
def decompose(self) -> None:
"""Remove and destroy this node and all children."""
def unwrap(self) -> None:
"""Remove tag wrapper while keeping child content."""
def replace_with(self, value: str | bytes | Node) -> None:
"""Replace this node with text or another node."""
def insert_before(self, value: str | bytes | Node) -> None:
"""Insert text or node before this node."""
def insert_after(self, value: str | bytes | Node) -> None:
"""Insert text or node after this node."""
def insert_child(self, value: str | bytes | Node) -> None:
"""Insert text or node as child (at end) of this node."""

Usage Example:
# Remove elements
script_tags = parser.css('script')
for script in script_tags:
script.remove()
# Destroy elements completely
ads = parser.css('.advertisement')
for ad in ads:
ad.decompose()
# Unwrap formatting tags
bold_tags = parser.css('b')
for bold in bold_tags:
bold.unwrap() # Keeps text, removes <b> wrapper
# Replace with text
old_img = parser.css_first('img')
if old_img:
alt_text = old_img.attributes.get('alt', 'Image')
old_img.replace_with(alt_text) # Replace with text
# Replace with another node
from selectolax.lexbor import create_tag
new_img = create_tag('img', {'src': 'new.jpg', 'alt': 'New image'})
old_img.replace_with(new_img)
# Insert text and nodes
container = parser.css_first('div.content')
container.insert_child('Added text at end')
container.insert_after('Text after container')
container.insert_before('Text before container')
# Insert HTML elements
new_paragraph = create_tag('p', {'class': 'inserted'})
container.insert_child(new_paragraph)

Perform operations on multiple elements efficiently.
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
"""
Remove specified child tags from this node.
Parameters:
- tags: List of tag names to remove
- recursive: Remove all descendants with matching tags
"""
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
"""
Unwrap specified child tags while keeping content.
Parameters:
- tags: List of tag names to unwrap
- delete_empty: Remove empty tags after unwrapping
"""

Usage Example:
# Clean up content section
content = parser.css_first('div.content')
if content:
# Remove unwanted tags
content.strip_tags(['script', 'style', 'noscript'])
# Unwrap formatting tags
content.unwrap_tags(['span', 'font'], delete_empty=True)
# Process article content
article = parser.css_first('article')
if article:
# Remove all ads and tracking
article.strip_tags(['iframe', 'object', 'embed'], recursive=True)
# Clean up empty containers
article.unwrap_tags(['div', 'span'], delete_empty=True)

Iterate through child nodes and traverse the DOM tree structure.
def iter(self, include_text: bool = False) -> Iterator[Node]:
"""
Iterate over child nodes at current level (Node only).
Parameters:
- include_text: Include text nodes in iteration
Yields:
Node objects for each child element
"""
def traverse(self, include_text: bool = False) -> Iterator[Node]:
"""
Depth-first traversal of all descendant nodes (Node only).
Parameters:
- include_text: Include text nodes in traversal
Yields:
Node objects in depth-first order
"""

Usage Example:
# Iterate over direct children only
container = parser.css_first('div.content')
for child in container.iter():
print(f"Child tag: {child.tag}")
# Include text nodes
for child in container.iter(include_text=True):
if child.tag == '-text':
print(f"Text content: {child.text()}")
# Traverse entire subtree
for node in container.traverse():
print(f"Descendant: {node.tag}")
# Deep traversal including text
all_nodes = [node for node in container.traverse(include_text=True)]
### Text Node Processing
Merge adjacent text nodes for cleaner text extraction.
def merge_text_nodes(self) -> None:
"""
Merge adjacent text nodes within this node.
Useful after removing HTML tags to eliminate extra spaces
and fragmented text caused by tag removal.
"""

Usage Example:
# Clean up fragmented text nodes
html = '<div><strong>Hello</strong> <em>beautiful</em> world!</div>'
parser = HTMLParser(html)
container = parser.css_first('div')
# Remove formatting tags
container.unwrap_tags(['strong', 'em'])
print(container.text()) # May show: "Hello beautiful world!"
# Merge text nodes for cleaner output
container.merge_text_nodes()
print(container.text()) # Clean output: "Hello beautiful world!"
# Works with any node
article = parser.css_first('article')
if article:
# Clean up after removing unwanted tags
article.strip_tags(['script', 'style'])
article.merge_text_nodes()
clean_text = article.text(strip=True)

Check if nodes match CSS selectors without retrieving results.
def css_matches(self, selector: str) -> bool:
"""
Check if this node matches CSS selector.
Parameters:
- selector: CSS selector string
Returns:
True if node matches selector, False otherwise
"""
def any_css_matches(self, selectors: tuple[str, ...]) -> bool:
"""
Check if node matches any of multiple CSS selectors.
Parameters:
- selectors: Tuple of CSS selector strings
Returns:
True if node matches any selector, False otherwise
"""

Usage Example:
# Check if element matches selector
element = parser.css_first('div')
is_content = element.css_matches('.content')
is_container = element.css_matches('.container')
# Check against multiple selectors
important_selectors = ('.important', '.critical', '.error')
is_important = element.any_css_matches(important_selectors)
# Conditional processing based on matching
if element.css_matches('.article'):
# Process as article
process_article(element)
elif element.css_matches('.sidebar'):
# Process as sidebar
process_sidebar(element)

Additional text extraction methods for specialized use cases.
def text_lexbor(self) -> str:
"""
Extract text using Lexbor's built-in method (LexborNode only).
Uses the underlying Lexbor engine's native text extraction.
Faster for simple text extraction without formatting options.
Returns:
Text content as string
Raises:
RuntimeError: If text extraction fails
"""

Usage Example:
from selectolax.lexbor import LexborHTMLParser
# Use Lexbor's native text extraction
parser = LexborHTMLParser('<div>Hello <b>world</b>!</div>')
element = parser.css_first('div')
# Fast native text extraction
native_text = element.text_lexbor() # "Hello world!"
# Compare with regular text method
regular_text = element.text() # Same result but more options
# Use native method for performance-critical applications
articles = parser.css('article')
all_text = [article.text_lexbor() for article in articles]

Additional methods for enhanced selection and content analysis.
def select(self, query: str = None) -> Selector:
"""
Create advanced selector with chaining support (Node only).
Parameters:
- query: Optional initial CSS selector
Returns:
Selector object supporting method chaining
"""
def scripts_contain(self, query: str) -> bool:
"""
Check if any child script tags contain text (Node only).
Caches script tags on first call for performance.
Parameters:
- query: Text to search for in script content
Returns:
True if any script contains the text, False otherwise
"""

Usage Example:
# Advanced selector with chaining
container = parser.css_first('div.content')
selector = container.select('p.important')
# Can chain additional operations on selector
# Check for script content within specific nodes
article = parser.css_first('article')
has_tracking = article.scripts_contain('analytics')
has_ads = article.scripts_contain('adsystem')
# Raw value access for text nodes
html_with_entities = '<div><test></div>'
parser = HTMLParser(html_with_entities)
text_node = parser.css_first('div').child
print(text_node.text()) # "<test>" (parsed)
print(text_node.raw_value) # b"<test>" (original)

Create new nodes and clone existing ones for DOM manipulation.
# For LexborNode only
def create_tag(name: str, attrs: dict = None) -> LexborNode:
"""
Create new HTML element (Lexbor engine only).
Parameters:
- name: HTML tag name
- attrs: Dictionary of attributes
Returns:
New LexborNode element
"""

Usage Example:
from selectolax.lexbor import create_tag
# Create new elements
wrapper = create_tag('div', {'class': 'wrapper'})
link = create_tag('a', {'href': '#', 'class': 'button'})
# Build complex structures
container = create_tag('div', {'class': 'container'})
header = create_tag('h2', {'class': 'title'})
paragraph = create_tag('p', {'class': 'description'})
# Note: Node insertion and complex DOM building
# requires working with the underlying parser APIs

Install with Tessl CLI
npx tessl i tessl/pypi-selectolax