Parsel is a library to extract data from HTML and XML using XPath and CSS selectors
—
Custom XPath functions for enhanced element selection including CSS class checking and other utility functions. Parsel extends lxml's XPath capabilities with domain-specific functions for web scraping and document processing.
Register custom XPath functions for use in XPath expressions.
def set_xpathfunc(fname: str, func: Optional[Callable]) -> None:
"""
Register a custom extension function for XPath expressions.
Parameters:
- fname (str): Function name to register in XPath namespace
- func (Callable, optional): Function to register, or None to remove
Note:
- Functions are registered in the global XPath namespace (None)
- Registered functions persist for the lifetime of the process
- Functions receive a context parameter followed by any arguments passed in the XPath expression
- Setting func=None removes the function registration
Function Signature:
- func(context, *args) -> Any
- context: lxml evaluation context
- *args: Arguments passed from XPath expression
"""
Usage Example:
from parsel import Selector
from parsel.xpathfuncs import set_xpathfunc
# Define custom XPath function
def has_word(context, word):
"""Check if element text contains a specific word."""
node_text = context.context_node.text or ""
return word.lower() in node_text.lower()
# Register the function
set_xpathfunc('has-word', has_word)
html = """
<div>
<p>This paragraph contains Python programming content.</p>
<p>This paragraph discusses JavaScript frameworks.</p>
<p>This paragraph covers HTML markup basics.</p>
</div>
"""
selector = Selector(text=html)
# Use custom function in XPath
python_paragraphs = selector.xpath('//p[has-word("Python")]')
programming_content = python_paragraphs.xpath('.//text()').get()
# Returns: 'This paragraph contains Python programming content.'
# Remove the function
set_xpathfunc('has-word', None)
# Function is no longer available
# selector.xpath('//p[has-word("test")]') # Would raise error
Initialize all built-in XPath extension functions.
def setup() -> None:
"""
Register all built-in XPath extension functions.
Currently registers:
- has-class: Check if element has specific CSS classes
This function is called automatically when parsel is imported.
"""
Built-in XPath function for checking CSS class membership.
def has_class(context: Any, *classes: str) -> bool:
"""
XPath extension function to check if element has specific CSS classes.
Parameters:
- context: lxml XPath evaluation context (automatic)
- *classes: CSS class names to check for
Returns:
bool: True if all specified classes are present in element's class attribute
Raises:
- ValueError: If no classes provided or arguments are not strings
Note:
- Handles HTML5 whitespace normalization
- Requires ALL specified classes to be present (AND operation)
- Case-sensitive class matching
- Automatically registered as 'has-class' function
"""
Usage Example:
from parsel import Selector
html = """
<div class="container main-content active">
<p class="text primary">Primary text paragraph</p>
<p class="text secondary highlighted">Secondary text paragraph</p>
<p class="text">Basic text paragraph</p>
<span class="label important urgent">Urgent label</span>
</div>
"""
selector = Selector(text=html)
# Check for single class
text_elements = selector.xpath('//p[has-class("text")]')
print(f"Elements with 'text' class: {len(text_elements)}") # 3
# Check for multiple classes (all must be present)
primary_text = selector.xpath('//p[has-class("text", "primary")]')
print(f"Elements with both 'text' and 'primary': {len(primary_text)}") # 1
# Check for multiple classes on different element
urgent_labels = selector.xpath('//span[has-class("label", "important", "urgent")]')
print(f"Urgent important labels: {len(urgent_labels)}") # 1
# Complex combinations
highlighted_secondary = selector.xpath('//p[has-class("secondary", "highlighted")]')
highlighted_text = highlighted_secondary.xpath('.//text()').get()
# Returns: 'Secondary text paragraph'
# Check container classes
main_containers = selector.xpath('//div[has-class("container", "main-content")]')
print(f"Main content containers: {len(main_containers)}") # 1
# Non-matching example
nonexistent = selector.xpath('//p[has-class("text", "nonexistent")]')
print(f"Non-matching elements: {len(nonexistent)}") # 0
Combine custom XPath functions with standard XPath features.
Usage Example:
# Define additional custom functions
def contains_number(context):
"""Check if element text contains any numeric digits."""
import re
node_text = context.context_node.text or ""
return bool(re.search(r'\d', node_text))
def text_length_gt(context, min_length):
"""Check if element text length is greater than specified value."""
node_text = context.context_node.text or ""
return len(node_text.strip()) > int(min_length)
# Register functions
set_xpathfunc('contains-number', contains_number)
set_xpathfunc('text-length-gt', text_length_gt)
html = """
<article>
<h1 class="title main">Article About Data Science in 2024</h1>
<p class="intro short">Brief intro.</p>
<p class="content long">This is a comprehensive paragraph about machine learning
algorithms and their applications in modern data science. It contains detailed
explanations and examples.</p>
<p class="stats">Processing 1000 records per second with 95% accuracy.</p>
<p class="conclusion">Final thoughts on the topic.</p>
</article>
"""
selector = Selector(text=html)
# Combine has-class with custom functions
long_content = selector.xpath('//p[has-class("content") and text-length-gt("50")]')
print(f"Long content paragraphs: {len(long_content)}")
# Find elements with numbers that have specific classes
stats_with_numbers = selector.xpath('//p[has-class("stats") and contains-number()]')
stats_text = stats_with_numbers.xpath('.//text()').get()
# Returns: 'Processing 1000 records per second with 95% accuracy.'
# Complex conditions
title_with_year = selector.xpath('//h1[has-class("title") and contains-number()]')
title_text = title_with_year.xpath('.//text()').get()
# Returns: 'Article About Data Science in 2024'
# Multiple custom functions
long_paragraphs_no_numbers = selector.xpath('//p[text-length-gt("20") and not(contains-number())]')
print(f"Long paragraphs without numbers: {len(long_paragraphs_no_numbers)}")
XPath extension functions include built-in validation and error handling.
Usage Example:
html = """
<div class="test">
<p class="item valid">Valid content</p>
<p class="item">Basic content</p>
</div>
"""
selector = Selector(text=html)
# Test error conditions
try:
# Empty class list - should raise ValueError
result = selector.xpath('//p[has-class()]')
except Exception as e:
print(f"Expected error for empty classes: {type(e).__name__}")
try:
# Non-string class argument - should raise ValueError
# Note: This would be caught during XPath evaluation
result = selector.xpath('//p[has-class("valid", 123)]')
except Exception as e:
print(f"Error for non-string argument: {type(e).__name__}")
# Valid usage
valid_items = selector.xpath('//p[has-class("item", "valid")]')
print(f"Valid items found: {len(valid_items)}")
XPath extension functions are optimized for repeated use:
Performance Example:
from parsel import Selector
# Large HTML document with many elements
html = """
<div class="container">
""" + "\n".join([
f'<p class="item type-{i % 3} {"active" if i % 5 == 0 else ""}">Item {i}</p>'
for i in range(1000)
]) + """
</div>
"""
selector = Selector(text=html)
# Efficient batch processing with has-class
# Argument validation is cached in the XPath evaluation context, so it runs
# once per xpath() call rather than once per candidate node
active_items = selector.xpath('//p[has-class("item", "active")]')
print(f"Found {len(active_items)} active items")
# Extract specific type with active status
active_type_0 = selector.xpath('//p[has-class("item", "type-0", "active")]')
print(f"Active type-0 items: {len(active_type_0)}")
XPath extension functions work seamlessly with standard XPath features:
html = """
<section class="products">
<div class="product featured premium">Premium Product A</div>
<div class="product featured">Featured Product B</div>
<div class="product premium">Premium Product C</div>
<div class="product">Basic Product D</div>
</section>
"""
selector = Selector(text=html)
# Combine with positional functions
first_featured = selector.xpath('(//div[has-class("product", "featured")])[1]')
first_featured_text = first_featured.xpath('.//text()').get()
# Returns: 'Premium Product A'
# Combine with text functions
premium_with_a = selector.xpath('//div[has-class("product", "premium") and contains(text(), "A")]')
# Combine with attribute checks
products_with_class = selector.xpath('//div[@class and has-class("product")]')
print(f"Products with class attribute: {len(products_with_class)}")
# Complex boolean logic
featured_or_premium = selector.xpath('//div[has-class("product") and (has-class("featured") or has-class("premium"))]')
print(f"Featured or premium products: {len(featured_or_premium)}")
Install with Tessl CLI
npx tessl i tessl/pypi-parsel