Parsel is a library to extract data from HTML and XML using XPath and CSS selectors
—
Batch operations on multiple selectors with chainable methods for filtering, extracting, and transforming collections of selected elements. SelectorList extends Python's list class with selector-specific functionality.
A list subclass containing multiple Selector objects with chainable selection methods.
class SelectorList(List["Selector"]):
"""
List of Selector objects with additional selection methods.
Supports all standard list operations plus selector-specific methods
for batch processing of multiple elements.
"""
def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union["Selector", "SelectorList[Selector]"]:
"""
Get selector(s) by index or slice.
Parameters:
- pos: Index or slice object
Returns:
- Single Selector for index access
- New SelectorList for slice access
"""
Apply selection queries across all selectors in the list.
def xpath(
self,
xpath: str,
namespaces: Optional[Mapping[str, str]] = None,
**kwargs: Any,
) -> "SelectorList[Selector]":
"""
Call xpath() on each element and return flattened results.
Parameters:
- xpath (str): XPath expression to apply
- namespaces (dict, optional): Namespace prefix mappings
- **kwargs: XPath variable bindings
Returns:
SelectorList: Flattened results from all elements
"""
def css(self, query: str) -> "SelectorList[Selector]":
"""
Call css() on each element and return flattened results.
Parameters:
- query (str): CSS selector to apply
Returns:
SelectorList: Flattened results from all elements
"""
def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[Selector]":
"""
Call jmespath() on each element and return flattened results.
Parameters:
- query (str): JMESPath expression to apply
- **kwargs: Additional jmespath options
Returns:
SelectorList: Flattened results from all elements
"""
Usage Example:
from parsel import Selector
html = """
<div class="product">
<h2>Product 1</h2>
<div class="details">
<p class="price">$19.99</p>
<p class="rating">4.5 stars</p>
</div>
</div>
<div class="product">
<h2>Product 2</h2>
<div class="details">
<p class="price">$29.99</p>
<p class="rating">4.8 stars</p>
</div>
</div>
"""
selector = Selector(text=html)
# Get all product containers
products = selector.css('.product') # Returns SelectorList with 2 elements
# Chain selections - extract all prices from all products
all_prices = products.css('.price::text') # SelectorList with price texts
# Chain XPath - get all headings from all products
all_headings = products.xpath('.//h2/text()') # SelectorList with heading texts
# Further filter results
high_ratings = products.css('.rating:contains("4.8")') # Rating elements whose text contains "4.8"
Extract content from all selectors in the list.
def get(self, default: Optional[str] = None) -> Optional[str]:
"""
Return get() result for the first element in the list.
Parameters:
- default (str, optional): Value if list is empty
Returns:
str or None: Content of first element or default
"""
def getall(self) -> List[str]:
"""
Call get() on each element and return all results.
Returns:
List[str]: Content from all elements in the list
"""
# Legacy aliases
extract_first = get
extract = getall
Usage Example:
# Continuing from previous example
products = selector.css('.product')
# Get content from first product only
first_product_html = products.get()
# Get content from all products
all_product_html = products.getall() # List of HTML strings
# Extract all price values
price_texts = products.css('.price::text').getall()
# Returns: ['$19.99', '$29.99']
# Get first price only
first_price = products.css('.price::text').get()
# Returns: '$19.99'
# Get first price with default
first_price_safe = products.css('.nonexistent::text').get(default='$0.00')
# Returns: '$0.00' since no elements match
Apply regular expressions across all selectors in the list.
def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
) -> List[str]:
"""
Call re() on each element and return flattened results.
Parameters:
- regex (str or Pattern): Regular expression pattern
- replace_entities (bool): Replace HTML entities
Returns:
List[str]: All regex matches from all elements
"""
def re_first(
self,
regex: Union[str, Pattern[str]],
default: Optional[str] = None,
replace_entities: bool = True,
) -> Optional[str]:
"""
Call re() on elements until first match is found.
Parameters:
- regex (str or Pattern): Regular expression pattern
- default (str, optional): Value if no matches found
- replace_entities (bool): Replace HTML entities
Returns:
str or None: First match across all elements or default
"""
Usage Example:
# Extract all numeric values from all products
numbers = products.re(r'\d+\.\d+')
# Returns: ['19.99', '4.5', '29.99', '4.8']
# Get first numeric value found
first_number = products.re_first(r'\d+\.\d+')
# Returns: '19.99'
# Extract prices specifically
prices = products.css('.price').re(r'\$([\d.]+)')
# Returns: ['19.99', '29.99']
# Extract ratings
ratings = products.css('.rating').re(r'([\d.]+) stars')
# Returns: ['4.5', '4.8']
Access attributes from the first element in the list.
@property
def attrib(self) -> Mapping[str, str]:
"""
Return attributes dictionary for the first element.
Returns:
Mapping[str, str]: Attributes of first element, empty dict if list is empty
"""
Usage Example:
html = """
<div class="item" data-id="1">Item 1</div>
<div class="item" data-id="2">Item 2</div>
"""
selector = Selector(text=html)
items = selector.css('.item')
# Get attributes of first item
first_item_attrs = items.attrib
# Returns: {'class': 'item', 'data-id': '1'}
# Access specific attribute
first_item_id = items.attrib.get('data-id')
# Returns: '1'
Remove or modify elements in batch operations.
def drop(self) -> None:
"""
Drop all matched nodes from their parents.
Removes each element in the list from its parent in the DOM.
"""
def remove(self) -> None:
"""
Remove all matched nodes from their parents.
Deprecated: Use drop() instead.
"""
Usage Example:
html = """
<div>
<p class="temp">Temporary content</p>
<p class="keep">Important content</p>
<p class="temp">Another temp</p>
</div>
"""
selector = Selector(text=html)
# Remove all temporary paragraphs
temp_elements = selector.css('.temp')
temp_elements.drop() # Removes both .temp elements
# Check remaining content
remaining = selector.css('p').getall()
# Only the .keep paragraph remains
SelectorList supports all standard Python list operations:
products = selector.css('.product')
# Length
count = len(products) # Number of selected elements
# Indexing
first_product = products[0] # First Selector
last_product = products[-1] # Last Selector
# Slicing
first_two = products[:2] # SelectorList with first 2 elements
even_products = products[::2] # Every other product
# Iteration
for product in products:
title = product.css('h2::text').get()
print(title)
# List comprehension
titles = [p.css('h2::text').get() for p in products]
SelectorList methods return new SelectorList objects, enabling method chaining:
# Complex chaining example
product_details = (selector
.css('.product') # Get all products -> SelectorList
.css('.details') # Get details from each -> SelectorList
.xpath('.//p[contains(@class, "price")]') # Get price paragraphs -> SelectorList
.css('::text') # Get text content -> SelectorList
.re(r'\$([\d.]+)') # Extract price numbers -> List[str]
)
Install with Tessl CLI
npx tessl i tessl/pypi-parsel