tessl/pypi-parsel

Parsel is a library to extract data from HTML and XML using XPath and CSS selectors

—

Pending

Overview

Eval results

Files

SelectorList Operations

Name: tessl/pypi-parsel
Author: tessl

Batch operations on multiple selectors with chainable methods for filtering, extracting, and transforming collections of selected elements. SelectorList extends Python's list class with selector-specific functionality.

Capabilities

SelectorList Class

A list subclass containing multiple Selector objects with chainable selection methods.

class SelectorList(List["Selector"]):
    """
    List of Selector objects with additional selection methods.

    Behaves like a regular Python list (len(), indexing, slicing,
    iteration) and adds selector-specific methods that operate on the
    contained Selector objects as a batch.
    """

    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union["Selector", "SelectorList[Selector]"]:
        """
        Get selector(s) by index or slice.

        Parameters:
        - pos (SupportsIndex or slice): Position of a single element, or a
          slice selecting a sub-range; negative indices count from the end

        Returns:
        - Single Selector when pos is an index
        - New SelectorList when pos is a slice, so the selection methods
          remain available on the result
        """

Batch Selection Operations

Apply selection queries across all selectors in the list.

def xpath(
    self,
    xpath: str,
    namespaces: Optional[Mapping[str, str]] = None,
    **kwargs: Any,
) -> "SelectorList[Selector]":
    """
    Call xpath() on each element and return flattened results.

    The per-element results are merged into a single flat SelectorList
    (not one list per element), so the result can be queried again.

    Parameters:
    - xpath (str): XPath expression to apply to every element
    - namespaces (Mapping[str, str], optional): Namespace prefix mappings
      (defaults to None)
    - **kwargs: XPath variable bindings

    Returns:
    SelectorList: Flattened results from all elements
    """

def css(self, query: str) -> "SelectorList[Selector]":
    """
    Call css() on each element and return flattened results.

    The per-element results are merged into a single flat SelectorList
    (not one list per element), so the result can be queried again.

    Parameters:
    - query (str): CSS selector to apply to every element; the usage
      examples also rely on the ::text pseudo-element to select text nodes

    Returns:
    SelectorList: Flattened results from all elements
    """

def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[Selector]":
    """
    Call jmespath() on each element and return flattened results.

    The per-element results are merged into a single flat SelectorList
    (not one list per element), so the result can be queried again.

    Parameters:
    - query (str): JMESPath expression to apply to every element
    - **kwargs: Additional jmespath options (presumably forwarded to each
      element's jmespath() call - confirm against the parsel reference)

    Returns:
    SelectorList: Flattened results from all elements
    """

Usage Example:

from parsel import Selector

html = """
<div class="product">
    <h2>Product 1</h2>
    <div class="details">
        <p class="price">$19.99</p>
        <p class="rating">4.5 stars</p>
    </div>
</div>
<div class="product">
    <h2>Product 2</h2>
    <div class="details">
        <p class="price">$29.99</p>
        <p class="rating">4.8 stars</p>
    </div>
</div>
"""

selector = Selector(text=html)

# Get all product containers
products = selector.css('.product')  # Returns SelectorList with 2 elements

# Chain selections - the query runs inside every product and the results are merged
all_prices = products.css('.price::text')  # SelectorList with price texts

# Chain XPath - the leading "." keeps the search relative to each product
all_headings = products.xpath('.//h2/text()')  # SelectorList with heading texts

# Further filter results - note this selects the matching .rating <p> elements,
# not their enclosing .product containers
high_ratings = products.css('.rating:contains("4.8")')  # SelectorList with the "4.8 stars" paragraph

Batch Content Extraction

Extract content from the first selector in the list (`get()`) or from every selector in the list (`getall()`).

def get(self, default: Optional[str] = None) -> Optional[str]:
    """
    Return get() result for the first element in the list.

    Only the first element is consulted; use getall() to extract from
    every element.

    Parameters:
    - default (str, optional): Value returned when the list is empty
      (defaults to None)

    Returns:
    str or None: Content of the first element, or default if the list
    is empty
    """

def getall(self) -> List[str]:
    """
    Call get() on each element and return all results.

    Returns:
    List[str]: Content from all elements in the list, one string per
    element (e.g. an HTML string per matched tag, or the text of each
    matched text node)
    """

# Legacy aliases (older names bound to the same methods):
# extract_first() is get(), extract() is getall()
extract_first = get
extract = getall

Usage Example:

# Continuing from previous example (selector wraps the two-product HTML)
products = selector.css('.product')

# Get content from first product only
first_product_html = products.get()  # HTML string of the first .product element

# Get content from all products
all_product_html = products.getall()  # List of HTML strings, one per product

# Extract all price values
price_texts = products.css('.price::text').getall()
# Returns: ['$19.99', '$29.99']

# Get first price only
first_price = products.css('.price::text').get()
# Returns: '$19.99'

# Get first price with default - the query matches nothing, so the
# resulting list is empty and get() falls back to the default
first_price_safe = products.css('.nonexistent::text').get(default='$0.00')
# Returns: '$0.00' since no elements match

Batch Regular Expression Operations

Apply regular expressions across all selectors in the list.

def re(
    self, regex: Union[str, Pattern[str]], replace_entities: bool = True
) -> List[str]:
    """
    Call re() on each element and return flattened results.

    Matches from all elements are merged into one flat list of strings.
    As the usage examples show, a pattern with a capturing group yields
    the captured text rather than the whole match.

    Parameters:
    - regex (str or Pattern): Regular expression pattern
    - replace_entities (bool): Replace HTML entities (defaults to True)

    Returns:
    List[str]: All regex matches from all elements
    """

def re_first(
    self,
    regex: Union[str, Pattern[str]],
    default: Optional[str] = None,
    replace_entities: bool = True,
) -> Optional[str]:
    """
    Call re() on elements until first match is found.

    Parameters:
    - regex (str or Pattern): Regular expression pattern
    - default (str, optional): Value returned if no matches are found
      (defaults to None)
    - replace_entities (bool): Replace HTML entities (defaults to True)

    Returns:
    str or None: First match across all elements, or default when
    nothing matches
    """

Usage Example:

# `products` is the SelectorList of the two .product elements built in the
# earlier examples. Patterns are raw strings with SINGLE backslashes so that
# \d, \. and \$ reach the regex engine as "digit", "literal dot" and
# "literal dollar sign".

# Extract all numeric values from all products
numbers = products.re(r'\d+\.\d+')
# Returns: ['19.99', '4.5', '29.99', '4.8']

# Get first numeric value found
first_number = products.re_first(r'\d+\.\d+')
# Returns: '19.99'

# Extract prices specifically (the capturing group drops the "$" sign)
prices = products.css('.price').re(r'\$([\d.]+)')
# Returns: ['19.99', '29.99']

# Extract ratings (the capturing group drops the " stars" suffix)
ratings = products.css('.rating').re(r'([\d.]+) stars')
# Returns: ['4.5', '4.8']

Attribute Access

Access attributes from the first element in the list.

@property
def attrib(self) -> Mapping[str, str]:
    """
    Return attributes dictionary for the first element.

    Accessed as a property (items.attrib), not called as a method.
    Only the first element is consulted; index or iterate the list to
    read the attributes of other elements.

    Returns:
    Mapping[str, str]: Attribute names mapped to values for the first
    element, empty dict if list is empty
    """

Usage Example:

html = """
<div class="item" data-id="1">Item 1</div>
<div class="item" data-id="2">Item 2</div>
"""

selector = Selector(text=html)
items = selector.css('.item')  # SelectorList with both .item elements

# Get attributes of first item (the second item's attributes are not included)
first_item_attrs = items.attrib
# Returns: {'class': 'item', 'data-id': '1'}

# Access specific attribute - attrib is a mapping, so .get() works as on a dict
first_item_id = items.attrib.get('data-id')
# Returns: '1'

Element Modification

Remove every matched element from the document in a single call.

def drop(self) -> None:
    """
    Drop all matched nodes from their parents.

    Removes each element in the list from its parent in the DOM, so
    later queries against the same document no longer find them.

    Returns:
    None
    """

def remove(self) -> None:
    """
    Remove all matched nodes from their parents.

    Deprecated: Use drop() instead.

    Returns:
    None
    """

Usage Example:

html = """
<div>
    <p class="temp">Temporary content</p>
    <p class="keep">Important content</p>
    <p class="temp">Another temp</p>
</div>
"""

selector = Selector(text=html)

# Remove all temporary paragraphs
temp_elements = selector.css('.temp')
temp_elements.drop()  # Removes both .temp elements from the parsed document

# Check remaining content - the document behind `selector` was modified,
# so a fresh query no longer sees the dropped paragraphs
remaining = selector.css('p').getall()
# Only the .keep paragraph remains

List Operations and Indexing

SelectorList supports all standard Python list operations:

# `selector` is assumed to wrap the two-product HTML from the first example
products = selector.css('.product')

# Length
count = len(products)  # Number of selected elements

# Indexing - returns a single Selector, not a SelectorList
first_product = products[0]  # First Selector
last_product = products[-1]  # Last Selector

# Slicing - returns a new SelectorList, so selection methods still work on it
first_two = products[:2]  # SelectorList with first 2 elements
even_products = products[::2]  # Every other product (indices 0, 2, ...)

# Iteration - each item is an individual Selector
for product in products:
    title = product.css('h2::text').get()
    print(title)

# List comprehension - one title (or None if no <h2> text matched) per product
titles = [p.css('h2::text').get() for p in products]

Chaining Operations

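The selection methods (`xpath()`, `css()`, `jmespath()`) return new SelectorList objects, so calls can be chained; extraction methods such as `get()`, `getall()`, `re()` and `re_first()` end the chain by returning plain strings or lists: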

# Complex chaining example. `selector` is assumed to wrap the two-product
# HTML from the first example. Every selection step returns a SelectorList;
# the final re() call ends the chain with a plain list of strings.
# The pattern is a raw string with single backslashes: \$ is a literal
# dollar sign and the capturing group keeps only the number after it.
product_details = (selector
    .css('.product')                    # Get all products -> SelectorList
    .css('.details')                    # Get details from each -> SelectorList
    .xpath('.//p[contains(@class, "price")]')  # Get price paragraphs -> SelectorList
    .css('::text')                      # Get text content -> SelectorList
    .re(r'\$([\d.]+)')                  # Extract price numbers -> List[str]
)
# Returns: ['19.99', '29.99']

Install with Tessl CLI