tessl/pypi-parsel

Parsel is a library to extract data from HTML and XML using XPath and CSS selectors

—

Pending

Overview

Eval results

Files

Document Parsing and Selection

Name: tessl/pypi-parsel
Author: tessl

Core functionality for parsing HTML, XML, JSON, and text documents through a unified selector interface that supports multiple query languages: XPath, CSS selectors, and JMESPath.

Capabilities

Selector Initialization

Create Selector instances from various input formats with configurable parsing options.

class Selector:
    """Unified selector over an HTML, XML, JSON, or plain-text document."""

    def __init__(
        self,
        text: Optional[str] = None,
        type: Optional[str] = None,
        body: bytes = b"",
        encoding: str = "utf-8",
        namespaces: Optional[Mapping[str, str]] = None,
        root: Optional[Any] = None,
        base_url: Optional[str] = None,
        _expr: Optional[str] = None,
        huge_tree: bool = True,
    ) -> None:
        """
        Initialize a Selector for parsing and selecting from documents.

        The document is supplied as ``text`` (a str), as ``body`` (raw bytes
        decoded with ``encoding``), or as an already parsed ``root``.

        Parameters:
        - text (str, optional): Text content to parse. Defaults to None.
        - type (str, optional): Document type - "html", "xml", "json", or "text".
          Defaults to None.
          NOTE(review): how the type is chosen when None is passed is not
          stated here - confirm against the parsel API reference.
        - body (bytes): Raw bytes content (alternative to text). Defaults to b"".
        - encoding (str): Character encoding for body content, defaults to "utf-8"
        - namespaces (dict, optional): XML namespace prefix mappings, i.e. a
          mapping of prefix to namespace URI such as {"ns": "http://example.com"}
        - root (Any, optional): Pre-parsed root element or data
        - base_url (str, optional): Base URL for resolving relative URLs
        - _expr (str, optional): Expression that created this selector. The
          leading underscore marks it as non-public; presumably it is filled in
          by the library itself and callers leave it unset - TODO confirm.
        - huge_tree (bool): Enable large document parsing support, defaults to True

        Raises:
        - ValueError: Invalid type or missing required arguments
        - TypeError: Invalid argument types
        """

Usage Example:

from parsel import Selector

# Parse HTML text; no `type` is passed, so it keeps its default of None
html_selector = Selector(text="<html><body><h1>Title</h1></body></html>")

# Parse XML by passing the type explicitly
xml_selector = Selector(text="<root><item>data</item></root>", type="xml")

# Parse a JSON document supplied as a string
json_selector = Selector(text='{"name": "value", "items": [1, 2, 3]}', type="json")

# Parse from raw bytes via `body`; encoding="utf-8" is already the default
# and is spelled out here only to make the decoding step visible
bytes_selector = Selector(body=b"<html><body>Content</body></html>", encoding="utf-8")

# Parse with XML namespaces: the "ns" prefix is mapped to the same URI that
# the document declares through xmlns:ns
ns_selector = Selector(
    text="<root xmlns:ns='http://example.com'><ns:item>data</ns:item></root>",
    type="xml",
    namespaces={"ns": "http://example.com"}
)

XPath Selection

Execute XPath expressions for precise element selection with namespace support and variable binding.

def xpath(
    self,
    query: str,
    namespaces: Optional[Mapping[str, str]] = None,
    **kwargs: Any,
) -> SelectorList["Selector"]:
    """
    Find nodes matching the XPath query.

    Parameters:
    - query (str): XPath expression to execute
    - namespaces (dict, optional): Additional namespace prefix mappings
      (prefix -> namespace URI). Defaults to None.
    - **kwargs: Variable bindings for XPath variables; each keyword name is
      referenced in the query as ``$name``, e.g.
      ``xpath('//a[@href=$url]', url="http://example.com")``

    Returns:
    SelectorList: Collection of matching Selector objects

    Raises:
    - ValueError: Invalid XPath expression or unsupported selector type
    - XPathError: XPath syntax or evaluation errors
      NOTE(review): XPathError is not defined or imported in this snippet;
      presumably it is the XPath exception of the underlying XML library.
      Confirm whether it reaches callers or is re-raised as ValueError.
    """

Usage Example:

selector = Selector(text="""
<html>
    <body>
        <div class="content">
            <p>First paragraph</p>
            <p>Second paragraph</p>
        </div>
        <a href="http://example.com">Link</a>
    </body>
</html>
""")

# Select every <p> element in the document (both paragraphs here)
paragraphs = selector.xpath('//p')

# Select the text nodes that are direct children of each <p>
text_nodes = selector.xpath('//p/text()')

# Select the href attribute of every <a> element
hrefs = selector.xpath('//a/@href')

# Use XPath variables: $url in the query is bound by the url= keyword argument
links = selector.xpath('//a[@href=$url]', url="http://example.com")

# Positional predicate: <p> elements under the div whose class attribute is
# exactly "content", skipping the first <p> child (here: "Second paragraph")
content_divs = selector.xpath('//div[@class="content"]//p[position()>1]')

CSS Selection

Apply CSS selectors with support for pseudo-elements and advanced CSS features.

def css(self, query: str) -> SelectorList["Selector"]:
    """
    Apply CSS selector and return matching elements.

    Besides element selectors, the query may end in a pseudo-element such as
    ``::text`` (text content) or ``::attr(name)`` (attribute value), e.g.
    ``css('h1::text')`` or ``css('a::attr(href)')``.

    Parameters:
    - query (str): CSS selector expression

    Returns:
    SelectorList: Collection of matching Selector objects

    Raises:
    - ValueError: Invalid CSS selector or unsupported selector type
    - ExpressionError: CSS syntax errors
      NOTE(review): ExpressionError is not defined or imported in this
      snippet; presumably it comes from the CSS-to-XPath translation library.
      Confirm the exact exception type raised for malformed selectors.
    """

Usage Example:

selector = Selector(text="""
<html>
    <body>
        <div class="container">
            <h1 id="title">Main Title</h1>
            <p class="intro">Introduction text</p>
            <ul>
                <li><a href="link1.html">Link 1</a></li>
                <li><a href="link2.html">Link 2</a></li>
            </ul>
        </div>
    </body>
</html>
""")

# Select by class (the <p class="intro"> element)
intro = selector.css('.intro')

# Select by ID (the <h1 id="title"> element)
title = selector.css('#title')

# Descendant combinator: every <a> nested anywhere inside .container
links = selector.css('.container a')

# ::text pseudo-element: select the text content instead of the element
title_text = selector.css('h1::text')

# ::attr(href) pseudo-element: select the href attribute values
link_urls = selector.css('a::attr(href)')

# Structural pseudo-class: links inside the <li> that is the first child of
# its parent within a <ul> (here: the "Link 1" anchor)
first_link = selector.css('ul li:first-child a')

JMESPath Selection

Query JSON data using JMESPath expressions for complex data extraction.

def jmespath(self, query: str, **kwargs: Any) -> SelectorList["Selector"]:
    """
    Find objects matching the JMESPath query for JSON data.

    Parameters:
    - query (str): JMESPath expression to apply, e.g. ``'users[*].name'``
    - **kwargs: Additional options passed to jmespath.search()

    Returns:
    SelectorList: Collection of matching Selector objects with extracted data

    Note:
    - Works with JSON-type selectors or JSON content within HTML/XML elements
      (for the latter, select the text holding the JSON first, e.g.
      ``css('script::text').jmespath('config.theme')``)
    - Results are wrapped in new Selector objects for chaining
    - NOTE(review): no exceptions are documented here; confirm what happens
      when the query is malformed or the selected content is not valid JSON.
    """

Usage Example:

# JSON document used by the queries below
json_text = '''
{
    "users": [
        {"name": "Alice", "age": 30, "email": "alice@example.com"},
        {"name": "Bob", "age": 25, "email": "bob@example.com"}
    ],
    "metadata": {
        "total": 2,
        "page": 1
    }
}
'''

selector = Selector(text=json_text, type="json")

# Extract all user names ([*] projects over every entry of "users")
names = selector.jmespath('users[*].name')

# Extract a specific user by index (0 is the first entry, Alice)
first_user = selector.jmespath('users[0]')

# Filter expression: emails of users whose age is at least 30; the backticks
# delimit a JSON literal in JMESPath
adult_emails = selector.jmespath('users[?age >= `30`].email')

# Nested data extraction: despite the variable name, this selects only the
# "total" value inside "metadata", not the whole metadata object
metadata = selector.jmespath('metadata.total')

# JSON within HTML: the JSON payload lives in the text of a <script> element
html_with_json = """
<script type="application/json">
{"config": {"theme": "dark", "version": "1.0"}}
</script>
"""
html_selector = Selector(text=html_with_json)
# Select the script text with CSS first, then run JMESPath on that result
theme = html_selector.css('script::text').jmespath('config.theme')