Parsel is a library to extract data from HTML and XML using XPath and CSS selectors
—
Utilities for converting CSS selectors to XPath expressions with support for pseudo-elements and custom CSS features. Parsel extends standard CSS selector capabilities with additional pseudo-elements for enhanced data extraction.
Convert CSS selectors to equivalent XPath expressions for internal processing.
def css2xpath(query: str) -> str:
"""
Convert CSS selector to XPath expression.
This is the main utility function for CSS-to-XPath translation using
the HTMLTranslator with pseudo-element support.
Parameters:
- query (str): CSS selector to convert
Returns:
str: Equivalent XPath expression
Examples:
- 'div.class' -> 'descendant-or-self::div[@class and contains(concat(" ", normalize-space(@class), " "), " class ")]'
- 'p::text' -> 'descendant-or-self::p/text()'
- 'a::attr(href)' -> 'descendant-or-self::a/@href'
"""
Usage Example:
from parsel import css2xpath
# Basic element selectors
div_xpath = css2xpath('div')
# Returns: 'descendant-or-self::div'
# Class selectors
class_xpath = css2xpath('.container')
# Returns: 'descendant-or-self::*[@class and contains(concat(" ", normalize-space(@class), " "), " container ")]'
# ID selectors
id_xpath = css2xpath('#main')
# Returns: 'descendant-or-self::*[@id = "main"]'
# Attribute selectors
attr_xpath = css2xpath('input[type="text"]')
# Returns: 'descendant-or-self::input[@type = "text"]'
# Descendant selectors
desc_xpath = css2xpath('div p')
# Returns: 'descendant-or-self::div/descendant-or-self::*/p'
# Child selectors
child_xpath = css2xpath('ul > li')
# Returns: 'descendant-or-self::ul/li'
# Pseudo-element selectors (Parsel extension)
text_xpath = css2xpath('p::text')
# Returns: 'descendant-or-self::p/text()'
attr_xpath = css2xpath('a::attr(href)')
# Returns: 'descendant-or-self::a/@href'
CSS to XPath translator for generic XML documents.
class GenericTranslator:
"""
CSS to XPath translator for generic XML documents.
Provides caching and pseudo-element support for XML parsing.
"""
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
"""
Convert CSS selector to XPath with caching.
Parameters:
- css (str): CSS selector to convert
- prefix (str): XPath prefix for the query
Returns:
str: XPath expression equivalent to CSS selector
Note:
- Results are cached (LRU cache with 256 entries)
- Supports pseudo-elements ::text and ::attr()
"""
CSS to XPath translator optimized for HTML documents.
class HTMLTranslator:
"""
CSS to XPath translator optimized for HTML documents.
Provides HTML-specific optimizations and pseudo-element support.
"""
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
"""
Convert CSS selector to XPath with HTML optimizations.
Parameters:
- css (str): CSS selector to convert
- prefix (str): XPath prefix for the query
Returns:
str: XPath expression optimized for HTML parsing
Note:
- Cached results (LRU cache with 256 entries)
- HTML-specific case handling and optimizations
- Supports pseudo-elements ::text and ::attr()
"""
Usage Example:
from parsel.csstranslator import GenericTranslator, HTMLTranslator
# Create translator instances
xml_translator = GenericTranslator()
html_translator = HTMLTranslator()
css_selector = 'article h2.title'
# Convert using XML translator
xml_xpath = xml_translator.css_to_xpath(css_selector)
# Returns XPath suitable for generic XML
# Convert using HTML translator
html_xpath = html_translator.css_to_xpath(css_selector)
# Returns XPath optimized for HTML parsing
# Both support pseudo-elements
text_css = 'p.content::text'
xml_text_xpath = xml_translator.css_to_xpath(text_css)
html_text_xpath = html_translator.css_to_xpath(text_css)
# Both return: 'descendant-or-self::p[@class and contains(...)]/text()'
Enhanced XPath expressions with pseudo-element support.
class XPathExpr:
"""
Extended XPath expression with pseudo-element support.
Extends cssselect's XPathExpr to handle ::text and ::attr() pseudo-elements.
"""
textnode: bool = False
attribute: Optional[str] = None
@classmethod
def from_xpath(
cls,
xpath: "XPathExpr",
textnode: bool = False,
attribute: Optional[str] = None,
) -> "XPathExpr":
"""
Create XPathExpr from existing expression with pseudo-element flags.
Parameters:
- xpath: Base XPath expression
- textnode (bool): Whether to target text nodes
- attribute (str, optional): Attribute name to target
Returns:
XPathExpr: Extended expression with pseudo-element support
"""
def __str__(self) -> str:
"""
Convert to string representation with pseudo-element handling.
Returns:
str: XPath string with text() or @attribute suffixes as needed
"""
Parsel extends CSS selectors with custom pseudo-elements for enhanced data extraction.
The ::text pseudo-element selects text content of elements.
Usage Example:
from parsel import Selector, css2xpath
html = """
<div class="content">
<h1>Main Title</h1>
<p>First paragraph with <em>emphasis</em> text.</p>
<p>Second paragraph.</p>
</div>
"""
selector = Selector(text=html)
# CSS with ::text pseudo-element
title_text = selector.css('h1::text').get()
# Returns: 'Main Title'
# Equivalent XPath (what css2xpath generates)
xpath_equivalent = css2xpath('h1::text')
# Returns: 'descendant-or-self::h1/text()'
# Manual XPath gives same result
manual_xpath = selector.xpath('//h1/text()').get()
# Returns: 'Main Title'
# Extract all text nodes from paragraphs
p_texts = selector.css('p::text').getall()
# Returns: ['First paragraph with ', ' text.', 'Second paragraph.']
# Note: Excludes the text inside the nested <em> element ('emphasis'), since ::text selects only direct child text nodes
The ::attr(name) pseudo-element selects attribute values.
Usage Example:
html = """
<div class="links">
<a href="https://example.com" title="Example Site">Example</a>
<a href="https://google.com" title="Search Engine">Google</a>
<img src="image.jpg" alt="Description" width="300">
</div>
"""
selector = Selector(text=html)
# Extract href attributes using ::attr() pseudo-element
hrefs = selector.css('a::attr(href)').getall()
# Returns: ['https://example.com', 'https://google.com']
# Extract title attributes
titles = selector.css('a::attr(title)').getall()
# Returns: ['Example Site', 'Search Engine']
# Extract image attributes
img_src = selector.css('img::attr(src)').get()
# Returns: 'image.jpg'
img_alt = selector.css('img::attr(alt)').get()
# Returns: 'Description'
# Check XPath conversion
attr_xpath = css2xpath('a::attr(href)')
# Returns: 'descendant-or-self::a/@href'
# Equivalent manual XPath
manual_hrefs = selector.xpath('//a/@href').getall()
# Returns: ['https://example.com', 'https://google.com']
Combine pseudo-elements with other CSS selector features.
Usage Example:
html = """
<article>
<header>
<h1 class="title">Article Title</h1>
<p class="meta">Published on <time datetime="2024-01-15">January 15, 2024</time></p>
</header>
<section class="content">
<p class="intro">Introduction paragraph.</p>
<p class="body">Main content paragraph.</p>
</section>
<footer>
<a href="/author/john" class="author-link">John Doe</a>
</footer>
</article>
"""
selector = Selector(text=html)
# Complex selectors with pseudo-elements
article_title = selector.css('header h1.title::text').get()
# Returns: 'Article Title'
# Get datetime attribute from time element within meta paragraph
datetime_attr = selector.css('.meta time::attr(datetime)').get()
# Returns: '2024-01-15'
# Get author link URL
author_url = selector.css('footer .author-link::attr(href)').get()
# Returns: '/author/john'
# Get content paragraph texts (excluding intro)
content_texts = selector.css('section.content p.body::text').getall()
# Returns: ['Main content paragraph.']
# Combine descendant and pseudo-element selectors
intro_text = selector.css('article section .intro::text').get()
# Returns: 'Introduction paragraph.'
Both GenericTranslator and HTMLTranslator use LRU caching for performance.
# Cache configuration
@lru_cache(maxsize=256)
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
# Translation logic with caching
The translation process handles pseudo-elements through dynamic dispatch:
Install with Tessl CLI
npx tessl i tessl/pypi-parsel