Parsel is a library to extract data from HTML and XML using XPath and CSS selectors
—
Core functionality for parsing HTML, XML, JSON, and text documents with unified selector interface supporting multiple query languages including XPath, CSS selectors, and JMESPath.
Create Selector instances from various input formats with configurable parsing options.
class Selector:
def __init__(
self,
text: Optional[str] = None,
type: Optional[str] = None,
body: bytes = b"",
encoding: str = "utf-8",
namespaces: Optional[Mapping[str, str]] = None,
root: Optional[Any] = None,
base_url: Optional[str] = None,
_expr: Optional[str] = None,
huge_tree: bool = True,
) -> None:
"""
Initialize a Selector for parsing and selecting from documents.
Parameters:
- text (str, optional): Text content to parse
- type (str, optional): Document type - "html", "xml", "json", or "text"
- body (bytes): Raw bytes content (alternative to text)
- encoding (str): Character encoding for body content, defaults to "utf-8"
- namespaces (dict, optional): XML namespace prefix mappings
- root (Any, optional): Pre-parsed root element or data
- base_url (str, optional): Base URL for resolving relative URLs
- _expr (str, optional): Expression that created this selector
- huge_tree (bool): Enable large document parsing support, defaults to True
Raises:
- ValueError: Invalid type or missing required arguments
- TypeError: Invalid argument types
"""

Usage Example:
from parsel import Selector
# Parse HTML text
html_selector = Selector(text="<html><body><h1>Title</h1></body></html>")
# Parse XML with explicit type
xml_selector = Selector(text="<root><item>data</item></root>", type="xml")
# Parse JSON
json_selector = Selector(text='{"name": "value", "items": [1, 2, 3]}', type="json")
# Parse from bytes with encoding
bytes_selector = Selector(body=b"<html><body>Content</body></html>", encoding="utf-8")
# Parse with XML namespaces
ns_selector = Selector(
text="<root xmlns:ns='http://example.com'><ns:item>data</ns:item></root>",
type="xml",
namespaces={"ns": "http://example.com"}
)

Execute XPath expressions for precise element selection with namespace support and variable binding.
def xpath(
self,
query: str,
namespaces: Optional[Mapping[str, str]] = None,
**kwargs: Any,
) -> SelectorList["Selector"]:
"""
Find nodes matching the XPath query.
Parameters:
- query (str): XPath expression to execute
- namespaces (dict, optional): Additional namespace prefix mappings
- **kwargs: Variable bindings for XPath variables
Returns:
SelectorList: Collection of matching Selector objects
Raises:
- ValueError: Invalid XPath expression or unsupported selector type
- XPathError: XPath syntax or evaluation errors
"""

Usage Example:
selector = Selector(text="""
<html>
<body>
<div class="content">
<p>First paragraph</p>
<p>Second paragraph</p>
</div>
<a href="http://example.com">Link</a>
</body>
</html>
""")
# Select all paragraphs
paragraphs = selector.xpath('//p')
# Select text content
text_nodes = selector.xpath('//p/text()')
# Select attributes
hrefs = selector.xpath('//a/@href')
# Use XPath variables
links = selector.xpath('//a[@href=$url]', url="http://example.com")
# Complex XPath expressions
content_divs = selector.xpath('//div[@class="content"]//p[position()>1]')

Apply CSS selectors with support for pseudo-elements and advanced CSS features.
def css(self, query: str) -> SelectorList["Selector"]:
"""
Apply CSS selector and return matching elements.
Parameters:
- query (str): CSS selector expression
Returns:
SelectorList: Collection of matching Selector objects
Raises:
- ValueError: Invalid CSS selector or unsupported selector type
- ExpressionError: CSS syntax errors
"""

Usage Example:
selector = Selector(text="""
<html>
<body>
<div class="container">
<h1 id="title">Main Title</h1>
<p class="intro">Introduction text</p>
<ul>
<li><a href="link1.html">Link 1</a></li>
<li><a href="link2.html">Link 2</a></li>
</ul>
</div>
</body>
</html>
""")
# Select by class
intro = selector.css('.intro')
# Select by ID
title = selector.css('#title')
# Select descendants
links = selector.css('.container a')
# Pseudo-element selectors for text content
title_text = selector.css('h1::text')
# Pseudo-element selectors for attributes
link_urls = selector.css('a::attr(href)')
# Complex selectors
first_link = selector.css('ul li:first-child a')

Query JSON data using JMESPath expressions for complex data extraction.
def jmespath(self, query: str, **kwargs: Any) -> SelectorList["Selector"]:
"""
Find objects matching the JMESPath query for JSON data.
Parameters:
- query (str): JMESPath expression to apply
- **kwargs: Additional options passed to jmespath.search()
Returns:
SelectorList: Collection of matching Selector objects with extracted data
Note:
- Works with JSON-type selectors or JSON content within HTML/XML elements
- Results are wrapped in new Selector objects for chaining
"""

Usage Example:
# JSON document
json_text = '''
{
"users": [
{"name": "Alice", "age": 30, "email": "alice@example.com"},
{"name": "Bob", "age": 25, "email": "bob@example.com"}
],
"metadata": {
"total": 2,
"page": 1
}
}
'''
selector = Selector(text=json_text, type="json")
# Extract all user names
names = selector.jmespath('users[*].name')
# Extract specific user
first_user = selector.jmespath('users[0]')
# Complex queries
adult_emails = selector.jmespath('users[?age >= `30`].email')
# Nested data extraction
metadata = selector.jmespath('metadata.total')
# JSON within HTML
html_with_json = """
<script type="application/json">
{"config": {"theme": "dark", "version": "1.0"}}
</script>
"""
html_selector = Selector(text=html_with_json)
theme = html_selector.css('script::text').jmespath('config.theme')

Parsel automatically detects document types or allows explicit specification:
Auto-detection works by examining content structure:
Install with Tessl CLI
npx tessl i tessl/pypi-parsel