Parsel is a library to extract data from HTML and XML using XPath and CSS selectors
npx @tessl/cli install tessl/pypi-parsel@1.10.0

Parsel is a library to extract data from HTML, XML, and JSON documents using XPath and CSS selectors. It provides a unified API through the Selector and SelectorList classes that enables developers to chain operations and extract data from web documents efficiently with support for XPath expressions, CSS selectors, JMESPath for JSON, and regular expressions.
pip install parsel
from parsel import Selector, SelectorList
Direct module imports:
from parsel import css2xpath
from parsel import xpathfuncs
from parsel import Selector
# Parse HTML document
html = """
<html>
<body>
<h1>Hello, Parsel!</h1>
<ul>
<li><a href="http://example.com">Link 1</a></li>
<li><a href="http://scrapy.org">Link 2</a></li>
</ul>
<script type="application/json">{"a": ["b", "c"]}</script>
</body>
</html>
"""
selector = Selector(text=html)
# Extract text using CSS selectors
title = selector.css('h1::text').get() # 'Hello, Parsel!'
# Extract links using XPath
for li in selector.css('ul > li'):
    href = li.xpath('.//@href').get()
    print(href)
# Extract and parse JSON content
json_data = selector.css('script::text').jmespath("a").getall() # ['b', 'c']
# Use regular expressions
words = selector.xpath('//h1/text()').re(r'\w+') # ['Hello', 'Parsel']
Parsel's architecture centers around two main classes: Selector and SelectorList.
The library supports multiple parsing strategies: HTML, XML, JSON, and plain text.
Core functionality for parsing HTML, XML, JSON, and text documents with unified selector interface supporting multiple query languages.
class Selector:
def __init__(
self,
text: Optional[str] = None,
type: Optional[str] = None,
body: bytes = b"",
encoding: str = "utf-8",
namespaces: Optional[Mapping[str, str]] = None,
root: Optional[Any] = None,
base_url: Optional[str] = None,
_expr: Optional[str] = None,
huge_tree: bool = True,
) -> None: ...
def xpath(
self,
query: str,
namespaces: Optional[Mapping[str, str]] = None,
**kwargs: Any,
) -> SelectorList["Selector"]: ...
def css(self, query: str) -> SelectorList["Selector"]: ...
def jmespath(self, query: str, **kwargs: Any) -> SelectorList["Selector"]: ...
Document Parsing and Selection
Methods for extracting text content, attributes, and serialized data from selected elements with support for entity replacement and formatting.
def get(self) -> Any: ...
def getall(self) -> List[str]: ...
def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
) -> List[str]: ...
def re_first(
self,
regex: Union[str, Pattern[str]],
default: Optional[str] = None,
replace_entities: bool = True,
) -> Optional[str]: ...
@property
def attrib(self) -> Dict[str, str]: ...
Batch operations on multiple selectors with chainable methods for filtering, extracting, and transforming collections of selected elements.
class SelectorList(List["Selector"]):
def xpath(
self,
xpath: str,
namespaces: Optional[Mapping[str, str]] = None,
**kwargs: Any,
) -> "SelectorList[Selector]": ...
def css(self, query: str) -> "SelectorList[Selector]": ...
def jmespath(self, query: str, **kwargs: Any) -> "SelectorList[Selector]": ...
def get(self, default: Optional[str] = None) -> Optional[str]: ...
def getall(self) -> List[str]: ...
Functionality for working with XML namespaces including registration, removal, and namespace-aware queries.
def register_namespace(self, prefix: str, uri: str) -> None: ...
def remove_namespaces(self) -> None: ...
Methods for removing and modifying DOM elements within the parsed document structure.
def drop(self) -> None: ...
def remove(self) -> None: ... # deprecated
Utilities for converting CSS selectors to XPath expressions with support for pseudo-elements and custom CSS features.
def css2xpath(query: str) -> str: ...
class GenericTranslator:
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: ...
class HTMLTranslator:
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: ...
Custom XPath functions for enhanced element selection including CSS class checking and other utility functions.
def set_xpathfunc(fname: str, func: Optional[Callable]) -> None: ...
def has_class(context: Any, *classes: str) -> bool: ...
def setup() -> None: ...
# Type aliases
_SelectorType = TypeVar("_SelectorType", bound="Selector")
_ParserType = Union[etree.XMLParser, etree.HTMLParser]
_TostringMethodType = Literal["html", "xml"]
# Exception classes
class CannotRemoveElementWithoutRoot(Exception): ...
class CannotRemoveElementWithoutParent(Exception): ...
class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent): ...
# CSS Translator classes
class XPathExpr:
textnode: bool
attribute: Optional[str]
@classmethod
def from_xpath(
cls,
xpath: "XPathExpr",
textnode: bool = False,
attribute: Optional[str] = None
) -> "XPathExpr": ...