Parsel is a library to extract data from HTML and XML using XPath and CSS selectors
—
Functionality for working with XML namespaces including registration, removal, and namespace-aware queries. Essential for parsing XML documents with namespace declarations.
Register XML namespaces for use in XPath expressions.
def register_namespace(self, prefix: str, uri: str) -> None:
"""
Register namespace prefix for use in XPath expressions.
Parameters:
- prefix (str): Namespace prefix to register
- uri (str): Namespace URI
Note:
- Registered namespaces persist for the lifetime of the Selector
- Allows XPath expressions to use registered prefixes
- Does not affect document structure, only query capability
"""

Usage Example:
from parsel import Selector
xml_content = """
<root xmlns:books="http://example.com/books"
xmlns:authors="http://example.com/authors">
<books:catalog>
<books:book id="1">
<books:title>Python Guide</books:title>
<authors:author>John Doe</authors:author>
</books:book>
<books:book id="2">
<books:title>Web Scraping</books:title>
<authors:author>Jane Smith</authors:author>
</books:book>
</books:catalog>
</root>
"""
selector = Selector(text=xml_content, type="xml")
# Register namespaces for XPath queries
selector.register_namespace('b', 'http://example.com/books')
selector.register_namespace('a', 'http://example.com/authors')
# Now can use registered prefixes in XPath
books = selector.xpath('//b:book')
titles = selector.xpath('//b:title/text()').getall()
# Returns: ['Python Guide', 'Web Scraping']
authors = selector.xpath('//a:author/text()').getall()
# Returns: ['John Doe', 'Jane Smith']
# Use registered namespaces in attribute selection
book_ids = selector.xpath('//b:book/@id').getall()
# Returns: ['1', '2']

Parsel includes built-in namespace registrations for common standards.
# Built-in namespace registrations
_default_namespaces = {
"re": "http://exslt.org/regular-expressions",
"set": "http://exslt.org/sets",
}

Usage Example:
# Built-in 're' namespace for regex functions in XPath
xml_with_data = """
<items>
<item>Product ABC-123</item>
<item>Product XYZ-456</item>
<item>Service DEF-789</item>
</items>
"""
selector = Selector(text=xml_with_data, type="xml")
# Use built-in 're' namespace for regex matching
products_only = selector.xpath('//item[re:match(text(), "Product.*")]')
product_texts = products_only.xpath('.//text()').getall()
# Returns: ['Product ABC-123', 'Product XYZ-456']
# Select the text nodes that contain a numeric code (the predicate only filters; it does not return the captured group)
codes = selector.xpath('//item/text()[re:match(., ".*-(\d+)")]')

Remove all namespace declarations from XML documents for simplified processing.
def remove_namespaces(self) -> None:
"""
Remove all namespaces from the document.
This operation:
- Removes namespace prefixes from element and attribute names
- Removes namespace declarations
- Enables namespace-less XPath queries
- Modifies the document structure permanently
Note:
- Irreversible operation on the current Selector
- Useful when namespace complexity interferes with data extraction
- Use with caution as it changes document semantics
"""

Usage Example:
xml_with_namespaces = """
<root xmlns:product="http://example.com/product"
xmlns:meta="http://example.com/metadata">
<product:catalog meta:version="1.0">
<product:item product:id="123" meta:created="2024-01-01">
<product:name>Widget</product:name>
<product:price>19.99</product:price>
</product:item>
</product:catalog>
</root>
"""
selector = Selector(text=xml_with_namespaces, type="xml")
# Before namespace removal - requires namespace registration
selector.register_namespace('p', 'http://example.com/product')
selector.register_namespace('m', 'http://example.com/metadata')
names_with_ns = selector.xpath('//p:name/text()').getall()
# Remove all namespaces
selector.remove_namespaces()
# After namespace removal - simple XPath works
names_without_ns = selector.xpath('//name/text()').getall()
# Returns: ['Widget']
# Attributes also lose namespace prefixes
item_id = selector.xpath('//item/@id').get()
# Returns: '123'
# All namespace-prefixed elements become simple elements
all_items = selector.xpath('//item')
all_catalogs = selector.xpath('//catalog')

Use namespaces in XPath expressions with proper prefix handling.
Usage Example:
# Complex XML with multiple namespaces
complex_xml = """
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
xmlns:web="http://example.com/webservice">
<soap:Header>
<web:Authentication>
<web:Token>abc123</web:Token>
</web:Authentication>
</soap:Header>
<soap:Body>
<web:GetDataResponse>
<web:Data>
<web:Record id="1">
<web:Name>Alice</web:Name>
<web:Age>30</web:Age>
</web:Record>
<web:Record id="2">
<web:Name>Bob</web:Name>
<web:Age>25</web:Age>
</web:Record>
</web:Data>
</web:GetDataResponse>
</soap:Body>
</soap:Envelope>
"""
selector = Selector(text=complex_xml, type="xml")
# Register both namespaces
selector.register_namespace('soap', 'http://schemas.xmlsoap.org/soap/envelope/')
selector.register_namespace('web', 'http://example.com/webservice')
# Extract authentication token
token = selector.xpath('//web:Token/text()').get()
# Returns: 'abc123'
# Extract all record data
records = selector.xpath('//web:Record')
for record in records:
record_id = record.xpath('./@id').get()
name = record.xpath('.//web:Name/text()').get()
age = record.xpath('.//web:Age/text()').get()
print(f"Record {record_id}: {name}, age {age}")
# Extract names using registered namespaces
all_names = selector.xpath('//web:Name/text()').getall()
# Returns: ['Alice', 'Bob']

Pass namespaces to individual XPath queries without permanent registration.
Usage Example:
xml_content = """
<root xmlns:temp="http://temp.namespace.com">
<temp:data>
<temp:item>Value 1</temp:item>
<temp:item>Value 2</temp:item>
</temp:data>
</root>
"""
selector = Selector(text=xml_content, type="xml")
# Pass namespaces directly to xpath() call
temp_namespaces = {'temp': 'http://temp.namespace.com'}
items = selector.xpath('//temp:item/text()', namespaces=temp_namespaces).getall()
# Returns: ['Value 1', 'Value 2']
# Combine registered and runtime namespaces
selector.register_namespace('root', 'http://temp.namespace.com')
# Runtime namespaces supplement registered ones
data = selector.xpath('//root:data', namespaces={'extra': 'http://extra.com'})

Install with Tessl CLI
npx tessl i tessl/pypi-parsel