Parsel is a library to extract data from HTML and XML using XPath and CSS selectors
—
Methods for extracting text content, attributes, and serialized data from selected elements with support for entity replacement, regex matching, and various output formats.
Extract the full content of selected elements as strings with proper formatting.
def get(self) -> Any:
"""
Serialize and return the matched node content.
Returns:
- For HTML/XML: String representation with percent-encoded content unquoted
- For JSON/text: Raw data as-is
- For boolean values: "1" for True, "0" for False
- For other types: String conversion
Note:
- Uses appropriate serialization method based on document type
- Preserves XML/HTML structure in output
"""
def getall(self) -> List[str]:
"""
Serialize and return the matched node in a 1-element list.
Returns:
List[str]: Single-element list containing serialized content
"""
# Legacy alias
extract = get
Usage Example:
from parsel import Selector
html = """
<div class="content">
<p>First <strong>bold</strong> paragraph</p>
<p>Second paragraph</p>
</div>
"""
selector = Selector(text=html)
# Extract full element with tags
full_content = selector.css('.content').get()
# Returns: '<div class="content">\n<p>First <strong>bold</strong> paragraph</p>\n<p>Second paragraph</p>\n</div>'
# Extract text content only
text_only = selector.css('.content p::text').getall()
# Returns: ['First ', ' paragraph', 'Second paragraph']
# Extract as single item
first_text = selector.css('.content p::text').get()
# Returns: 'First '
Apply regular expressions to extracted content with optional entity replacement.
def re(
self, regex: Union[str, Pattern[str]], replace_entities: bool = True
) -> List[str]:
"""
Apply regex and return list of matching strings.
Parameters:
- regex (str or Pattern): Regular expression pattern
- replace_entities (bool): Replace HTML character entities with their characters, except &amp; and &lt;
Returns:
List[str]: All regex matches from the content
Extraction rules:
- Named group "extract": Returns only the named group content
- Multiple numbered groups: Returns all groups flattened
- No groups: Returns entire regex matches
"""
def re_first(
self,
regex: Union[str, Pattern[str]],
default: Optional[str] = None,
replace_entities: bool = True,
) -> Optional[str]:
"""
Apply regex and return first matching string.
Parameters:
- regex (str or Pattern): Regular expression pattern
- default (str, optional): Value to return if no match found
- replace_entities (bool): Replace HTML character entities with their characters, except &amp; and &lt;
Returns:
str or None: First match or default value
"""
Usage Example:
html = """
<div>
<p>Price: $25.99</p>
<p>Discount: 15%</p>
<p>Contact: user@example.com</p>
</div>
"""
selector = Selector(text=html)
# Extract all numbers
numbers = selector.css('div').re(r'\d+\.?\d*')
# Returns: ['25.99', '15']
# Extract email addresses
emails = selector.css('div').re(r'[\w.-]+@[\w.-]+\.\w+')
# Returns: ['user@example.com']
# Extract with named groups
prices = selector.css('div').re(r'Price: \$(?P<extract>\d+\.\d+)')
# Returns: ['25.99']
# Get first match with default
first_number = selector.css('div').re_first(r'\d+', default='0')
# Returns: '25'
# Extract from specific elements
contact_email = selector.css('p:contains("Contact")').re_first(r'[\w.-]+@[\w.-]+\.\w+')
# Returns: 'user@example.com'
Access element attributes through the attrib property.
@property
def attrib(self) -> Dict[str, str]:
"""
Return the attributes dictionary for underlying element.
Returns:
Dict[str, str]: All attributes as key-value pairs
Note:
- Empty dict for non-element nodes
- Converts lxml attrib to standard dict
"""
Usage Example:
html = """
<div class="container" id="main" data-value="123">
<a href="https://example.com" target="_blank" title="External Link">Link</a>
<img src="image.jpg" alt="Description" width="300" height="200">
</div>
"""
selector = Selector(text=html)
# Get all attributes of div
div_attrs = selector.css('div').attrib
# Returns: {'class': 'container', 'id': 'main', 'data-value': '123'}
# Get all attributes of link
link_attrs = selector.css('a').attrib
# Returns: {'href': 'https://example.com', 'target': '_blank', 'title': 'External Link'}
# Access specific attribute values
href_value = selector.css('a').attrib.get('href')
# Returns: 'https://example.com'
# Check for attribute existence
has_target = 'target' in selector.css('a').attrib
# Returns: True
Control HTML entity replacement in text extraction.
Usage Example:
html = """
<p>Price: &lt; $100 &amp; shipping included &gt;</p>
<p>Copyright &copy; 2024</p>
"""
selector = Selector(text=html)
# With entity replacement (default): &gt; becomes '>', while &lt; and &amp; are kept
text_with_entities = selector.css('p').re(r'.+', replace_entities=True)
# Returns: ['<p>Price: &lt; $100 &amp; shipping included ></p>', '<p>Copyright © 2024</p>']
# Without entity replacement: matches are returned exactly as serialized
text_raw = selector.css('p').re(r'.+', replace_entities=False)
# Returns: ['<p>Price: &lt; $100 &amp; shipping included &gt;</p>', '<p>Copyright © 2024</p>']
# Specific entities are preserved (&amp; and &lt;)
mixed_content = selector.css('p:first-child').re(r'.+', replace_entities=True)
# Returns: ['<p>Price: &lt; $100 &amp; shipping included ></p>']
Different content types return appropriate data formats:
get() for single values, getall() for multiple values
Install with Tessl CLI
npx tessl i tessl/pypi-parsel