Universal feed parser for RSS, Atom, and CDF feeds with comprehensive format support and robust parsing capabilities
npx @tessl/cli install tessl/pypi-feedparser@6.0.0
A universal Python library for parsing RSS, Atom, and CDF feeds with comprehensive format support. Feedparser handles multiple feed formats (RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, Atom 1.0) and provides robust parsing capabilities with automatic encoding detection, HTML sanitization, and graceful error handling.
pip install feedparser
import feedparser
# Basic usage example: fetch a feed, print its metadata and entries, then
# report any parsing problem flagged on the result.

# Parse a feed from URL
result = feedparser.parse('https://example.com/feed.xml')

# Access feed metadata
print(result.feed.title)
print(result.feed.description)
print(result.feed.link)

# Access entries/items
for entry in result.entries:
    print(entry.title)
    print(entry.summary)
    print(entry.link)
    print(entry.published)

# Check for parsing errors
if result.bozo:
    # bozo_exception is only read once bozo has been found truthy
    print(f"Feed had parsing issues: {result.bozo_exception}")

# Feedparser uses a flexible parsing architecture that supports both strict XML parsing and lenient HTML-style parsing:
Main feed parsing functionality with support for multiple input sources, HTTP features, and extensive configuration options.
def parse(
    url_file_stream_or_string,
    etag=None,
    modified=None,
    agent=None,
    referrer=None,
    handlers=None,
    request_headers=None,
    response_headers=None,
    resolve_relative_uris=None,
    sanitize_html=None,
):
    """
    Parse a feed from URL, file, stream, or string.

    This is a signature reference only: no implementation body is shown
    here, just the documented contract.

    Args:
        url_file_stream_or_string: Feed source (URL, file path, file-like
            object, or string)
        etag (str, optional): HTTP ETag for conditional requests
        modified (str/datetime/tuple, optional): Last-Modified date for
            conditional requests
        agent (str, optional): HTTP User-Agent header
        referrer (str, optional): HTTP Referer header
        handlers (list, optional): Custom urllib handlers
        request_headers (dict, optional): Additional HTTP request headers
        response_headers (dict, optional): Override/supplement response
            headers
        resolve_relative_uris (bool, optional): Enable relative URI
            resolution
        sanitize_html (bool, optional): Enable HTML sanitization

    Returns:
        FeedParserDict: Parsed feed data with feed metadata and entries
    """

# Comprehensive feed data structures with normalized access to feed metadata, entries, and all feed elements across different formats.
class FeedParserDict(dict):
    """Enhanced dictionary with attribute access and legacy key mapping."""

    # Signature stubs only: the method bodies are elided in this reference.

    # Item lookup: result[key]
    def __getitem__(self, key): ...

    # Membership test: key in result
    def __contains__(self, key): ...

    # Lookup with a fallback value when the key is absent
    def get(self, key, default=None): ...

    # Attribute-style access: result.key
    def __getattr__(self, key): ...

# Date parsing system supporting multiple date formats with extensible custom date handler registration.
def registerDateHandler(func):
    """
    Register a custom date handler function.

    This is a signature reference only: no implementation body is shown
    here.

    Args:
        func: Function that takes date string, returns 9-tuple date in GMT
    """

# HTTP client capabilities including conditional requests, authentication, custom headers, and redirect handling.
# Configuration constants
# (type-only declarations: no values are assigned in this reference)
USER_AGENT: str # Default HTTP User-Agent header
RESOLVE_RELATIVE_URIS: int # Global URI resolution setting
SANITIZE_HTML: int # Global HTML sanitization setting
# Package metadata constants
__author__: str # Package author information
__license__: str # Package license type
__version__: str # Package version string
# Exception system for parsing errors, encoding issues, and malformed content with graceful degradation.
class ThingsNobodyCaresAboutButMe(Exception):
    """Common base class for the encoding and content-type exceptions below."""


class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """Character-encoding exception.

    NOTE(review): presumably signals that the declared encoding was
    overridden -- confirm against the feedparser documentation.
    """


class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """Character-encoding exception.

    NOTE(review): presumably signals that no encoding could be determined --
    confirm against the feedparser documentation.
    """


class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """Content-type exception.

    NOTE(review): presumably signals a non-XML content type -- confirm
    against the feedparser documentation.
    """


class UndeclaredNamespace(Exception):
    """Namespace exception; derives directly from Exception, not the shared base."""


# Feed parsing result structure
# Illustrative schema of a parse result: each key is mapped to the Python
# type expected for its value.
FeedParserDict = dict(
    # Parse status
    bozo=bool,  # True if feed had parsing issues
    bozo_exception=Exception,  # Exception if parsing errors occurred
    encoding=str,  # Character encoding used
    # HTTP details
    etag=str,  # HTTP ETag from response
    headers=dict,  # HTTP response headers
    href=str,  # Final URL after redirects
    modified=str,  # HTTP Last-Modified header
    # Document details
    namespaces=dict,  # XML namespaces used
    status=int,  # HTTP status code
    version=str,  # Feed format version (e.g., 'rss20', 'atom10')
    # Parsed content
    entries=list,  # List of entry/item dictionaries
    feed=dict,  # Feed-level metadata
)