Universal feed parser for RSS, Atom, and CDF feeds with comprehensive format support and robust parsing capabilities
—
Feedparser provides comprehensive data structures for accessing feed content with normalized field names across different feed formats. The main result structure contains feed metadata, entries, and parsing information.
Enhanced dictionary providing attribute-style access and backward compatibility with legacy field names.
class FeedParserDict(dict):
"""
Enhanced dictionary with attribute access and legacy key mapping.
Provides backward compatibility by mapping old RSS field names to
modern equivalents and supports both dict-style and attribute-style access.
"""
def __getitem__(self, key):
"""
Get item with legacy key mapping support.
Special handling for:
- 'category': Returns first tag term
- 'enclosures': Returns links with rel='enclosure'
- 'license': Returns first license link href
- 'updated'/'updated_parsed': Falls back to published if not present
Returns:
Value for the key, with legacy key mapping applied
"""
def __contains__(self, key):
"""Check if key exists, with legacy mapping support."""
def get(self, key, default=None):
"""Get item with default, using legacy key mapping."""
def __getattr__(self, key):
"""Enable attribute-style access (result.feed.title)."""
def __setitem__(self, key, value):
"""Set item with legacy key mapping."""
def setdefault(self, k, default):
"""Set default value if key doesn't exist."""

FeedParserDict automatically maps legacy RSS field names to modern equivalents:
# Legacy key mappings (automatically handled)
keymap = {
'channel': 'feed',
'items': 'entries',
'guid': 'id',
'date': 'updated',
'date_parsed': 'updated_parsed',
'description': ['summary', 'subtitle'],
'description_detail': ['summary_detail', 'subtitle_detail'],
'url': ['href'],
'modified': 'updated',
'modified_parsed': 'updated_parsed',
'issued': 'published',
'issued_parsed': 'published_parsed',
'copyright': 'rights',
'copyright_detail': 'rights_detail',
'tagline': 'subtitle',
'tagline_detail': 'subtitle_detail',
}

The parse() function returns a FeedParserDict with these top-level properties:
# Parsing status and metadata
result = {
'bozo': bool, # True if feed had parsing issues
'bozo_exception': Exception, # Exception object if errors occurred
'encoding': str, # Character encoding used (e.g., 'utf-8')
'version': str, # Feed format version (e.g., 'rss20', 'atom10')
'namespaces': dict, # XML namespaces used in feed
}

# HTTP response data (when parsing from URL)
result = {
'etag': str, # HTTP ETag header
'headers': dict, # All HTTP response headers
'href': str, # Final URL after redirects
'modified': str, # HTTP Last-Modified header
'status': int, # HTTP status code
}

# Feed content structure
result = {
'feed': FeedParserDict, # Feed-level metadata
'entries': list, # List of entry/item FeedParserDict objects
}

Feed metadata contains comprehensive information about the feed itself:
feed = {
'title': str, # Feed title
'title_detail': { # Detailed title information
'type': str, # Content type ('text', 'html', 'xhtml')
'language': str, # Language code
'base': str, # Base URI
'value': str, # Title content
},
'link': str, # Main feed/site URL
'links': [ # All feed links
{
'rel': str, # Relationship ('alternate', 'self', etc.)
'type': str, # MIME type
'href': str, # URL
'title': str, # Link title (optional)
}
],
'id': str, # Unique feed identifier
'description': str, # Feed description (RSS)
'subtitle': str, # Feed subtitle (Atom)
'subtitle_detail': dict, # Detailed subtitle information
'language': str, # Feed language code
}

feed = {
'author': str, # Primary author name
'author_detail': { # Detailed author information
'name': str, # Author name
'email': str, # Author email
'href': str, # Author URL
},
'contributors': [ # List of contributor objects
{
'name': str,
'email': str,
'href': str,
}
],
'publisher': str, # Publisher name
'publisher_detail': { # Detailed publisher information
'name': str,
'email': str,
'href': str,
},
'generator': str, # Feed generator software
'generator_detail': { # Detailed generator information
'name': str,
'version': str,
'href': str,
},
}

feed = {
'updated': str, # Last updated timestamp (string)
'updated_parsed': tuple, # Parsed time as 9-tuple in GMT
'published': str, # Publication timestamp (string)
'published_parsed': tuple, # Parsed publication time as 9-tuple
}

feed = {
'rights': str, # Copyright/rights statement
'rights_detail': { # Detailed rights information
'type': str,
'language': str,
'base': str,
'value': str,
},
}

feed = {
'image': { # Feed image/logo (RSS)
'title': str, # Image title
'url': str, # Image URL
'link': str, # Image link target
'width': int, # Image width
'height': int, # Image height
'description': str, # Image description
},
'icon': str, # Feed icon URL (Atom)
'logo': str, # Feed logo URL (Atom)
}

feed = {
'ttl': int, # Time-to-live (cache duration in minutes)
'cloud': { # RSS cloud notification
'domain': str,
'port': int,
'path': str,
'registerprocedure': str,
'protocol': str,
},
'textinput': { # RSS text input box
'title': str,
'description': str,
'name': str,
'link': str,
},
'docs': str, # Documentation URL
}

feed = {
'tags': [ # List of categories/tags
{
'term': str, # Category term
'scheme': str, # Category scheme/domain
'label': str, # Human-readable label
}
],
}

Each entry/item in the feed contains detailed article information:
entry = {
'title': str, # Entry title
'title_detail': dict, # Detailed title information
'link': str, # Main entry URL
'links': list, # All entry links
'id': str, # Unique entry identifier
'summary': str, # Entry summary/description
'summary_detail': dict, # Detailed summary information
'content': [ # Entry content blocks
{
'type': str, # Content type ('text', 'html', 'xhtml')
'language': str, # Content language
'base': str, # Base URI
'value': str, # Content text
}
],
}

entry = {
'author': str, # Primary author name
'author_detail': dict, # Detailed author information
'contributors': list, # List of contributor objects
'publisher': str, # Publisher name
'publisher_detail': dict, # Detailed publisher information
}

entry = {
'updated': str, # Last updated timestamp
'updated_parsed': tuple, # Parsed updated time as 9-tuple
'published': str, # Publication timestamp
'published_parsed': tuple, # Parsed publication time as 9-tuple
'created': str, # Creation timestamp (rare)
'created_parsed': tuple, # Parsed creation time as 9-tuple
'expired': str, # Expiration timestamp (rare)
'expired_parsed': tuple, # Parsed expiration time as 9-tuple
}

entry = {
'enclosures': [ # Attached files (podcasts, etc.)
{
'href': str, # File URL
'type': str, # MIME type
'length': str, # File size in bytes
}
],
}

entry = {
'tags': [ # Entry categories/tags
{
'term': str, # Tag term
'scheme': str, # Tag scheme/domain
'label': str, # Human-readable label
}
],
}

entry = {
'comments': str, # Comments URL
'license': str, # Content license URL
}

entry = {
'source': { # Original source information
'title': str, # Source feed title
'href': str, # Source feed URL
'value': str, # Source description
},
}

result = feedparser.parse(url)
# Feed information
print(f"Feed: {result.feed.title}")
print(f"Description: {result.feed.description}")
print(f"Last updated: {result.feed.updated}")
# Entry information
for entry in result.entries:
    print(f"Title: {entry.title}")
    print(f"Link: {entry.link}")
    print(f"Published: {entry.published}")
    print(f"Summary: {entry.summary}")

# Both styles work identically
title1 = result.feed.title
title2 = result.feed['title']
title3 = result['feed']['title']
# All three methods return the same value
assert title1 == title2 == title3

# Legacy RSS keys automatically map to modern equivalents
description = result.feed.description # RSS 'description'
subtitle = result.feed.subtitle # Atom 'subtitle'
# Both may return the same content depending on feed format
# Legacy item access
items = result['items'] # Maps to result['entries']; use key access here, because the attribute result.items is the built-in dict.items method
guid = entry.guid # Maps to entry.id

# Check content types for proper rendering
if entry.title_detail.type == 'html':
    # Contains HTML markup
    html_title = entry.title
elif entry.title_detail.type == 'text':
    # Plain text only
    text_title = entry.title
# Handle multiple content blocks
for content_block in entry.content:
    if content_block.type == 'html':
        html_content = content_block.value
    elif content_block.type == 'text':
        text_content = content_block.value

# Use .get() for optional fields
author = entry.get('author', 'Unknown')
published = entry.get('published', 'Date not available')
# Check for field existence
if 'enclosures' in entry:
    for enclosure in entry.enclosures:
        print(f"Attachment: {enclosure.href}")
# Handle missing nested fields
if hasattr(entry, 'author_detail') and entry.author_detail:
    email = entry.author_detail.get('email', 'No email')

Install with Tessl CLI
npx tessl i tessl/pypi-feedparser