tessl/pypi-feedparser

Universal feed parser for RSS, Atom, and CDF feeds with comprehensive format support and robust parsing capabilities

—

Pending

Overview

Eval results

Files

Data Structures

Name: tessl/pypi-feedparser
Author: tessl

Feedparser provides comprehensive data structures for accessing feed content with normalized field names across different feed formats. The main result structure contains feed metadata, entries, and parsing information.

Capabilities

FeedParserDict Class

Enhanced dictionary providing attribute-style access and backward compatibility with legacy field names.

class FeedParserDict(dict):
    """
    Enhanced dictionary with attribute access and legacy key mapping.
    
    Provides backward compatibility by mapping old RSS field names to 
    modern equivalents and supports both dict-style and attribute-style access.
    """
    
    def __getitem__(self, key):
        """
        Get item with legacy key mapping support.
        
        Special handling for:
        - 'category': Returns first tag term
        - 'enclosures': Returns links with rel='enclosure'
        - 'license': Returns first license link href
        - 'updated'/'updated_parsed': Falls back to published if not present
        
        Returns:
            Value for the key, with legacy key mapping applied
        """
    
    def __contains__(self, key):
        """Check if key exists, with legacy mapping support."""
    
    def get(self, key, default=None):
        """Get item with default, using legacy key mapping."""
    
    def __getattr__(self, key):
        """Enable attribute-style access (result.feed.title)."""
    
    def __setitem__(self, key, value):
        """Set item with legacy key mapping."""
        
    def setdefault(self, k, default):
        """Set default value if key doesn't exist."""

Legacy Key Mapping

FeedParserDict automatically maps legacy RSS field names to modern equivalents:

# Legacy key mappings (automatically handled)
keymap = {
    'channel': 'feed',
    'items': 'entries', 
    'guid': 'id',
    'date': 'updated',
    'date_parsed': 'updated_parsed',
    'description': ['summary', 'subtitle'],
    'description_detail': ['summary_detail', 'subtitle_detail'],
    'url': ['href'],
    'modified': 'updated',
    'modified_parsed': 'updated_parsed',
    'issued': 'published',
    'issued_parsed': 'published_parsed',
    'copyright': 'rights',
    'copyright_detail': 'rights_detail',
    'tagline': 'subtitle',
    'tagline_detail': 'subtitle_detail',
}

Top-Level Result Structure

The parse() function returns a FeedParserDict with these top-level properties:

Parsing Information

# Parsing status and metadata
result = {
    'bozo': bool,           # True if feed had parsing issues
    'bozo_exception': Exception,  # Exception object if errors occurred
    'encoding': str,        # Character encoding used (e.g., 'utf-8')
    'version': str,         # Feed format version (e.g., 'rss20', 'atom10')
    'namespaces': dict,     # XML namespaces used in feed
}

HTTP Information

# HTTP response data (when parsing from URL)
result = {
    'etag': str,           # HTTP ETag header
    'headers': dict,       # All HTTP response headers
    'href': str,           # Final URL after redirects
    'modified': str,       # HTTP Last-Modified header
    'status': int,         # HTTP status code
}

Feed Content

# Feed content structure
result = {
    'feed': FeedParserDict,    # Feed-level metadata
    'entries': list,           # List of entry/item FeedParserDict objects
}

Feed-Level Structure (result.feed)

Feed metadata contains comprehensive information about the feed itself:

Identity and Basic Information

feed = {
    'title': str,              # Feed title
    'title_detail': {          # Detailed title information
        'type': str,           # Content type as a MIME type ('text/plain', 'text/html', 'application/xhtml+xml')
        'language': str,       # Language code
        'base': str,           # Base URI
        'value': str,          # Title content
    },
    'link': str,               # Main feed/site URL
    'links': [                 # All feed links
        {
            'rel': str,        # Relationship ('alternate', 'self', etc.)
            'type': str,       # MIME type
            'href': str,       # URL
            'title': str,      # Link title (optional)
        }
    ],
    'id': str,                 # Unique feed identifier
    'description': str,        # Feed description (legacy RSS key; resolves to 'summary' or 'subtitle')
    'subtitle': str,           # Feed subtitle (Atom)
    'subtitle_detail': dict,   # Detailed subtitle information
    'language': str,           # Feed language code
}

Authorship and Publication

feed = {
    'author': str,             # Primary author name
    'author_detail': {         # Detailed author information
        'name': str,           # Author name
        'email': str,          # Author email
        'href': str,           # Author URL
    },
    'contributors': [          # List of contributor objects
        {
            'name': str,
            'email': str, 
            'href': str,
        }
    ],
    'publisher': str,          # Publisher name
    'publisher_detail': {      # Detailed publisher information
        'name': str,
        'email': str,
        'href': str,
    },
    'generator': str,          # Feed generator software
    'generator_detail': {      # Detailed generator information
        'name': str,
        'version': str,
        'href': str,
    },
}

Dates and Updates

feed = {
    'updated': str,            # Last updated timestamp (string)
    'updated_parsed': tuple,   # Parsed time as 9-tuple in GMT
    'published': str,          # Publication timestamp (string)  
    'published_parsed': tuple, # Parsed publication time as 9-tuple
}

Rights and Legal

feed = {
    'rights': str,             # Copyright/rights statement
    'rights_detail': {         # Detailed rights information
        'type': str,
        'language': str,
        'base': str,
        'value': str,
    },
}

Visual Elements

feed = {
    'image': {                 # Feed image/logo (RSS)
        'title': str,          # Image title
        'url': str,            # Image URL
        'link': str,           # Image link target
        'width': int,          # Image width
        'height': int,         # Image height
        'description': str,    # Image description
    },
    'icon': str,               # Feed icon URL (Atom)
    'logo': str,               # Feed logo URL (Atom)
}

RSS-Specific Elements

feed = {
    'ttl': str,                # Time-to-live (cache duration in minutes; returned as a string, e.g. '60')
    'cloud': {                 # RSS cloud notification
        'domain': str,
        'port': str,           # Port number, returned as a string (e.g. '80')
        'path': str,
        'registerprocedure': str,
        'protocol': str,
    },
    'textinput': {             # RSS text input box
        'title': str,
        'description': str,
        'name': str,
        'link': str,
    },
    'docs': str,               # Documentation URL
}

Categories and Tags

feed = {
    'tags': [                  # List of categories/tags
        {
            'term': str,       # Category term
            'scheme': str,     # Category scheme/domain
            'label': str,      # Human-readable label
        }
    ],
}

Entry-Level Structure (result.entries[n])

Each entry/item in the feed contains detailed article information:

Identity and Content

entry = {
    'title': str,              # Entry title
    'title_detail': dict,      # Detailed title information
    'link': str,               # Main entry URL
    'links': list,             # All entry links
    'id': str,                 # Unique entry identifier
    'summary': str,            # Entry summary/description
    'summary_detail': dict,    # Detailed summary information
    'content': [               # Entry content blocks
        {
            'type': str,       # Content type as a MIME type ('text/plain', 'text/html', 'application/xhtml+xml')
            'language': str,   # Content language
            'base': str,       # Base URI
            'value': str,      # Content text
        }
    ],
}

Authorship

entry = {
    'author': str,             # Primary author name
    'author_detail': dict,     # Detailed author information
    'contributors': list,      # List of contributor objects
    'publisher': str,          # Publisher name
    'publisher_detail': dict,  # Detailed publisher information
}

Dates

entry = {
    'updated': str,            # Last updated timestamp
    'updated_parsed': tuple,   # Parsed updated time as 9-tuple
    'published': str,          # Publication timestamp
    'published_parsed': tuple, # Parsed publication time as 9-tuple
    'created': str,            # Creation timestamp (rare)
    'created_parsed': tuple,   # Parsed creation time as 9-tuple
    'expired': str,            # Expiration timestamp (rare)
    'expired_parsed': tuple,   # Parsed expiration time as 9-tuple
}

Media and Attachments

entry = {
    'enclosures': [            # Attached files (podcasts, etc.)
        {
            'href': str,       # File URL
            'type': str,       # MIME type
            'length': str,     # File size in bytes
        }
    ],
}

Categories and Classification

entry = {
    'tags': [                  # Entry categories/tags
        {
            'term': str,       # Tag term
            'scheme': str,     # Tag scheme/domain
            'label': str,      # Human-readable label
        }
    ],
}

Comments and Interaction

entry = {
    'comments': str,           # Comments URL
    'license': str,            # Content license URL
}

Source Attribution

entry = {
    'source': {                # Original source information
        'title': str,          # Source feed title
        'href': str,           # Source feed URL
        'value': str,          # Source description
    },
}

Usage Examples

Basic Data Access

result = feedparser.parse(url)

# Feed information
print(f"Feed: {result.feed.title}")
print(f"Description: {result.feed.description}")
print(f"Last updated: {result.feed.updated}")

# Entry information
for entry in result.entries:
    print(f"Title: {entry.title}")
    print(f"Link: {entry.link}")
    print(f"Published: {entry.published}")
    print(f"Summary: {entry.summary}")

Attribute vs Dictionary Access

# Attribute-style and dictionary-style access are interchangeable
title1 = result.feed.title
title2 = result.feed['title']
title3 = result['feed']['title']

# All three methods return the same value
assert title1 == title2 == title3

Legacy Key Compatibility

# Legacy RSS keys automatically map to modern equivalents
description = result.feed.description  # RSS 'description'
subtitle = result.feed.subtitle        # Atom 'subtitle'  
# Both may return the same content depending on feed format

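# Legacy item access
# Use key lookup for 'items': result.items is the built-in dict.items() method,
# so attribute access never reaches the legacy key mapping for that name.
items = result['items']  # Maps to result['entries']
guid = entry.guid        # Maps to entry.id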

Content Type Handling

# Check content types for proper rendering (types are reported as MIME types)
if entry.title_detail.type == 'text/html':
    # Contains HTML markup
    html_title = entry.title
elif entry.title_detail.type == 'text/plain':
    # Plain text only
    text_title = entry.title

# Handle multiple content blocks
for content_block in entry.content:
    if content_block.type == 'text/html':
        html_content = content_block.value
    elif content_block.type == 'text/plain':
        text_content = content_block.value

Safe Content Access

# Use .get() for optional fields
author = entry.get('author', 'Unknown')
published = entry.get('published', 'Date not available')

# Check for field existence
if 'enclosures' in entry:
    for enclosure in entry.enclosures:
        print(f"Attachment: {enclosure.href}")

# Handle missing nested fields
if hasattr(entry, 'author_detail') and entry.author_detail:
    email = entry.author_detail.get('email', 'No email')

Install with Tessl CLI

npx tessl i tessl/pypi-feedparser

docs