Universal feed parser for RSS, Atom, and CDF feeds, with comprehensive format support and robust parsing capabilities.

Feedparser provides comprehensive error handling with graceful degradation: parsing continues even when it encounters malformed content, encoding issues, or network problems.
Feedparser defines several exception classes for different types of parsing issues:
class ThingsNobodyCaresAboutButMe(Exception):
    """
    Base exception for minor parsing issues that don't prevent feed processing.

    These exceptions are captured in bozo_exception but don't stop parsing.
    """
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """
    Raised when character encoding is overridden during parsing.

    Indicates encoding was detected/specified differently than declared.
    """
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """
    Raised when character encoding cannot be determined.

    Parser will fall back to default encoding handling.
    """
class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """
    Raised when content type is not XML but parsing continues anyway.

    Common when servers misconfigure content-type headers.
    """
class UndeclaredNamespace(Exception):
"""
Raised when XML contains undeclared namespaces.
More serious than other exceptions but parsing may still continue.
"""Feedparser uses a "bozo" flag system to indicate when feeds have issues while still attempting to parse them:
result = feedparser.parse(url)

# Check if feed had issues
if result.bozo:
    print("Feed had parsing issues")
    print(f"Exception: {result.bozo_exception}")
    print(f"Exception type: {type(result.bozo_exception).__name__}")
else:
    print("Feed parsed cleanly")

# Feed data may still be available even with bozo=True
print(f"Found {len(result.entries)} entries")

import urllib.error
import xml.sax
result = feedparser.parse(problematic_url)
if result.bozo:
exception = result.bozo_exception
# Network/HTTP errors
if isinstance(exception, urllib.error.HTTPError):
print(f"HTTP {exception.code}: {exception.reason}")
elif isinstance(exception, urllib.error.URLError):
print(f"Network error: {exception.reason}")
# XML parsing errors
elif isinstance(exception, xml.sax.SAXException):
print(f"XML parsing error: {exception}")
# Feedparser-specific issues
elif isinstance(exception, feedparser.CharacterEncodingUnknown):
print("Could not determine character encoding")
elif isinstance(exception, feedparser.NonXMLContentType):
print("Content type is not XML")
elif isinstance(exception, feedparser.UndeclaredNamespace):
print("Feed contains undeclared XML namespaces")
# Generic issues
else:
print(f"Other parsing issue: {exception}")Feedparser attempts to extract as much data as possible even from problematic feeds:
result = feedparser.parse(malformed_feed_url)
# Always check what data was successfully extracted
print(f"Bozo: {result.bozo}")
print(f"Entries found: {len(result.entries)}")
print(f"Feed title: {result.feed.get('title', 'No title')}")
# Process available data despite errors
if result.entries:
print("Processing entries despite parsing issues:")
for entry in result.entries:
title = entry.get('title', 'No title')
link = entry.get('link', 'No link')
print(f" - {title}: {link}")Feedparser automatically falls back from strict XML parsing to loose HTML-style parsing:
result = feedparser.parse(questionable_feed)
# Check which parser was used
if result.bozo:
# Likely used loose parser due to malformed XML
print("Used tolerant parser for malformed content")
else:
# Used strict XML parser
print("Used strict XML parser")
# Both parsers can produce valid results
if result.version:
print(f"Detected format: {result.version}")Always use safe access patterns when dealing with potentially problematic feeds:
result = feedparser.parse(url)

# Safe feed-level access
feed_title = result.feed.get('title', 'Untitled Feed')
feed_link = result.feed.get('link', '')
feed_description = result.feed.get('description', 'No description')

# Safe entry-level access
for entry in result.entries:
    title = entry.get('title', 'Untitled')
    link = entry.get('link', '#')
    summary = entry.get('summary', 'No summary')
    # Check for date parsing success
    if entry.get('published_parsed'):
        import time
        pub_date = time.strftime('%Y-%m-%d', entry.published_parsed)
    else:
        pub_date = entry.get('published', 'Unknown date')
    print(f"{title} ({pub_date}): {summary[:100]}...")

import feedparser
import logging
import time

def safe_parse_feed(url, max_retries=3):
    """
    Safely parse a feed with error handling and retries.

    Args:
        url: Feed URL to fetch and parse.
        max_retries: Maximum number of parse attempts before giving up.

    Returns:
        The feedparser result object, or None if no usable data could be
        obtained after all retries.
    """
    for attempt in range(max_retries):
        try:
            result = feedparser.parse(url)
            # Log parsing issues but continue (bozo feeds can still be usable)
            if result.bozo:
                logging.warning("Feed parsing issues for %s: %s", url, result.bozo_exception)
            # Validate minimum data requirements
            if not result.feed and not result.entries:
                logging.error("No usable data found in feed: %s", url)
                if attempt < max_retries - 1:
                    continue
                return None
            return result
        except Exception as e:
            logging.error("Unexpected error parsing %s (attempt %d): %s", url, attempt + 1, e)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
    return None

# Usage
result = safe_parse_feed('https://example.com/problematic-feed.xml')
if result:
    print(f"Successfully parsed feed with {len(result.entries)} entries")
else:
    print("Failed to parse feed after all retries")
def validate_feed_content(result):
    """
    Validate and report on feed content quality.

    Args:
        result: A parsed feedparser result (needs .bozo, .feed, .entries).

    Returns:
        A list of human-readable issue descriptions; empty if the feed
        looks complete.
    """
    issues = []
    # Check parsing status
    if result.bozo:
        issues.append(f"Parsing issues: {result.bozo_exception}")
    # Check feed-level data
    if not result.feed.get('title'):
        issues.append("Feed has no title")
    if not result.feed.get('link'):
        issues.append("Feed has no link")
    # Check entry quality
    if not result.entries:
        issues.append("Feed has no entries")
    else:
        entries_without_titles = sum(1 for e in result.entries if not e.get('title'))
        if entries_without_titles:
            issues.append(f"{entries_without_titles} entries missing titles")
        entries_without_links = sum(1 for e in result.entries if not e.get('link'))
        if entries_without_links:
            issues.append(f"{entries_without_links} entries missing links")
    return issues
# Usage
result = feedparser.parse(url)
issues = validate_feed_content(result)
if issues:
    print("Feed quality issues:")
    for issue in issues:
        print(f" - {issue}")
else:
    print("Feed appears to be high quality")

result = feedparser.parse(url)
# Check encoding information
print(f"Detected encoding: {result.get('encoding', 'Unknown')}")

# Handle encoding-related exceptions
if result.bozo:
    if isinstance(result.bozo_exception, feedparser.CharacterEncodingOverride):
        print("Encoding was overridden during parsing")
    elif isinstance(result.bozo_exception, feedparser.CharacterEncodingUnknown):
        print("Could not determine character encoding")

# Content should still be usable with UTF-8 conversion
for entry in result.entries:
    # Text content is normalized to Unicode
    title = entry.get('title', '')
    if title:
        print(f"Entry title: {title}")
def clean_text_content(text):
    """
    Clean text content that may have encoding issues.

    Applies NFKC Unicode normalization, removes control characters
    (category 'C') while preserving whitespace, and strips surrounding
    whitespace.

    Args:
        text: The string to clean; falsy values are returned unchanged.

    Returns:
        The cleaned string (or the original falsy value).
    """
    if not text:
        return text
    import unicodedata
    # Normalize Unicode
    text = unicodedata.normalize('NFKC', text)
    # Remove control characters except whitespace
    text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C' or char.isspace())
    return text.strip()
# Apply to feed content
result = feedparser.parse(url)
for entry in result.entries:
    title = clean_text_content(entry.get('title', ''))
    summary = clean_text_content(entry.get('summary', ''))
    # Use cleaned content...

import urllib.error
def parse_with_fallbacks(urls):
    """
    Try multiple feed URLs with fallback handling.

    Args:
        urls: Iterable of candidate feed URLs, tried in order.

    Returns:
        The first result with usable data, or None if every URL failed.
    """
    for url in urls:
        try:
            result = feedparser.parse(url)
            # Check HTTP status
            if hasattr(result, 'status'):
                if result.status == 200:
                    return result
                elif result.status in (301, 302):
                    # Follow redirect manually if needed
                    redirect_url = result.headers.get('location')
                    if redirect_url:
                        return feedparser.parse(redirect_url)
                elif result.status == 304:
                    # Not modified - could return cached version
                    pass
                elif result.status >= 400:
                    print(f"HTTP {result.status} for {url}")
                    continue
            # Return result even with minor issues
            if result.entries or result.feed:
                return result
        except Exception as e:
            print(f"Failed to parse {url}: {e}")
            continue
    return None

# Usage with multiple possible feed URLs
feed_urls = [
    'https://example.com/feed.xml',
    'https://example.com/rss.xml',
    'https://example.com/atom.xml',
    'https://example.com/feeds/all.xml',
]
result = parse_with_fallbacks(feed_urls)

import socket
import urllib.error

# Set reasonable timeout
socket.setdefaulttimeout(30)

def parse_with_timeout_handling(url):
    """
    Parse feed with proper timeout and connection error handling.

    Args:
        url: Feed URL to fetch and parse.

    Returns:
        The parsed result, or None if a network-level error occurred.
    """
    try:
        result = feedparser.parse(url)
        return result
    except socket.timeout:
        print(f"Timeout accessing {url}")
        return None
    except socket.gaierror as e:
        print(f"DNS resolution failed for {url}: {e}")
        return None
    except ConnectionResetError:
        print(f"Connection reset by peer: {url}")
        return None
    except urllib.error.URLError as e:
        # URLError wraps lower-level failures; distinguish timeouts
        if isinstance(e.reason, socket.timeout):
            print(f"Timeout: {url}")
        else:
            print(f"URL error for {url}: {e.reason}")
        return None

import logging
import feedparser
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def parse_and_log(url):
"""
Parse feed with comprehensive logging.
"""
logger.info(f"Parsing feed: {url}")
result = feedparser.parse(url)
# Log parsing results
if result.bozo:
logger.warning(f"Bozo feed {url}: {result.bozo_exception}")
else:
logger.info(f"Clean parse: {url}")
# Log HTTP information
if hasattr(result, 'status'):
logger.info(f"HTTP {result.status}: {url}")
# Log content statistics
logger.info(f"Feed: {len(result.entries)} entries, version: {result.get('version', 'unknown')}")
# Log encoding information
if 'encoding' in result:
logger.info(f"Encoding: {result.encoding}")
return resultInstall with Tessl CLI
npx tessl i tessl/pypi-feedparser