Universal feed parser for RSS, Atom, and CDF feeds, with comprehensive format support and robust parsing capabilities.

Feedparser provides comprehensive error handling with graceful degradation: parsing continues even when it encounters malformed content, encoding issues, or network problems.
Feedparser defines several exception classes for different types of parsing issues:
class ThingsNobodyCaresAboutButMe(Exception):
    """
    Base exception for minor parsing issues that don't prevent feed processing.

    These exceptions are captured in bozo_exception but don't stop parsing.
    """
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """
    Raised when character encoding is overridden during parsing.

    Indicates encoding was detected/specified differently than declared.
    """
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """
    Raised when character encoding cannot be determined.

    Parser will fall back to default encoding handling.
    """
class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """
    Raised when content type is not XML but parsing continues anyway.

    Common when servers misconfigure content-type headers.
    """
class UndeclaredNamespace(Exception):
"""
Raised when XML contains undeclared namespaces.
More serious than other exceptions but parsing may still continue.
"""Feedparser uses a "bozo" flag system to indicate when feeds have issues while still attempting to parse them:
result = feedparser.parse(url)

# Check if feed had issues
if result.bozo:
    print("Feed had parsing issues")
    print(f"Exception: {result.bozo_exception}")
    print(f"Exception type: {type(result.bozo_exception).__name__}")
else:
    print("Feed parsed cleanly")

# Feed data may still be available even with bozo=True
print(f"Found {len(result.entries)} entries")

import urllib.error
import xml.sax
result = feedparser.parse(problematic_url)
if result.bozo:
exception = result.bozo_exception
# Network/HTTP errors
if isinstance(exception, urllib.error.HTTPError):
print(f"HTTP {exception.code}: {exception.reason}")
elif isinstance(exception, urllib.error.URLError):
print(f"Network error: {exception.reason}")
# XML parsing errors
elif isinstance(exception, xml.sax.SAXException):
print(f"XML parsing error: {exception}")
# Feedparser-specific issues
elif isinstance(exception, feedparser.CharacterEncodingUnknown):
print("Could not determine character encoding")
elif isinstance(exception, feedparser.NonXMLContentType):
print("Content type is not XML")
elif isinstance(exception, feedparser.UndeclaredNamespace):
print("Feed contains undeclared XML namespaces")
# Generic issues
else:
print(f"Other parsing issue: {exception}")Feedparser attempts to extract as much data as possible even from problematic feeds:
result = feedparser.parse(malformed_feed_url)
# Always check what data was successfully extracted
print(f"Bozo: {result.bozo}")
print(f"Entries found: {len(result.entries)}")
print(f"Feed title: {result.feed.get('title', 'No title')}")
# Process available data despite errors
if result.entries:
print("Processing entries despite parsing issues:")
for entry in result.entries:
title = entry.get('title', 'No title')
link = entry.get('link', 'No link')
print(f" - {title}: {link}")Feedparser automatically falls back from strict XML parsing to loose HTML-style parsing:
result = feedparser.parse(questionable_feed)
# Check which parser was used
if result.bozo:
# Likely used loose parser due to malformed XML
print("Used tolerant parser for malformed content")
else:
# Used strict XML parser
print("Used strict XML parser")
# Both parsers can produce valid results
if result.version:
print(f"Detected format: {result.version}")Always use safe access patterns when dealing with potentially problematic feeds:
result = feedparser.parse(url)

# Safe feed-level access
feed_title = result.feed.get('title', 'Untitled Feed')
feed_link = result.feed.get('link', '')
feed_description = result.feed.get('description', 'No description')

# Safe entry-level access
for entry in result.entries:
    title = entry.get('title', 'Untitled')
    link = entry.get('link', '#')
    summary = entry.get('summary', 'No summary')
    # Check for date parsing success
    if entry.get('published_parsed'):
        import time
        pub_date = time.strftime('%Y-%m-%d', entry.published_parsed)
    else:
        pub_date = entry.get('published', 'Unknown date')
    print(f"{title} ({pub_date}): {summary[:100]}...")

import feedparser
import logging
import time

def safe_parse_feed(url, max_retries=3):
    """
    Safely parse a feed with error handling and retries.

    Args:
        url: Feed URL to fetch and parse.
        max_retries: Maximum number of parse attempts before giving up.

    Returns:
        The feedparser result object, or None if no usable data could be
        obtained after all retries.
    """
    for attempt in range(max_retries):
        try:
            result = feedparser.parse(url)
            # Log parsing issues but continue (bozo feeds can still be usable)
            if result.bozo:
                logging.warning("Feed parsing issues for %s: %s", url, result.bozo_exception)
            # Validate minimum data requirements
            if not result.feed and not result.entries:
                logging.error("No usable data found in feed: %s", url)
                if attempt < max_retries - 1:
                    continue
                return None
            return result
        except Exception as e:
            logging.error("Unexpected error parsing %s (attempt %d): %s", url, attempt + 1, e)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
    return None

# Usage
result = safe_parse_feed('https://example.com/problematic-feed.xml')
if result:
    print(f"Successfully parsed feed with {len(result.entries)} entries")
else:
    print("Failed to parse feed after all retries")
def validate_feed_content(result):
    """
    Validate and report on feed content quality.

    Args:
        result: A parsed feedparser result (needs .bozo, .feed, .entries).

    Returns:
        A list of human-readable issue descriptions; empty if the feed
        looks complete.
    """
    issues = []
    # Check parsing status
    if result.bozo:
        issues.append(f"Parsing issues: {result.bozo_exception}")
    # Check feed-level data
    if not result.feed.get('title'):
        issues.append("Feed has no title")
    if not result.feed.get('link'):
        issues.append("Feed has no link")
    # Check entry quality
    if not result.entries:
        issues.append("Feed has no entries")
    else:
        entries_without_titles = sum(1 for e in result.entries if not e.get('title'))
        if entries_without_titles:
            issues.append(f"{entries_without_titles} entries missing titles")
        entries_without_links = sum(1 for e in result.entries if not e.get('link'))
        if entries_without_links:
            issues.append(f"{entries_without_links} entries missing links")
    return issues
# Usage
result = feedparser.parse(url)
issues = validate_feed_content(result)
if issues:
    print("Feed quality issues:")
    for issue in issues:
        print(f" - {issue}")
else:
    print("Feed appears to be high quality")

result = feedparser.parse(url)
# Check encoding information
print(f"Detected encoding: {result.get('encoding', 'Unknown')}")

# Handle encoding-related exceptions
if result.bozo:
    if isinstance(result.bozo_exception, feedparser.CharacterEncodingOverride):
        print("Encoding was overridden during parsing")
    elif isinstance(result.bozo_exception, feedparser.CharacterEncodingUnknown):
        print("Could not determine character encoding")

# Content should still be usable with UTF-8 conversion
for entry in result.entries:
    # Text content is normalized to Unicode
    title = entry.get('title', '')
    if title:
        print(f"Entry title: {title}")
def clean_text_content(text):
    """
    Clean text content that may have encoding issues.

    Applies NFKC Unicode normalization, removes control characters
    (category 'C') while preserving whitespace, and strips surrounding
    whitespace.

    Args:
        text: The string to clean; falsy values are returned unchanged.

    Returns:
        The cleaned string (or the original falsy value).
    """
    if not text:
        return text
    import unicodedata
    # Normalize Unicode
    text = unicodedata.normalize('NFKC', text)
    # Remove control characters except whitespace
    text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C' or char.isspace())
    return text.strip()
# Apply to feed content
result = feedparser.parse(url)
for entry in result.entries:
    title = clean_text_content(entry.get('title', ''))
    summary = clean_text_content(entry.get('summary', ''))
    # Use cleaned content...

import urllib.error
def parse_with_fallbacks(urls):
    """
    Try multiple feed URLs with fallback handling.

    Args:
        urls: Iterable of candidate feed URLs, tried in order.

    Returns:
        The first result with usable data, or None if every URL failed.
    """
    for url in urls:
        try:
            result = feedparser.parse(url)
            # Check HTTP status
            if hasattr(result, 'status'):
                if result.status == 200:
                    return result
                elif result.status in (301, 302):
                    # Follow redirect manually if needed
                    redirect_url = result.headers.get('location')
                    if redirect_url:
                        return feedparser.parse(redirect_url)
                elif result.status == 304:
                    # Not modified - could return cached version
                    pass
                elif result.status >= 400:
                    print(f"HTTP {result.status} for {url}")
                    continue
            # Return result even with minor issues
            if result.entries or result.feed:
                return result
        except Exception as e:
            print(f"Failed to parse {url}: {e}")
            continue
    return None

# Usage with multiple possible feed URLs
feed_urls = [
    'https://example.com/feed.xml',
    'https://example.com/rss.xml',
    'https://example.com/atom.xml',
    'https://example.com/feeds/all.xml',
]
result = parse_with_fallbacks(feed_urls)

import socket
import urllib.error

# Set reasonable timeout
socket.setdefaulttimeout(30)

def parse_with_timeout_handling(url):
    """
    Parse feed with proper timeout and connection error handling.

    Args:
        url: Feed URL to fetch and parse.

    Returns:
        The parsed result, or None if a network-level error occurred.
    """
    try:
        result = feedparser.parse(url)
        return result
    except socket.timeout:
        print(f"Timeout accessing {url}")
        return None
    except socket.gaierror as e:
        print(f"DNS resolution failed for {url}: {e}")
        return None
    except ConnectionResetError:
        print(f"Connection reset by peer: {url}")
        return None
    except urllib.error.URLError as e:
        # URLError wraps lower-level failures; distinguish timeouts
        if isinstance(e.reason, socket.timeout):
            print(f"Timeout: {url}")
        else:
            print(f"URL error for {url}: {e.reason}")
        return None

import logging
import feedparser
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def parse_and_log(url):
"""
Parse feed with comprehensive logging.
"""
logger.info(f"Parsing feed: {url}")
result = feedparser.parse(url)
# Log parsing results
if result.bozo:
logger.warning(f"Bozo feed {url}: {result.bozo_exception}")
else:
logger.info(f"Clean parse: {url}")
# Log HTTP information
if hasattr(result, 'status'):
logger.info(f"HTTP {result.status}: {url}")
# Log content statistics
logger.info(f"Feed: {len(result.entries)} entries, version: {result.get('version', 'unknown')}")
# Log encoding information
if 'encoding' in result:
logger.info(f"Encoding: {result.encoding}")
return resultInstall with Tessl CLI
npx tessl i tessl/pypi-feedparser