Universal feed parser for RSS, Atom, and CDF feeds with comprehensive format support and robust parsing capabilities
—
Feedparser provides comprehensive HTTP client capabilities for fetching feeds from URLs, including conditional requests, custom headers, authentication support, and redirect handling.
Configure default HTTP behavior for all parsing operations.
USER_AGENT: str = "feedparser/{version} +https://github.com/kurtmckee/feedparser/"
# Default HTTP User-Agent header sent with requests
RESOLVE_RELATIVE_URIS: int = 1
# Global setting: resolve relative URIs to absolute (1=enabled, 0=disabled)
SANITIZE_HTML: int = 1
# Global setting: sanitize HTML content (1=enabled, 0=disabled)

When parsing from URLs, the result contains comprehensive HTTP response data:
# HTTP response fields in result
result = {
    'status': int,    # HTTP status code (200, 304, 404, etc.)
    'headers': dict,  # All HTTP response headers
    'etag': str,      # HTTP ETag header for caching
    'modified': str,  # HTTP Last-Modified header
    'href': str,      # Final URL after redirects
}

Set custom User-Agent strings for identification:
import feedparser
# Set global User-Agent for all requests
feedparser.USER_AGENT = 'MyFeedReader/1.0 (+https://example.com/bot.html)'
# Or specify per-request
result = feedparser.parse(
    url,
    agent='MyBot/2.0 (contact@example.com)'
)

Add custom HTTP headers to requests:
# Add authorization
result = feedparser.parse(
    url,
    request_headers={
        'Authorization': 'Bearer your-token-here',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
    }
)
# Override default headers
result = feedparser.parse(
    url,
    request_headers={
        'User-Agent': 'CustomBot/1.0',     # Overrides agent parameter
        'Referer': 'https://example.com',  # Custom referer
    }
)

Use ETags and Last-Modified headers for efficient feed polling:
# Initial request - save caching headers
result = feedparser.parse('https://example.com/feed.xml')
# Store caching information
etag = result.get('etag')
modified = result.get('modified')
# Subsequent conditional request
result = feedparser.parse(
    'https://example.com/feed.xml',
    etag=etag,
    modified=modified
)
# Check if content was modified
if result.status == 304:
    print("Feed not modified - use cached version")
else:
    print(f"Feed updated - {len(result.entries)} entries")

Feedparser supports various authentication methods through custom handlers:
import urllib.request
import feedparser
# Basic authentication
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'https://example.com/', 'username', 'password')
auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
result = feedparser.parse(
    'https://example.com/protected-feed.xml',
    handlers=[auth_handler]
)
# Digest authentication
digest_handler = urllib.request.HTTPDigestAuthHandler(password_mgr)
result = feedparser.parse(
    url,
    handlers=[digest_handler]
)

Configure proxy settings using urllib handlers:
import urllib.request
import feedparser
# HTTP proxy
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
})
result = feedparser.parse(
    url,
    handlers=[proxy_handler]
)
# Authenticated proxy
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('realm', 'proxy.example.com', 'username', 'password')
result = feedparser.parse(
    url,
    handlers=[proxy_handler, proxy_auth_handler]
)

Extend feedparser with custom protocol handlers:
import urllib.request
import feedparser
class CustomHTTPHandler(urllib.request.HTTPHandler):
    def http_open(self, req):
        # Custom HTTP handling logic
        print(f"Fetching: {req.get_full_url()}")
        return super().http_open(req)

custom_handler = CustomHTTPHandler()
result = feedparser.parse(
    url,
    handlers=[custom_handler]
)

Configure SSL settings for HTTPS requests:
import ssl
import urllib.request
import feedparser
# Create SSL context with custom settings
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False # Disable hostname verification
ssl_context.verify_mode = ssl.CERT_NONE # Disable certificate verification
# Create HTTPS handler with custom context
https_handler = urllib.request.HTTPSHandler(context=ssl_context)
result = feedparser.parse(
    'https://example.com/feed.xml',
    handlers=[https_handler]
)

Feedparser automatically follows redirects and provides the final URL:
result = feedparser.parse('https://example.com/redirect-to-feed')
# Check if redirects occurred
original_url = 'https://example.com/redirect-to-feed'
final_url = result.get('href', '')
if final_url and final_url != original_url:
    print(f"Redirected from {original_url} to {final_url}")
# Access redirect history through headers
if 'location' in result.headers:
    print(f"Redirect location: {result.headers['location']}")

Inspect the HTTP response headers of a parse result:

result = feedparser.parse(url)
# Access all headers
headers = result.headers
print(f"Content-Type: {headers.get('content-type')}")
print(f"Content-Length: {headers.get('content-length')}")
print(f"Server: {headers.get('server')}")
# Check for specific caching headers
if 'etag' in headers:
    print(f"ETag: {headers['etag']}")
if 'last-modified' in headers:
    print(f"Last-Modified: {headers['last-modified']}")
# Check content encoding
if 'content-encoding' in headers:
    print(f"Compression: {headers['content-encoding']}")

Useful for testing or when parsing content without HTTP:
# Override/supplement response headers
result = feedparser.parse(
    content_string,
    response_headers={
        'content-type': 'application/rss+xml; charset=utf-8',
        'content-location': 'https://example.com/feed.xml',
        'last-modified': 'Mon, 06 Sep 2021 12:00:00 GMT',
        'etag': '"abc123"'
    }
)
# Headers affect base URI resolution and caching behavior
print(f"Base URI: {result.href}")

Inspect the HTTP status code to handle responses appropriately:

result = feedparser.parse(url)
# Check HTTP status
status = result.get('status', 0)
if status == 200:
    print("Feed fetched successfully")
elif status == 304:
    print("Feed not modified (cached version is current)")
elif status == 404:
    print("Feed not found")
elif status == 403:
    print("Access forbidden")
elif status >= 500:
    print(f"Server error: {status}")
elif status >= 400:
    print(f"Client error: {status}")
else:
    print(f"Unexpected status: {status}")
# Process feed data regardless of minor HTTP issues
if result.entries:
    print(f"Found {len(result.entries)} entries despite HTTP status {status}")

Handle network-level errors raised while fetching:

import urllib.error
import feedparser
try:
    result = feedparser.parse(url)
    # Check for network-related bozo exceptions
    if result.bozo and isinstance(result.bozo_exception, urllib.error.URLError):
        print(f"Network error: {result.bozo_exception}")
        # Specific error types (HTTPError is a subclass of URLError)
        if isinstance(result.bozo_exception, urllib.error.HTTPError):
            print(f"HTTP Error {result.bozo_exception.code}: {result.bozo_exception.reason}")
        else:
            print(f"URL Error: {result.bozo_exception.reason}")
    # Process any data that was retrieved
    if result.entries:
        print("Some data was retrieved despite errors")
except Exception as e:
    print(f"Unexpected error: {e}")

Configure timeouts for feed requests:

import socket
import urllib.request
import feedparser
# Set global socket timeout
socket.setdefaulttimeout(30) # 30 seconds
# Or create custom opener with timeout
opener = urllib.request.build_opener()
result = feedparser.parse(
    url,
    handlers=[opener]
)

Feedparser handles various content types gracefully:
result = feedparser.parse(url)
# Check detected content type
content_type = result.headers.get('content-type', '')
if 'xml' in content_type.lower():
    print("XML content detected")
elif 'html' in content_type.lower():
    print("HTML content - may use loose parser")
# Check for non-XML content type exception
if result.bozo and isinstance(result.bozo_exception, feedparser.NonXMLContentType):
    print(f"Non-XML content type: {content_type}")
    # Feedparser will still attempt to parse

Feedparser automatically handles compressed responses:
# Automatic gzip/deflate decompression
result = feedparser.parse(url)
# Check if content was compressed
content_encoding = result.headers.get('content-encoding', '')
if content_encoding:
    print(f"Content was compressed with: {content_encoding}")
# Request specific compression
result = feedparser.parse(
    url,
    request_headers={
        'Accept-Encoding': 'gzip, deflate, br'
    }
)

Configure module-level defaults that apply to every parse() call:

import feedparser
# Configure global defaults
feedparser.USER_AGENT = 'MyFeedAggregator/1.0 (+https://example.com)'
feedparser.RESOLVE_RELATIVE_URIS = 1 # Enable URI resolution
feedparser.SANITIZE_HTML = 1 # Enable HTML sanitization
# All subsequent parse() calls use these defaults
result1 = feedparser.parse(url1)
result2 = feedparser.parse(url2)
# Override global settings per-request
result3 = feedparser.parse(
    url3,
    agent='SpecialBot/2.0',  # Override global USER_AGENT
    sanitize_html=False      # Override global SANITIZE_HTML
)

Install with Tessl CLI:

npx tessl i tessl/pypi-feedparser