tessl/pypi-feedparser

Universal feed parser for RSS, Atom, and CDF feeds with comprehensive format support and robust parsing capabilities

docs/http-features.md

HTTP Features

Feedparser provides comprehensive HTTP client capabilities for fetching feeds from URLs, including conditional requests, custom headers, authentication support, and redirect handling.

Capabilities

Global Configuration Constants

Configure default HTTP behavior for all parsing operations.

```python
USER_AGENT: str = "feedparser/{version} +https://github.com/kurtmckee/feedparser/"
# Default HTTP User-Agent header sent with requests

RESOLVE_RELATIVE_URIS: int = 1
# Global setting: resolve relative URIs to absolute (1=enabled, 0=disabled)

SANITIZE_HTML: int = 1
# Global setting: sanitize HTML content (1=enabled, 0=disabled)
```

HTTP Response Information

When parsing from URLs, the result contains comprehensive HTTP response data:

```python
# HTTP response fields in the result (present when parsing from a URL)
result = {
    'status': int,    # HTTP status code (200, 304, 404, etc.)
    'headers': dict,  # All HTTP response headers
    'etag': str,      # HTTP ETag header for caching
    'modified': str,  # HTTP Last-Modified header
    'href': str,      # Final URL after redirects
}
```

HTTP Client Features

User-Agent Configuration

Set custom User-Agent strings for identification:

```python
import feedparser

# Set global User-Agent for all requests
feedparser.USER_AGENT = 'MyFeedReader/1.0 (+https://example.com/bot.html)'

# Or specify per-request
result = feedparser.parse(
    url,
    agent='MyBot/2.0 (contact@example.com)'
)
```

Custom Request Headers

Add custom HTTP headers to requests:

```python
# Add authorization
result = feedparser.parse(
    url,
    request_headers={
        'Authorization': 'Bearer your-token-here',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
    }
)

# Override default headers
result = feedparser.parse(
    url,
    request_headers={
        'User-Agent': 'CustomBot/1.0',     # Overrides the agent parameter
        'Referer': 'https://example.com',  # Custom referer
    }
)
```

Conditional Requests (Caching)

Use ETags and Last-Modified headers for efficient feed polling:

```python
# Initial request - save caching headers
result = feedparser.parse('https://example.com/feed.xml')

# Store caching information
etag = result.get('etag')
modified = result.get('modified')

# Subsequent conditional request
result = feedparser.parse(
    'https://example.com/feed.xml',
    etag=etag,
    modified=modified
)

# Check if content was modified
if result.status == 304:
    print("Feed not modified - use cached version")
else:
    print(f"Feed updated - {len(result.entries)} entries")
```
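The validators above need to be persisted between polls. A minimal in-memory sketch (the `FeedCache` class and its storage layout are illustrative, not part of feedparser):

```python
# Minimal in-memory cache of conditional-request validators per feed URL.
# (Illustrative helper, not part of the feedparser API.)
class FeedCache:
    def __init__(self):
        self._store = {}  # url -> {'etag': ..., 'modified': ...}

    def headers_for(self, url):
        # Values to pass as etag=/modified= on the next parse() call
        entry = self._store.get(url, {})
        return entry.get('etag'), entry.get('modified')

    def update(self, url, result):
        # `result` is the mapping returned by feedparser.parse();
        # a 304 means the cached validators are still current.
        if result.get('status') == 304:
            return False
        self._store[url] = {
            'etag': result.get('etag'),
            'modified': result.get('modified'),
        }
        return True

cache = FeedCache()
etag, modified = cache.headers_for('https://example.com/feed.xml')
# result = feedparser.parse(url, etag=etag, modified=modified)
# cache.update(url, result)
```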

HTTP Authentication

Feedparser supports various authentication methods through custom handlers:

```python
import urllib.request
import feedparser

# Basic authentication
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'https://example.com/', 'username', 'password')

auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

result = feedparser.parse(
    'https://example.com/protected-feed.xml',
    handlers=[auth_handler]
)

# Digest authentication
digest_handler = urllib.request.HTTPDigestAuthHandler(password_mgr)

result = feedparser.parse(
    url,
    handlers=[digest_handler]
)
```

Proxy Support

Configure proxy settings using urllib handlers:

```python
import urllib.request
import feedparser

# HTTP proxy (the proxy itself is normally addressed over http://,
# even when it forwards HTTPS traffic)
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://proxy.example.com:8080',
    'https': 'http://proxy.example.com:8080',
})

result = feedparser.parse(
    url,
    handlers=[proxy_handler]
)

# Authenticated proxy
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('realm', 'proxy.example.com', 'username', 'password')

result = feedparser.parse(
    url,
    handlers=[proxy_handler, proxy_auth_handler]
)
```
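As an alternative to an explicit mapping, urllib also picks up proxies from the standard environment variables (`HTTP_PROXY`, `HTTPS_PROXY`, etc.); a quick stdlib-only way to inspect what it would use:

```python
import urllib.request

# urllib reads proxy settings from the environment;
# ProxyHandler() constructed with no argument uses exactly this mapping
proxies = urllib.request.getproxies()
print(sorted(proxies))  # e.g. ['http', 'https'] when both variables are set
```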

Custom URL Handlers

Extend feedparser with custom protocol handlers:

```python
import urllib.request
import feedparser

class CustomHTTPHandler(urllib.request.HTTPHandler):
    def http_open(self, req):
        # Custom HTTP handling logic
        print(f"Fetching: {req.get_full_url()}")
        return super().http_open(req)

custom_handler = CustomHTTPHandler()

result = feedparser.parse(
    url,
    handlers=[custom_handler]
)
```

SSL/TLS Configuration

Configure SSL settings for HTTPS requests:

```python
import ssl
import urllib.request
import feedparser

# Create SSL context with custom settings.
# WARNING: disabling verification removes TLS security guarantees;
# do this only for testing against hosts you control.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False       # Disable hostname verification
ssl_context.verify_mode = ssl.CERT_NONE  # Disable certificate verification

# Create HTTPS handler with the custom context
https_handler = urllib.request.HTTPSHandler(context=ssl_context)

result = feedparser.parse(
    'https://example.com/feed.xml',
    handlers=[https_handler]
)
```

Redirect Handling

Feedparser automatically follows redirects and provides final URL:

```python
result = feedparser.parse('https://example.com/redirect-to-feed')

# href contains the final URL after any redirects
original_url = 'https://example.com/redirect-to-feed'
final_url = result.get('href', '')

if final_url and final_url != original_url:
    print(f"Redirected from {original_url} to {final_url}")

# A status of 301 signals a permanent redirect: update stored feed URLs
if result.get('status') == 301:
    print(f"Feed moved permanently - update your records to {final_url}")
```
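Because only a permanent (301) redirect should change the URL you store for future polls, the decision can be factored into a small helper (hypothetical helper, not part of feedparser):

```python
def updated_feed_url(stored_url, status, href):
    """Return the URL to store for future polls.

    A 301 status marks a permanent move, so only then should the
    stored URL be replaced by the final `href` reported by the parser.
    (Illustrative helper, not part of the feedparser API.)
    """
    if status == 301 and href:
        return href
    return stored_url

print(updated_feed_url('https://old.example.com/feed', 301, 'https://new.example.com/feed'))
# https://new.example.com/feed
print(updated_feed_url('https://old.example.com/feed', 302, 'https://new.example.com/feed'))
# https://old.example.com/feed (temporary redirect: keep the original)
```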

Response Header Handling

Accessing Response Headers

```python
result = feedparser.parse(url)

# Access all headers
headers = result.headers
print(f"Content-Type: {headers.get('content-type')}")
print(f"Content-Length: {headers.get('content-length')}")
print(f"Server: {headers.get('server')}")

# Check for specific caching headers
if 'etag' in headers:
    print(f"ETag: {headers['etag']}")

if 'last-modified' in headers:
    print(f"Last-Modified: {headers['last-modified']}")

# Check content encoding
if 'content-encoding' in headers:
    print(f"Compression: {headers['content-encoding']}")
```
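The Last-Modified value is an RFC 7231 HTTP-date string; if you need it as a datetime, the standard library can parse it directly, no feedparser required:

```python
from email.utils import parsedate_to_datetime

# HTTP dates use the RFC 2822/7231 format; parsedate_to_datetime
# returns a timezone-aware datetime
last_modified = 'Mon, 06 Sep 2021 12:00:00 GMT'
dt = parsedate_to_datetime(last_modified)
print(dt.isoformat())  # 2021-09-06T12:00:00+00:00
```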

Overriding Response Headers

Useful for testing or when parsing content without HTTP:

```python
# Override/supplement response headers
result = feedparser.parse(
    content_string,
    response_headers={
        'content-type': 'application/rss+xml; charset=utf-8',
        'content-location': 'https://example.com/feed.xml',
        'last-modified': 'Mon, 06 Sep 2021 12:00:00 GMT',
        'etag': '"abc123"'
    }
)

# content-location supplies the base URI for resolving relative links
print(f"Base URI: {result.get('href', '')}")
```

Error Handling

HTTP Status Codes

```python
result = feedparser.parse(url)

# Check HTTP status
status = result.get('status', 0)

if status == 200:
    print("Feed fetched successfully")
elif status == 304:
    print("Feed not modified (cached version is current)")
elif status == 404:
    print("Feed not found")
elif status == 403:
    print("Access forbidden")
elif status >= 500:
    print(f"Server error: {status}")
elif status >= 400:
    print(f"Client error: {status}")
else:
    print(f"Unexpected status: {status}")

# Process feed data regardless of minor HTTP issues
if result.entries:
    print(f"Found {len(result.entries)} entries despite HTTP status {status}")
```

Network Error Handling

```python
import urllib.error
import feedparser

try:
    result = feedparser.parse(url)

    # Check for network-related bozo exceptions
    if result.bozo and isinstance(result.bozo_exception, urllib.error.URLError):
        print(f"Network error: {result.bozo_exception}")

        # HTTPError subclasses URLError, so narrow the check further
        if isinstance(result.bozo_exception, urllib.error.HTTPError):
            print(f"HTTP Error {result.bozo_exception.code}: {result.bozo_exception.reason}")
        else:
            print(f"URL Error: {result.bozo_exception.reason}")

    # Process any data that was retrieved
    if result.entries:
        print("Some data was retrieved despite errors")

except Exception as e:
    print(f"Unexpected error: {e}")
```

Timeout Configuration

```python
import socket
import feedparser

# feedparser.parse() has no timeout parameter; set a global
# socket timeout, which applies to all urllib-based requests
socket.setdefaulttimeout(30)  # 30 seconds

result = feedparser.parse(url)
```

Content-Type Handling

Feedparser handles various content types gracefully:

```python
result = feedparser.parse(url)

# Check detected content type
content_type = result.headers.get('content-type', '')

if 'xml' in content_type.lower():
    print("XML content detected")
elif 'html' in content_type.lower():
    print("HTML content - may use loose parser")

# Check for non-XML content type exception
if result.bozo and isinstance(result.bozo_exception, feedparser.NonXMLContentType):
    print(f"Non-XML content type: {content_type}")
    # Feedparser will still attempt to parse
```

Compression Support

Feedparser automatically handles compressed responses:

```python
# Automatic gzip/deflate decompression
result = feedparser.parse(url)

# Check if content was compressed
content_encoding = result.headers.get('content-encoding', '')
if content_encoding:
    print(f"Content was compressed with: {content_encoding}")

# Advertise the encodings you can accept (the urllib-based fetcher
# decodes gzip and deflate; it does not decode brotli)
result = feedparser.parse(
    url,
    request_headers={
        'Accept-Encoding': 'gzip, deflate'
    }
)
```

Global Configuration Examples

```python
import feedparser

# Configure global defaults
feedparser.USER_AGENT = 'MyFeedAggregator/1.0 (+https://example.com)'
feedparser.RESOLVE_RELATIVE_URIS = 1  # Enable URI resolution
feedparser.SANITIZE_HTML = 1  # Enable HTML sanitization

# All subsequent parse() calls use these defaults
result1 = feedparser.parse(url1)
result2 = feedparser.parse(url2)

# Override global settings per-request
result3 = feedparser.parse(
    url3,
    agent='SpecialBot/2.0',  # Override global USER_AGENT
    sanitize_html=False      # Override global SANITIZE_HTML
)
```

Install with Tessl CLI

npx tessl i tessl/pypi-feedparser
