Universal feed parser for RSS, Atom, and CDF feeds with comprehensive format support and robust parsing capabilities
—
Feedparser provides comprehensive date parsing capabilities supporting multiple date formats commonly found in RSS and Atom feeds. The system includes built-in handlers for various date formats and allows registration of custom date parsers.
Register custom date parsing functions to handle non-standard date formats.
def registerDateHandler(func):
"""
Register a date handler function.
Date handlers are tried in reverse registration order (most recently
registered first) until one successfully parses the date string.
Args:
func: Function that takes a date string and returns a 9-tuple date
in GMT, or None if unable to parse. Should handle exceptions
internally and return None rather than raising.
Example:
def my_date_handler(date_string):
try:
# Custom parsing logic here
return time.strptime(date_string, '%Y-%m-%d %H:%M:%S')
except ValueError:
return None
feedparser.registerDateHandler(my_date_handler)
"""
Feedparser includes built-in support for numerous date formats commonly found in feeds:
Standard email/RSS date format:
Mon, 06 Sep 2021 12:00:00 GMT
Mon, 06 Sep 2021 12:00:00 +0000
06 Sep 2021 12:00:00 EST
Standard Atom and modern date format:
2021-09-06T12:00:00Z
2021-09-06T12:00:00+00:00
2021-09-06T12:00:00.123Z
2021-09-06T12:00:00
2021-09-06
Unix/C library date format:
Mon Sep 6 12:00:00 2021
Support for various localized date formats:
Korean Formats:
European Formats:
Version control system date format used by some feeds.
result = feedparser.parse(url)

# Access parsed dates as tuples
if result.feed.updated_parsed:
    updated_tuple = result.feed.updated_parsed
    # updated_tuple is a 9-tuple: (year, month, day, hour, minute, second, weekday, yearday, dst)

# Convert to datetime objects
import calendar
import datetime

if result.feed.updated_parsed:
    # The *_parsed tuples are in GMT, so convert with calendar.timegm().
    # time.mktime() would read the tuple as local time and shift the
    # result by the machine's UTC offset.
    timestamp = calendar.timegm(result.feed.updated_parsed)
    dt = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
    print(f"Feed updated: {dt.isoformat()}")

# Entry dates
for entry in result.entries:
    if entry.published_parsed:
        pub_time = calendar.timegm(entry.published_parsed)
        dt = datetime.datetime.fromtimestamp(pub_time, tz=datetime.timezone.utc)
        print(f"Published: {dt.strftime('%Y-%m-%d %H:%M:%S UTC')}")

import re
import time
import feedparser
def parse_custom_date(date_string):
    """
    Parse a custom date format: "DD/MM/YYYY HH:MM".

    Args:
        date_string: Raw date text to parse.

    Returns:
        A 9-field time tuple (year, month, day, hour, minute, second,
        weekday, yearday, dst) with the fields taken as GMT, or None when
        the string is empty, does not start with the expected pattern, or
        names an impossible date or time (for example 31/02 or hour 25).
    """
    import datetime  # local import keeps the example self-contained

    if not date_string:
        return None

    # Match DD/MM/YYYY HH:MM format
    match = re.match(r'(\d{2})/(\d{2})/(\d{4}) (\d{2}):(\d{2})', date_string)
    if not match:
        return None

    day, month, year, hour, minute = map(int, match.groups())
    try:
        # The regex only guarantees digits. datetime() enforces real
        # calendar/clock ranges, and utctimetuple() fills in the weekday
        # and day-of-year fields with dst=0.
        return datetime.datetime(year, month, day, hour, minute).utctimetuple()
    except (ValueError, OverflowError):
        return None
# Register the custom handler
feedparser.registerDateHandler(parse_custom_date)

# Now feeds with "DD/MM/YYYY HH:MM" dates will be parsed correctly
result = feedparser.parse(feed_with_custom_dates)

import dateutil.parser
import feedparser
def parse_flexible_date(date_string):
    """
    Use dateutil for flexible date parsing as a fallback.

    Args:
        date_string: Raw date text to parse.

    Returns:
        A 9-field time tuple, or None when the string is empty or dateutil
        cannot parse it. Timezone-aware results are converted to GMT;
        naive results are passed through unchanged.
    """
    if not date_string:
        return None
    try:
        # dateutil can parse many formats
        dt = dateutil.parser.parse(date_string)
        # utctimetuple() normalizes timezone-aware values to UTC and leaves
        # naive values as they are, always reporting dst=0. This avoids an
        # explicit astimezone() call and the dateutil.tz module, which the
        # example never imports directly.
        return dt.utctimetuple()
    except (ValueError, TypeError, OverflowError):
        return None
# Register the dateutil-based handler. Handlers are tried in LIFO order, so
# this one is tried before every previously registered handler.
feedparser.registerDateHandler(parse_flexible_date)

import feedparser
# Date parsing is handled automatically by feedparser.parse()
# You don't need to call date parsing functions directly
result = feedparser.parse("https://example.com/feed.xml")
for entry in result.entries:
if hasattr(entry, 'published_parsed') and entry.published_parsed:
import time
readable = time.strftime('%Y-%m-%d %H:%M:%S UTC', entry.published_parsed)
print(f"Published: {readable}")
else:
print("No handler could parse the date")
Date handlers are tried in reverse registration order (LIFO - Last In, First Out):
import feedparser
def handler1(date):
    """Placeholder date handler that never parses anything."""
    return None  # Register first


def handler2(date):
    """Placeholder date handler that never parses anything."""
    return None  # Register second


def handler3(date):
    """Placeholder date handler that never parses anything."""
    return None  # Register third
feedparser.registerDateHandler(handler1)
feedparser.registerDateHandler(handler2)
feedparser.registerDateHandler(handler3)
# When parsing, handlers are tried in this order:
# 1. handler3 (most recently registered)
# 2. handler2
# 3. handler1 (least recently registered)
# 4. Built-in handlers (in their predefined order)
Built-in date handlers are registered in this order (and thus tried in reverse):
So W3C format is tried first, OnBlog format is tried last.
Both feed-level and entry-level objects may contain these date fields:
feed = result.feed
# Publication dates
feed.published # Publication date string
feed.published_parsed # Parsed publication date tuple
# Update dates
feed.updated # Last updated date string
feed.updated_parsed # Parsed last updated date tuple
for entry in result.entries:
# Publication dates
entry.published # Publication date string
entry.published_parsed # Parsed publication date tuple
# Update dates
entry.updated # Last updated date string
entry.updated_parsed # Parsed last updated date tuple
# Creation dates (rare)
entry.created # Creation date string
entry.created_parsed # Parsed creation date tuple
# Expiration dates (rare)
entry.expired # Expiration date string
entry.expired_parsed # Parsed expiration date tuple
Date parsing is designed to be fault-tolerant:
result = feedparser.parse(url)
# Always check if dates were successfully parsed
for entry in result.entries:
if entry.published_parsed:
# Date was successfully parsed
print(f"Published: {entry.published}")
else:
# Date parsing failed or no date present
print(f"No valid publication date found")
if hasattr(entry, 'published'):
print(f"Raw date string: {entry.published}")
All parsed dates are normalized to GMT (UTC):
# All *_parsed dates are in GMT regardless of original timezone
if entry.published_parsed:
    gmt_tuple = entry.published_parsed
    # Convert to local time if needed
    import calendar
    import time
    # calendar.timegm() reads the tuple as UTC. time.mktime() would treat
    # it as local time and skew the result by the machine's UTC offset.
    utc_timestamp = calendar.timegm(gmt_tuple)
    local_time = time.localtime(utc_timestamp)
    print(f"GMT: {time.strftime('%Y-%m-%d %H:%M:%S', gmt_tuple)}")
    print(f"Local: {time.strftime('%Y-%m-%d %H:%M:%S', local_time)}")
FeedParserDict provides backward compatibility for legacy date field names:
# These all refer to the same data:
entry.updated # Modern name
entry.modified # Legacy RSS name
entry.date # Very old legacy name
entry.updated_parsed # Modern name
entry.modified_parsed # Legacy RSS name
entry.date_parsed # Very old legacy name
Install with Tessl CLI
npx tessl i tessl/pypi-feedparser