Streaming WARC (and ARC) IO library for reading and writing web archive files
Comprehensive time handling functions for web archive timestamps with support for multiple date formats, timezone handling, and conversion between various timestamp representations used in WARC files.
Complete set of functions for converting between different date and time formats commonly used in web archiving.
def iso_date_to_datetime(string, tz_aware=False):
"""
Parse ISO 8601 date string to datetime object.
Args:
string (str): ISO 8601 date string (e.g., '2021-01-01T12:00:00Z')
tz_aware (bool): Whether to return timezone-aware datetime
Returns:
datetime: Parsed datetime object
"""
def http_date_to_datetime(string, tz_aware=False):
"""
Parse HTTP date string to datetime object.
Args:
string (str): HTTP date string (e.g., 'Fri, 01 Jan 2021 12:00:00 GMT')
tz_aware (bool): Whether to return timezone-aware datetime
Returns:
datetime: Parsed datetime object
"""
def datetime_to_http_date(the_datetime):
"""
Convert datetime to HTTP date string format.
Args:
the_datetime (datetime): Datetime object to convert
Returns:
str: HTTP date string (e.g., 'Fri, 01 Jan 2021 12:00:00 GMT')
"""
def datetime_to_iso_date(the_datetime, use_micros=False):
"""
Convert datetime to ISO 8601 date string.
Args:
the_datetime (datetime): Datetime object to convert
use_micros (bool): Whether to include microseconds in output
Returns:
str: ISO 8601 date string
"""
def datetime_to_timestamp(the_datetime):
"""
Convert datetime to 14-digit timestamp format.
Args:
the_datetime (datetime): Datetime object to convert
Returns:
str: 14-digit timestamp (YYYYMMDDHHMMSS)
"""Functions for working with numeric timestamp formats used in web archiving.
def timestamp_now():
"""
Get current timestamp in 14-digit format.
Returns:
str: Current timestamp (YYYYMMDDHHMMSS)
"""
def timestamp20_now():
"""
Get current timestamp in 20-digit format with microseconds.
Returns:
str: Current timestamp (YYYYMMDDHHMMSSNNNNNN)
"""
def iso_date_to_timestamp(string):
"""
Convert ISO 8601 date string to 14-digit timestamp.
Args:
string (str): ISO 8601 date string
Returns:
str: 14-digit timestamp
"""
def timestamp_to_iso_date(string):
"""
Convert 14-digit timestamp to ISO 8601 date string.
Args:
string (str): 14-digit timestamp
Returns:
str: ISO 8601 date string
"""
def http_date_to_timestamp(string):
"""
Convert HTTP date string to 14-digit timestamp.
Args:
string (str): HTTP date string
Returns:
str: 14-digit timestamp
"""
def timestamp_to_http_date(string):
"""
Convert 14-digit timestamp to HTTP date string.
Args:
string (str): 14-digit timestamp
Returns:
str: HTTP date string
"""
def timestamp_to_datetime(string, tz_aware=False):
"""
Parse 14-digit timestamp to datetime object.
Args:
string (str): 14-digit timestamp
tz_aware (bool): Whether to return timezone-aware datetime
Returns:
datetime: Parsed datetime object
"""
def timestamp_to_sec(string):
"""
Convert 14-digit timestamp to seconds since Unix epoch.
Args:
string (str): 14-digit timestamp
Returns:
float: Seconds since Unix epoch
"""
def sec_to_timestamp(secs):
"""
Convert seconds since Unix epoch to 14-digit timestamp.
Args:
secs (float): Seconds since Unix epoch
Returns:
str: 14-digit timestamp
"""Functions for formatting and padding timestamps to specific lengths.
def pad_timestamp(string, pad_str=PAD_6_UP):
"""
Pad timestamp to specified length using padding string.
Args:
string (str): Timestamp to pad
pad_str (str): Padding string pattern to use
Returns:
str: Padded timestamp
"""from warcio.timeutils import (
iso_date_to_datetime, datetime_to_iso_date,
http_date_to_datetime, datetime_to_http_date,
datetime_to_timestamp, timestamp_to_datetime
)
from datetime import datetime
# Parse ISO date
iso_string = "2021-01-01T12:00:00Z"
dt = iso_date_to_datetime(iso_string)
print(f"Parsed datetime: {dt}")
# Convert back to ISO
iso_back = datetime_to_iso_date(dt)
print(f"Back to ISO: {iso_back}")
# Parse HTTP date
http_string = "Fri, 01 Jan 2021 12:00:00 GMT"
dt_http = http_date_to_datetime(http_string)
print(f"HTTP datetime: {dt_http}")
# Convert to HTTP format
http_back = datetime_to_http_date(dt_http)
print(f"Back to HTTP: {http_back}")
# Convert to 14-digit timestamp
timestamp = datetime_to_timestamp(dt)
print(f"14-digit timestamp: {timestamp}")
# Parse timestamp back to datetime
dt_from_ts = timestamp_to_datetime(timestamp)
print(f"From timestamp: {dt_from_ts}")from warcio.timeutils import timestamp_now, timestamp20_now
# Get current timestamp in different formats
current_14 = timestamp_now()
current_20 = timestamp20_now()
print(f"Current 14-digit: {current_14}")
print(f"Current 20-digit: {current_20}")
# Example outputs:
# Current 14-digit: 20210101120000
# Current 20-digit: 20210101120000123456
from warcio.timeutils import (
iso_date_to_timestamp, timestamp_to_iso_date,
http_date_to_timestamp, timestamp_to_http_date
)
# Direct conversions without intermediate datetime objects
iso_date = "2021-01-01T12:00:00Z"
timestamp = iso_date_to_timestamp(iso_date)
print(f"ISO to timestamp: {iso_date} -> {timestamp}")
# Convert timestamp back to ISO
iso_back = timestamp_to_iso_date(timestamp)
print(f"Timestamp to ISO: {timestamp} -> {iso_back}")
# HTTP date to timestamp
http_date = "Fri, 01 Jan 2021 12:00:00 GMT"
timestamp_from_http = http_date_to_timestamp(http_date)
print(f"HTTP to timestamp: {http_date} -> {timestamp_from_http}")
# Timestamp to HTTP date
http_back = timestamp_to_http_date(timestamp_from_http)
print(f"Timestamp to HTTP: {timestamp_from_http} -> {http_back}")from warcio.timeutils import timestamp_to_sec, sec_to_timestamp
import time
# Convert 14-digit timestamp to Unix seconds
timestamp = "20210101120000"
unix_seconds = timestamp_to_sec(timestamp)
print(f"Timestamp to seconds: {timestamp} -> {unix_seconds}")
# Convert Unix seconds back to timestamp
timestamp_back = sec_to_timestamp(unix_seconds)
print(f"Seconds to timestamp: {unix_seconds} -> {timestamp_back}")
# Work with current Unix time
current_time = time.time()
current_timestamp = sec_to_timestamp(current_time)
print(f"Current Unix time: {current_time}")
print(f"As timestamp: {current_timestamp}")from warcio.timeutils import (
iso_date_to_datetime, http_date_to_datetime,
timestamp_to_datetime
)
# Parse with timezone awareness
iso_with_tz = "2021-01-01T12:00:00+05:00"
dt_tz_aware = iso_date_to_datetime(iso_with_tz, tz_aware=True)
print(f"Timezone-aware datetime: {dt_tz_aware}")
print(f"Timezone info: {dt_tz_aware.tzinfo}")
# Parse without timezone awareness (default)
dt_naive = iso_date_to_datetime(iso_with_tz, tz_aware=False)
print(f"Naive datetime: {dt_naive}")
print(f"Timezone info: {dt_naive.tzinfo}")
# Same applies to other parsing functions
http_date = "Fri, 01 Jan 2021 12:00:00 GMT"
http_tz_aware = http_date_to_datetime(http_date, tz_aware=True)
print(f"HTTP timezone-aware: {http_tz_aware}")
timestamp = "20210101120000"
ts_tz_aware = timestamp_to_datetime(timestamp, tz_aware=True)
print(f"Timestamp timezone-aware: {ts_tz_aware}")from warcio.timeutils import datetime_to_iso_date, timestamp20_now
from datetime import datetime
# Create datetime with microseconds
dt_with_micros = datetime(2021, 1, 1, 12, 0, 0, 123456)
# Convert to ISO with microseconds
iso_with_micros = datetime_to_iso_date(dt_with_micros, use_micros=True)
print(f"ISO with microseconds: {iso_with_micros}")
# Convert without microseconds (default)
iso_without_micros = datetime_to_iso_date(dt_with_micros, use_micros=False)
print(f"ISO without microseconds: {iso_without_micros}")
# 20-digit timestamp includes microseconds
timestamp_20 = timestamp20_now()
print(f"20-digit timestamp: {timestamp_20}")
print(f" Date part: {timestamp_20[:8]}")
print(f" Time part: {timestamp_20[8:14]}")
print(f" Microsec part: {timestamp_20[14:]}")from warcio.timeutils import pad_timestamp
# Example of timestamp padding (actual padding constants depend on implementation)
short_timestamp = "202101" # Partial timestamp
padded = pad_timestamp(short_timestamp)
print(f"Padded timestamp: {short_timestamp} -> {padded}")
# This function is typically used internally for normalizing timestamps
# to consistent lengths for sorting and comparison
from warcio.timeutils import datetime_to_iso_date, timestamp_now
from datetime import datetime
# Create WARC-Date header value (ISO format)
current_time = datetime.utcnow()
warc_date = datetime_to_iso_date(current_time)
print(f"WARC-Date: {warc_date}")
# Alternative: timestamp_now() gives the compact 14-digit form (YYYYMMDDHHMMSS), which is not ISO 8601
warc_date_alt = timestamp_now()
print(f"WARC-Date (timestamp): {warc_date_alt}")
# For WARC files, ISO format is preferred:
# WARC-Date: 2021-01-01T12:00:00Z
from warcio.timeutils import iso_date_to_timestamp, timestamp_to_http_date
# Process multiple dates
dates = [
"2021-01-01T12:00:00Z",
"2021-06-15T14:30:45Z",
"2021-12-31T23:59:59Z"
]
# Convert to timestamps for sorting
timestamps = [iso_date_to_timestamp(date) for date in dates]
print("Timestamps for sorting:")
for orig, ts in zip(dates, timestamps):
    print(f" {orig} -> {ts}")
# Convert timestamps to HTTP dates for headers
http_dates = [timestamp_to_http_date(ts) for ts in timestamps]
print("\nHTTP date headers:")
for ts, http in zip(timestamps, http_dates):
    print(f" {ts} -> {http}")
Install with Tessl CLI
npx tessl i tessl/pypi-warcio