CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-warcio

Streaming WARC (and ARC) IO library for reading and writing web archive files

Overview
Eval results
Files

docs/time-utilities.md

Time Utilities

Comprehensive time handling functions for web archive timestamps with support for multiple date formats, timezone handling, and conversion between various timestamp representations used in WARC files.

Capabilities

Date and Time Conversion Functions

Complete set of functions for converting between different date and time formats commonly used in web archiving.

def iso_date_to_datetime(string, tz_aware=False):
    """
    Parse an ISO 8601 date string into a datetime object.

    Args:
        string (str): ISO 8601 date string (e.g., '2021-01-01T12:00:00Z')
        tz_aware (bool): If True, return a timezone-aware datetime;
            the default (False) returns a naive datetime whose
            ``tzinfo`` is None

    Returns:
        datetime: Parsed datetime object

    NOTE(review): presumably the input is expected to be in UTC ('Z') and
    other UTC offsets are not converted -- confirm against the warcio
    implementation.
    """

def http_date_to_datetime(string, tz_aware=False):
    """
    Parse an HTTP date string into a datetime object.

    Args:
        string (str): HTTP date string (e.g., 'Fri, 01 Jan 2021 12:00:00 GMT')
        tz_aware (bool): If True, return a timezone-aware datetime;
            the default (False) returns a naive datetime

    Returns:
        datetime: Parsed datetime object
    """

def datetime_to_http_date(the_datetime):
    """
    Format a datetime object as an HTTP date string.

    This is the reverse direction of http_date_to_datetime().

    Args:
        the_datetime (datetime): Datetime object to convert

    Returns:
        str: HTTP date string (e.g., 'Fri, 01 Jan 2021 12:00:00 GMT')

    NOTE(review): the output carries a 'GMT' suffix, so the_datetime is
    presumably expected to represent UTC -- confirm.
    """

def datetime_to_iso_date(the_datetime, use_micros=False):
    """
    Format a datetime object as an ISO 8601 date string.

    This is the format used for the WARC-Date header value.

    Args:
        the_datetime (datetime): Datetime object to convert
        use_micros (bool): If True, include microseconds in the output;
            they are left out by default

    Returns:
        str: ISO 8601 date string (e.g., '2021-01-01T12:00:00Z')
    """

def datetime_to_timestamp(the_datetime):
    """
    Format a datetime object as a 14-digit timestamp.

    Args:
        the_datetime (datetime): Datetime object to convert

    Returns:
        str: 14-digit timestamp (YYYYMMDDHHMMSS), e.g. '20210101120000'
    """

Timestamp Functions

Functions for working with numeric timestamp formats used in web archiving.

def timestamp_now():
    """
    Return the current time as a 14-digit timestamp.

    Takes no arguments.

    Returns:
        str: Current timestamp (YYYYMMDDHHMMSS), e.g. '20210101120000'

    NOTE(review): presumably the current time is taken in UTC -- confirm.
    """

def timestamp20_now():
    """
    Return the current time as a 20-digit timestamp with microseconds.

    The first 14 digits are the YYYYMMDDHHMMSS form; the remaining
    6 digits are the microseconds.

    Returns:
        str: Current timestamp (YYYYMMDDHHMMSSNNNNNN),
            e.g. '20210101120000123456'
    """

def iso_date_to_timestamp(string):
    """
    Convert an ISO 8601 date string directly to a 14-digit timestamp.

    The caller does not need to handle an intermediate datetime object.

    Args:
        string (str): ISO 8601 date string (e.g., '2021-01-01T12:00:00Z')

    Returns:
        str: 14-digit timestamp (YYYYMMDDHHMMSS)
    """

def timestamp_to_iso_date(string):
    """
    Convert a 14-digit timestamp directly to an ISO 8601 date string.

    This is the reverse direction of iso_date_to_timestamp().

    Args:
        string (str): 14-digit timestamp (YYYYMMDDHHMMSS)

    Returns:
        str: ISO 8601 date string
    """

def http_date_to_timestamp(string):
    """
    Convert an HTTP date string directly to a 14-digit timestamp.

    Args:
        string (str): HTTP date string
            (e.g., 'Fri, 01 Jan 2021 12:00:00 GMT')

    Returns:
        str: 14-digit timestamp (YYYYMMDDHHMMSS)
    """

def timestamp_to_http_date(string):
    """
    Convert a 14-digit timestamp directly to an HTTP date string.

    This is the reverse direction of http_date_to_timestamp(); the result
    is suitable for use as an HTTP date header value.

    Args:
        string (str): 14-digit timestamp (YYYYMMDDHHMMSS)

    Returns:
        str: HTTP date string
    """

def timestamp_to_datetime(string, tz_aware=False):
    """
    Parse a 14-digit timestamp into a datetime object.

    Args:
        string (str): 14-digit timestamp (e.g., '20210101120000')
        tz_aware (bool): If True, return a timezone-aware datetime;
            the default (False) returns a naive datetime

    Returns:
        datetime: Parsed datetime object

    NOTE(review): shorter (partial) timestamps may also be accepted and
    padded before parsing -- confirm against the warcio implementation.
    """

def timestamp_to_sec(string):
    """
    Convert a 14-digit timestamp to seconds since the Unix epoch.

    Args:
        string (str): 14-digit timestamp (e.g., '20210101120000')

    Returns:
        float: Seconds since Unix epoch

    NOTE(review): the return type is documented here as float; a 14-digit
    timestamp has whole-second resolution, so the implementation may in
    fact return an int -- confirm.
    """

def sec_to_timestamp(secs):
    """
    Convert seconds since the Unix epoch to a 14-digit timestamp.

    This is the reverse direction of timestamp_to_sec().

    Args:
        secs (float): Seconds since Unix epoch, such as the value
            returned by time.time()

    Returns:
        str: 14-digit timestamp (YYYYMMDDHHMMSS)
    """

Timestamp Formatting and Padding

Functions for formatting and padding timestamps to specific lengths.

def pad_timestamp(string, pad_str=PAD_6_UP):
    """
    Pad a partial timestamp to a fixed length using a padding string.

    Typically used internally to normalize timestamps to consistent
    lengths for sorting and comparison.

    Args:
        string (str): Timestamp to pad (may be partial, e.g. '202101')
        pad_str (str): Padding string pattern to use; defaults to the
            PAD_6_UP constant, which is not defined in this listing

    Returns:
        str: Padded timestamp

    NOTE(review): presumably the target length is the length of pad_str
    and inputs that are already that long are returned unchanged --
    confirm against the warcio implementation.
    """

Usage Examples

Basic Date Conversions

from warcio.timeutils import (
    iso_date_to_datetime, datetime_to_iso_date,
    http_date_to_datetime, datetime_to_http_date,
    datetime_to_timestamp, timestamp_to_datetime
)
from datetime import datetime

# ISO 8601 round trip: string -> datetime -> string
parsed_iso = iso_date_to_datetime("2021-01-01T12:00:00Z")
print(f"Parsed datetime: {parsed_iso}")
print(f"Back to ISO: {datetime_to_iso_date(parsed_iso)}")

# HTTP date round trip: string -> datetime -> string
parsed_http = http_date_to_datetime("Fri, 01 Jan 2021 12:00:00 GMT")
print(f"HTTP datetime: {parsed_http}")
print(f"Back to HTTP: {datetime_to_http_date(parsed_http)}")

# 14-digit timestamp round trip: datetime -> string -> datetime
ts14 = datetime_to_timestamp(parsed_iso)
print(f"14-digit timestamp: {ts14}")
print(f"From timestamp: {timestamp_to_datetime(ts14)}")

Current Timestamps

from warcio.timeutils import timestamp_now, timestamp20_now

# Take "now" in both supported widths: 14 digits (seconds) and
# 20 digits (seconds plus microseconds)
now_14, now_20 = timestamp_now(), timestamp20_now()

print(f"Current 14-digit: {now_14}")
print(f"Current 20-digit: {now_20}")

# Sample output (values depend on when the snippet runs):
# Current 14-digit: 20210101120000
# Current 20-digit: 20210101120000123456

Cross-Format Conversions

from warcio.timeutils import (
    iso_date_to_timestamp, timestamp_to_iso_date,
    http_date_to_timestamp, timestamp_to_http_date
)

# ISO 8601 -> 14-digit timestamp -> ISO 8601, with no datetime object
# handled by the caller
iso_in = "2021-01-01T12:00:00Z"
ts_from_iso = iso_date_to_timestamp(iso_in)
print(f"ISO to timestamp: {iso_in} -> {ts_from_iso}")
print(f"Timestamp to ISO: {ts_from_iso} -> {timestamp_to_iso_date(ts_from_iso)}")

# HTTP date -> 14-digit timestamp -> HTTP date
http_in = "Fri, 01 Jan 2021 12:00:00 GMT"
ts_from_http = http_date_to_timestamp(http_in)
print(f"HTTP to timestamp: {http_in} -> {ts_from_http}")
print(f"Timestamp to HTTP: {ts_from_http} -> {timestamp_to_http_date(ts_from_http)}")

Unix Epoch Conversions

from warcio.timeutils import timestamp_to_sec, sec_to_timestamp
import time

# 14-digit timestamp -> seconds since the Unix epoch -> 14-digit timestamp
ts14 = "20210101120000"
epoch_secs = timestamp_to_sec(ts14)
print(f"Timestamp to seconds: {ts14} -> {epoch_secs}")
print(f"Seconds to timestamp: {epoch_secs} -> {sec_to_timestamp(epoch_secs)}")

# The same conversion applied to the current wall-clock time
now_secs = time.time()
now_ts = sec_to_timestamp(now_secs)
print(f"Current Unix time: {now_secs}")
print(f"As timestamp: {now_ts}")

Timezone Handling

from warcio.timeutils import (
    iso_date_to_datetime, http_date_to_datetime, 
    timestamp_to_datetime
)

# Parse with timezone awareness.
# The parser reads the numeric date/time fields of a UTC ("Z") date
# string; it does not convert UTC offsets such as "+05:00", so the input
# must already be expressed in UTC.
iso_with_tz = "2021-01-01T12:00:00Z"
dt_tz_aware = iso_date_to_datetime(iso_with_tz, tz_aware=True)
print(f"Timezone-aware datetime: {dt_tz_aware}")
print(f"Timezone info: {dt_tz_aware.tzinfo}")  # UTC

# Parse without timezone awareness (default): same fields, no tzinfo
dt_naive = iso_date_to_datetime(iso_with_tz, tz_aware=False)
print(f"Naive datetime: {dt_naive}")
print(f"Timezone info: {dt_naive.tzinfo}")  # None

# The tz_aware flag works the same way in the other parsing functions
http_date = "Fri, 01 Jan 2021 12:00:00 GMT"
http_tz_aware = http_date_to_datetime(http_date, tz_aware=True)
print(f"HTTP timezone-aware: {http_tz_aware}")

timestamp = "20210101120000"
ts_tz_aware = timestamp_to_datetime(timestamp, tz_aware=True)
print(f"Timestamp timezone-aware: {ts_tz_aware}")

Microsecond Precision

from warcio.timeutils import datetime_to_iso_date, timestamp20_now
from datetime import datetime

# A fixed datetime carrying a non-zero microsecond field
sample = datetime(
    year=2021, month=1, day=1,
    hour=12, minute=0, second=0, microsecond=123456,
)

# use_micros=True keeps the microseconds in the ISO string
print(f"ISO with microseconds: {datetime_to_iso_date(sample, use_micros=True)}")

# use_micros=False (the default) leaves them out
print(f"ISO without microseconds: {datetime_to_iso_date(sample, use_micros=False)}")

# A 20-digit timestamp is 8 date digits, 6 time digits, 6 microsecond digits
now_20 = timestamp20_now()
date_part, time_part, micro_part = now_20[:8], now_20[8:14], now_20[14:]
print(f"20-digit timestamp: {now_20}")
print(f"  Date part: {date_part}")
print(f"  Time part: {time_part}")
print(f"  Microsec part: {micro_part}")

Timestamp Padding

from warcio.timeutils import pad_timestamp

# pad_timestamp() fills in the missing trailing digits of a partial
# timestamp from pad_str. With the default pad string (PAD_6_UP) the
# result is 6 digits long (YYYYMM). An input that already has 6 or more
# digits is returned unchanged, so a year-only input is used here to make
# the padding visible.
short_timestamp = "2021"  # Partial timestamp (year only)
padded = pad_timestamp(short_timestamp)
print(f"Padded timestamp: {short_timestamp} -> {padded}")
# Expected output: Padded timestamp: 2021 -> 202112

# This function is typically used internally for normalizing timestamps
# to consistent lengths for sorting and comparison

WARC Date Creation

from warcio.timeutils import datetime_to_iso_date, timestamp_now
from datetime import datetime, timezone

# Create WARC-Date header value (ISO format).
# datetime.utcnow() is deprecated since Python 3.12, so take an aware
# UTC "now" instead; the formatted date/time fields are the same.
current_time = datetime.now(timezone.utc)
warc_date = datetime_to_iso_date(current_time)
print(f"WARC-Date: {warc_date}")

# The same instant as a compact 14-digit timestamp. This form is NOT a
# valid WARC-Date value, so it is labelled as a timestamp rather than as
# a header.
compact_now = timestamp_now()
print(f"14-digit timestamp: {compact_now}")

# WARC-Date headers use the ISO format:
# WARC-Date: 2021-01-01T12:00:00Z

Batch Date Processing

from warcio.timeutils import iso_date_to_timestamp, timestamp_to_http_date

# ISO 8601 inputs to convert in bulk
iso_inputs = [
    "2021-01-01T12:00:00Z",
    "2021-06-15T14:30:45Z",
    "2021-12-31T23:59:59Z",
]

# 14-digit timestamps sort chronologically as plain strings
ts14_values = list(map(iso_date_to_timestamp, iso_inputs))
print("Timestamps for sorting:")
for iso_value, ts14 in zip(iso_inputs, ts14_values):
    print(f"  {iso_value} -> {ts14}")

# The same instants rendered as HTTP dates for use in headers
header_values = list(map(timestamp_to_http_date, ts14_values))
print("\nHTTP date headers:")
for ts14, header in zip(ts14_values, header_values):
    print(f"  {ts14} -> {header}")

Install with Tessl CLI

npx tessl i tessl/pypi-warcio

docs

archive-reading.md

cli-tools.md

http-capture.md

http-headers.md

index.md

stream-processing.md

time-utilities.md

warc-writing.md

tile.json