A library for W3C Provenance Data Model supporting PROV-JSON, PROV-XML and PROV-O (RDF)
—
Comprehensive serialization support for multiple PROV formats including PROV-JSON, PROV-XML, PROV-O (RDF), and PROV-N, with automatic format detection and pluggable serializer architecture.
Abstract base class for all format-specific serializers.
class Serializer:
def __init__(self, document=None):
"""
Create a serializer for PROV documents.
Args:
document (ProvDocument, optional): Document to serialize
"""
def serialize(self, stream, **args):
"""
Abstract method for serializing a document.
Args:
stream (file-like): Stream object to serialize the document into
**args: Format-specific serialization arguments
"""
def deserialize(self, stream, **args):
"""
Abstract method for deserializing a document.
Args:
stream (file-like): Stream object to deserialize the document from
**args: Format-specific deserialization arguments
Returns:
ProvDocument: Deserialized document
"""

Registry system for managing available serializers.
class Registry:
serializers: dict[str, type[Serializer]] = None
"""Dictionary mapping format names to serializer classes."""
@staticmethod
def load_serializers():
"""
Load all available serializers into the registry.
Registers serializers for:
- 'json': PROV-JSON format
- 'xml': PROV-XML format
- 'rdf': PROV-O (RDF) format
- 'provn': PROV-N format
"""
def get(format_name: str) -> type[Serializer]:
"""
Get the serializer class for the specified format.
Args:
format_name (str): Format name ('json', 'xml', 'rdf', 'provn')
Returns:
type[Serializer]: Serializer class for the format
Raises:
DoNotExist: If no serializer is available for the format
"""
class DoNotExist(Exception):
"""Exception raised when a serializer is not available for a format."""

ProvDocument provides high-level serialization methods.
class ProvDocument:
def serialize(self, destination=None, format='json', **args):
"""
Serialize this document to various formats.
Args:
destination (str or file-like, optional): Output destination
format (str): Output format ('json', 'xml', 'rdf', 'provn')
**args: Format-specific arguments
Returns:
str: Serialized content if no destination specified
"""
@staticmethod
def deserialize(source, format=None, **args):
"""
Deserialize a document from various formats.
Args:
source (str or file-like): Input source
format (str, optional): Input format, auto-detected if None
**args: Format-specific arguments
Returns:
ProvDocument: Deserialized document
"""

Individual serializer classes for each supported format.
class ProvJSONSerializer(Serializer):
"""
Serializer for PROV-JSON format.
PROV-JSON represents provenance as JSON objects with arrays for
different record types and attributes.
"""
class ProvXMLSerializer(Serializer):
"""
Serializer for PROV-XML format.
PROV-XML represents provenance as XML documents following the
W3C PROV-XML schema.
Requirements:
lxml>=3.3.5 (install with: pip install prov[xml])
"""
class ProvRDFSerializer(Serializer):
"""
Serializer for PROV-O (RDF) format.
PROV-O represents provenance as RDF triples using the W3C PROV
ontology vocabulary.
Requirements:
rdflib>=4.2.1,<7 (install with: pip install prov[rdf])
"""
class ProvNSerializer(Serializer):
"""
Serializer for PROV-N format.
PROV-N is the human-readable textual notation for PROV defined
by the W3C specification.
"""

High-level functions for easy serialization/deserialization.
def read(source, format=None):
"""
Convenience function for reading PROV documents with automatic format detection.
Args:
source (str or PathLike or file-like): Source to read from
format (str, optional): Format hint for parsing
Returns:
ProvDocument: Loaded document or None
Raises:
TypeError: If format cannot be detected and parsing fails
"""

JSON representation of PROV documents with structured objects for each record type.
# Serialize to PROV-JSON
doc.serialize('output.json', format='json')
doc.serialize('output.json', format='json', indent=2) # Pretty-printed
# Deserialize from PROV-JSON
doc = ProvDocument.deserialize('input.json', format='json')

XML representation following the W3C PROV-XML schema.
# Serialize to PROV-XML (requires lxml)
doc.serialize('output.xml', format='xml')
# Deserialize from PROV-XML
doc = ProvDocument.deserialize('input.xml', format='xml')

RDF representation using the W3C PROV ontology.
# Serialize to RDF (requires rdflib)
doc.serialize('output.rdf', format='rdf')
doc.serialize('output.ttl', format='rdf', rdf_format='turtle')
doc.serialize('output.n3', format='rdf', rdf_format='n3')
# Deserialize from RDF
doc = ProvDocument.deserialize('input.rdf', format='rdf')
doc = ProvDocument.deserialize('input.ttl', format='rdf', rdf_format='turtle')

Human-readable textual notation defined by W3C.
# Serialize to PROV-N
doc.serialize('output.provn', format='provn')
# Get PROV-N as string
provn_string = doc.get_provn()
# Deserialize from PROV-N
doc = ProvDocument.deserialize('input.provn', format='provn')

from prov.model import ProvDocument
import prov
# Create a document with some content
doc = ProvDocument()
doc.add_namespace('ex', 'http://example.org/')
entity = doc.entity('ex:entity1', {'prov:label': 'Example Entity'})
activity = doc.activity('ex:activity1')
doc.generation(entity, activity)
# Serialize to different formats
doc.serialize('output.json', format='json')
doc.serialize('output.xml', format='xml')
doc.serialize('output.rdf', format='rdf')
doc.serialize('output.provn', format='provn')
# Serialize to string
json_string = doc.serialize(format='json')
xml_string = doc.serialize(format='xml')

# Read with automatic format detection
doc1 = prov.read('document.json') # Auto-detects JSON
doc2 = prov.read('document.xml') # Auto-detects XML
doc3 = prov.read('document.rdf') # Auto-detects RDF
# Read with explicit format
doc4 = prov.read('document.txt', format='provn')
# Read from file-like objects
with open('document.json', 'r') as f:
doc5 = prov.read(f, format='json')

# PROV-JSON with pretty printing
doc.serialize('pretty.json', format='json', indent=4)
# RDF with specific format
doc.serialize('output.ttl', format='rdf', rdf_format='turtle')
doc.serialize('output.nt', format='rdf', rdf_format='nt')
# Using serializer classes directly
from prov.serializers import get
json_serializer = get('json')(doc)
with open('output.json', 'w') as f:
json_serializer.serialize(f, indent=2)

from prov.serializers import DoNotExist
try:
# Attempt to read with format detection
doc = prov.read('unknown_format.dat')
except TypeError as e:
print(f"Format detection failed: {e}")
# Try with explicit format
doc = prov.read('unknown_format.dat', format='json')
try:
# Attempt to get unavailable serializer
serializer = get('unsupported_format')
except DoNotExist as e:
print(f"Serializer not available: {e}")

import io
# Serialize to string buffer
buffer = io.StringIO()
doc.serialize(buffer, format='json')
json_content = buffer.getvalue()
# Deserialize from string buffer
input_buffer = io.StringIO(json_content)
loaded_doc = ProvDocument.deserialize(input_buffer, format='json')
# Binary formats (for some RDF serializations)
binary_buffer = io.BytesIO()
doc.serialize(binary_buffer, format='rdf', rdf_format='xml')

import os
# Serialize document to multiple formats
formats = ['json', 'xml', 'rdf', 'provn']
base_name = 'provenance'
for fmt in formats:
filename = f"{base_name}.{fmt}"
try:
doc.serialize(filename, format=fmt)
print(f"Saved {filename}")
except Exception as e:
print(f"Failed to save {filename}: {e}")
# Read and convert between formats
def convert_format(input_file, output_file, output_format):
"""Convert PROV document between formats."""
doc = prov.read(input_file)
doc.serialize(output_file, format=output_format)
# Convert JSON to XML
convert_format('input.json', 'output.xml', 'xml')

# For large documents, serialize directly to file
with open('large_document.json', 'w') as f:
doc.serialize(f, format='json')
# Stream processing for large RDF documents
def process_large_rdf(filename):
"""Process large RDF document efficiently."""
doc = ProvDocument.deserialize(filename, format='rdf')
# Process in chunks or specific record types
entities = doc.get_records(prov.model.ProvEntity)
activities = doc.get_records(prov.model.ProvActivity)
print(f"Found {len(entities)} entities and {len(activities)} activities")

# JSON serialization options
doc.serialize('compact.json', format='json', separators=(',', ':'))
doc.serialize('readable.json', format='json', indent=4, sort_keys=True)
# RDF serialization with base URI
doc.serialize('output.rdf', format='rdf',
rdf_format='turtle',
base='http://example.org/')
# XML serialization with encoding
doc.serialize('output.xml', format='xml', encoding='utf-8')

Install with Tessl CLI
npx tessl i tessl/pypi-prov