CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-prov

A library for the W3C Provenance Data Model (PROV), supporting PROV-JSON, PROV-XML and PROV-O (RDF)

Pending
Overview
Eval results
Files

docs/serialization.md

Serialization and Formats

Comprehensive serialization support for multiple PROV formats, including PROV-JSON, PROV-XML, PROV-O (RDF), and PROV-N, with automatic format detection and a pluggable serializer architecture.

Capabilities

Serializer Base Class

Abstract base class for all format-specific serializers.

class Serializer:
    """Common interface that every format-specific serializer implements."""

    def __init__(self, document=None):
        """
        Set up a serializer, optionally bound to a PROV document.

        Args:
            document (ProvDocument, optional): The document this serializer
                works on. Defaults to None.
        """

    def serialize(self, stream, **args):
        """
        Write the document to a stream (abstract; subclasses supply the logic).

        Args:
            stream (file-like): Destination stream that receives the
                serialized document
            **args: Extra keyword arguments understood by the concrete format
        """

    def deserialize(self, stream, **args):
        """
        Read a document from a stream (abstract; subclasses supply the logic).

        Args:
            stream (file-like): Source stream holding the serialized document
            **args: Extra keyword arguments understood by the concrete format

        Returns:
            ProvDocument: The document reconstructed from the stream
        """

Serializer Registry

Registry system for managing available serializers.

class Registry:
    """Registry of the serializer classes available for each format name."""

    # The annotation is quoted so it is not evaluated when the class is
    # created, and widened with ``| None`` because the attribute's initial
    # value is None rather than a dict.
    serializers: "dict[str, type[Serializer]] | None" = None
    """Dictionary mapping format names to serializer classes; initially None."""

    @staticmethod
    def load_serializers():
        """
        Load all available serializers into the registry.

        Registers serializers for:
        - 'json': PROV-JSON format
        - 'xml': PROV-XML format
        - 'rdf': PROV-O (RDF) format
        - 'provn': PROV-N format
        """

def get(format_name: str) -> type[Serializer]:
    """
    Look up the serializer class registered for a format.

    Args:
        format_name (str): One of the format names 'json', 'xml', 'rdf'
            or 'provn'

    Returns:
        type[Serializer]: The serializer class that handles the format

    Raises:
        DoNotExist: When the format has no serializer available
    """

class DoNotExist(Exception):
    """
    Raised to signal that no serializer is available for a requested format.
    """

Document Serialization Methods

ProvDocument provides high-level serialization methods.

class ProvDocument:
    """PROV document exposing high-level serialize/deserialize entry points."""

    def serialize(self, destination=None, format='json', **args):
        """
        Write this document out in one of the supported formats.

        Args:
            destination (str or file-like, optional): Where the output goes;
                defaults to None
            format (str): Target format: 'json' (the default), 'xml', 'rdf'
                or 'provn'
            **args: Extra keyword arguments specific to the chosen format

        Returns:
            str: The serialized content when no destination is given
        """

    @staticmethod
    def deserialize(source, format=None, **args):
        """
        Build a document from serialized input.

        Args:
            source (str or file-like): Where the serialized input comes from
            format (str, optional): Format of the input; auto-detected when
                left as None
            **args: Extra keyword arguments specific to the chosen format

        Returns:
            ProvDocument: The document reconstructed from the input
        """

Format-Specific Serializers

Individual serializer classes for each supported format.

class ProvJSONSerializer(Serializer):
    """
    Serializer handling the PROV-JSON format.

    In PROV-JSON, provenance is expressed as JSON objects, with arrays
    for the different record types and attributes.
    """

class ProvXMLSerializer(Serializer):
    """
    Serializer handling the PROV-XML format.

    In PROV-XML, provenance is expressed as XML documents that follow
    the W3C PROV-XML schema.

    Requirements:
        lxml>=3.3.5 (install with: pip install prov[xml])
    """

class ProvRDFSerializer(Serializer):
    """
    Serializer handling the PROV-O (RDF) format.

    In PROV-O, provenance is expressed as RDF triples built from the
    W3C PROV ontology vocabulary.

    Requirements:
        rdflib>=4.2.1,<7 (install with: pip install prov[rdf])
    """

class ProvNSerializer(Serializer):
    """
    Serializer for PROV-N format.
    
    PROV-N is the human-readable textual notation for PROV defined
    by the W3C specification.

    NOTE(review): the upstream prov package appears to implement PROV-N for
    output only (deserialize() is not implemented there) -- confirm before
    relying on this class to read PROV-N back in.
    """

Convenience Functions

High-level functions for easy serialization/deserialization.

def read(source, format=None):
    """
    Load a PROV document, detecting its format automatically.

    Args:
        source (str or PathLike or file-like): Where to read the document from
        format (str, optional): Hint naming the format to parse; defaults
            to None

    Returns:
        ProvDocument: The loaded document, or None

    Raises:
        TypeError: When the format cannot be detected and parsing fails
    """

Supported Formats

PROV-JSON

JSON representation of PROV documents with structured objects for each record type.

# Serialize to PROV-JSON
doc.serialize('output.json', format='json')
doc.serialize('output.json', format='json', indent=2)  # Pretty-printed

# Deserialize from PROV-JSON
doc = ProvDocument.deserialize('input.json', format='json')

PROV-XML

XML representation following the W3C PROV-XML schema.

# Serialize to PROV-XML (requires lxml)
doc.serialize('output.xml', format='xml')

# Deserialize from PROV-XML
doc = ProvDocument.deserialize('input.xml', format='xml')

PROV-O (RDF)

RDF representation using the W3C PROV ontology.

# Serialize to RDF (requires rdflib)
doc.serialize('output.rdf', format='rdf')
doc.serialize('output.ttl', format='rdf', rdf_format='turtle')
doc.serialize('output.n3', format='rdf', rdf_format='n3')

# Deserialize from RDF
doc = ProvDocument.deserialize('input.rdf', format='rdf')
doc = ProvDocument.deserialize('input.ttl', format='rdf', rdf_format='turtle')

PROV-N

Human-readable textual notation defined by W3C.

# Serialize to PROV-N
doc.serialize('output.provn', format='provn')

# Get PROV-N as string
provn_string = doc.get_provn()

# Deserialize from PROV-N
# NOTE: the upstream prov package implements PROV-N for output only; its
# PROV-N serializer raises NotImplementedError from deserialize(). Guard the
# call so the example does not crash (verify against your installed version).
try:
    doc = ProvDocument.deserialize('input.provn', format='provn')
except NotImplementedError:
    print("PROV-N deserialization is not supported; use 'json', 'xml' or 'rdf'")

Usage Examples

Basic Serialization

from prov.model import ProvDocument
import prov

# Create a document with some content
doc = ProvDocument()
doc.add_namespace('ex', 'http://example.org/')

entity = doc.entity('ex:entity1', {'prov:label': 'Example Entity'})
activity = doc.activity('ex:activity1')
doc.generation(entity, activity)

# Serialize to different formats
doc.serialize('output.json', format='json')
doc.serialize('output.xml', format='xml')
doc.serialize('output.rdf', format='rdf')
doc.serialize('output.provn', format='provn')

# Serialize to string
json_string = doc.serialize(format='json')
xml_string = doc.serialize(format='xml')

Reading Documents

# Read with automatic format detection
doc1 = prov.read('document.json')      # Auto-detects JSON
doc2 = prov.read('document.xml')       # Auto-detects XML
doc3 = prov.read('document.rdf')       # Auto-detects RDF

# Read with explicit format (useful when the extension says nothing about
# the content). Pass a format that can be deserialized: 'json', 'xml' or
# 'rdf'. The upstream prov package implements PROV-N for output only, so
# format='provn' cannot be used for reading (verify against your version).
doc4 = prov.read('document.txt', format='json')

# Read from file-like objects
with open('document.json', 'r') as f:
    doc5 = prov.read(f, format='json')

Advanced Serialization Options

# PROV-JSON with pretty printing
doc.serialize('pretty.json', format='json', indent=4)

# RDF with specific format
doc.serialize('output.ttl', format='rdf', rdf_format='turtle')
doc.serialize('output.nt', format='rdf', rdf_format='nt')

# Using serializer classes directly
from prov.serializers import get

json_serializer = get('json')(doc)
with open('output.json', 'w') as f:
    json_serializer.serialize(f, indent=2)

Format Detection and Error Handling

# Import everything this example uses so it runs on its own: prov for
# read(), plus get and DoNotExist from the serializer registry.
import prov
from prov.serializers import DoNotExist, get

try:
    # Attempt to read with format detection
    doc = prov.read('unknown_format.dat')
except TypeError as e:
    print(f"Format detection failed: {e}")
    # Try with explicit format
    doc = prov.read('unknown_format.dat', format='json')

try:
    # Attempt to get unavailable serializer
    serializer = get('unsupported_format')
except DoNotExist as e:
    print(f"Serializer not available: {e}")

Working with Streams

import io

# Serialize to string buffer
buffer = io.StringIO()
doc.serialize(buffer, format='json')
json_content = buffer.getvalue()

# Deserialize from string buffer
input_buffer = io.StringIO(json_content)
loaded_doc = ProvDocument.deserialize(input_buffer, format='json')

# Binary formats (for some RDF serializations)
binary_buffer = io.BytesIO()
doc.serialize(binary_buffer, format='rdf', rdf_format='xml')

Batch Processing

import os

# Serialize document to multiple formats
formats = ['json', 'xml', 'rdf', 'provn']
base_name = 'provenance'

for fmt in formats:
    filename = f"{base_name}.{fmt}"
    try:
        doc.serialize(filename, format=fmt)
        print(f"Saved {filename}")
    except Exception as e:
        print(f"Failed to save {filename}: {e}")

# Read and convert between formats
def convert_format(input_file, output_file, output_format):
    """Read a PROV document from input_file and write it to output_file as output_format."""
    # Format of the input is left to prov.read(); only the output format is explicit.
    prov.read(input_file).serialize(output_file, format=output_format)

# Convert JSON to XML
convert_format('input.json', 'output.xml', 'xml')

Handling Large Documents

# For large documents, serialize directly to file
with open('large_document.json', 'w') as f:
    doc.serialize(f, format='json')

# Summarise a large RDF document (note: deserialize() returns the complete document, so this is not streaming)
def process_large_rdf(filename):
    """Print how many entity and activity records an RDF-serialized PROV document holds."""
    # deserialize() hands back a complete ProvDocument; records are selected
    # from it afterwards, one record type at a time.
    doc = ProvDocument.deserialize(filename, format='rdf')

    model = prov.model
    entities = doc.get_records(model.ProvEntity)
    activities = doc.get_records(model.ProvActivity)

    print(f"Found {len(entities)} entities and {len(activities)} activities")

Custom Serialization Parameters

# JSON serialization options
doc.serialize('compact.json', format='json', separators=(',', ':'))
doc.serialize('readable.json', format='json', indent=4, sort_keys=True)

# RDF serialization with base URI
doc.serialize('output.rdf', format='rdf', 
              rdf_format='turtle', 
              base='http://example.org/')

# XML serialization with encoding
doc.serialize('output.xml', format='xml', encoding='utf-8')

Install with Tessl CLI

npx tessl i tessl/pypi-prov

docs

document-management.md

identifiers.md

index.md

prov-elements.md

relationships.md

serialization.md

visualization.md

tile.json