Python implementation of the JSON-LD API for processing Linked Data in JSON format
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
RFC 8785 compliant JSON canonicalization for consistent JSON serialization, hashing, and digital signatures. The c14n module provides deterministic JSON string representation.
Produces canonical JSON string representation according to RFC 8785 standards.
def canonicalize(obj, utf8=True):
"""
Canonicalizes a JSON object according to RFC 8785.
Produces a deterministic string representation of JSON data by:
- Sorting object keys lexicographically
- Using minimal whitespace (no extra spaces)
- Consistent number formatting
- Proper Unicode escape sequences
Args:
obj: The JSON-serializable object to canonicalize (dict, list, str,
int, float, bool, None)
utf8 (bool): If True, return bytes encoded as UTF-8; if False,
return Unicode string (default: True)
Returns:
bytes or str: Canonical JSON representation (bytes if utf8=True,
str if utf8=False)
Raises:
TypeError: If obj contains non-JSON-serializable types
ValueError: If obj contains circular references
"""from c14n import canonicalize
import json
# Basic canonicalization
data = {"name": "Alice", "age": 30, "city": "New York"}
canonical_bytes = canonicalize(data)
print(canonical_bytes) # b'{"age":30,"city":"New York","name":"Alice"}'
# Get string instead of bytes
canonical_str = canonicalize(data, utf8=False)
print(canonical_str) # '{"age":30,"city":"New York","name":"Alice"}'
# Complex nested structure
complex_data = {
"users": [
{"id": 2, "name": "Bob"},
{"id": 1, "name": "Alice"}
],
"metadata": {
"version": "1.0",
"created": "2023-01-01"
}
}
canonical = canonicalize(complex_data, utf8=False)
print(canonical)
# Output: {"metadata":{"created":"2023-01-01","version":"1.0"},"users":[{"id":2,"name":"Bob"},{"id":1,"name":"Alice"}]}

Alternative serialization function without key sorting (non-canonical).
def serialize(obj, utf8=True):
"""
Serializes JSON object without canonicalization (preserves key order).
Args:
obj: The JSON-serializable object to serialize
utf8 (bool): If True, return bytes encoded as UTF-8; if False,
return Unicode string (default: True)
Returns:
bytes or str: JSON representation without key reordering
Raises:
TypeError: If obj contains non-JSON-serializable types
ValueError: If obj contains circular references
"""from c14n import serialize
data = {"name": "Alice", "age": 30, "city": "New York"}
# Serialize preserving original key order
serialized = serialize(data, utf8=False)
print(serialized) # '{"name":"Alice","age":30,"city":"New York"}'
# Compare with canonicalization (keys sorted)
canonical = canonicalize(data, utf8=False)
print(canonical) # '{"age":30,"city":"New York","name":"Alice"}'

Object keys are sorted lexicographically using Unicode code points:
data = {
"zebra": 1,
"apple": 2,
"banana": 3,
"Apple": 4 # Capital A comes before lowercase a
}
canonical = canonicalize(data, utf8=False)
# Result: {"Apple":4,"apple":2,"banana":3,"zebra":1}

Numbers are formatted in their minimal representation:
numbers = {
"integer": 42,
"float": 3.14159,
"zero": 0,
"negative": -123,
"scientific": 1.23e-4
}
canonical = canonicalize(numbers, utf8=False)
# Numbers formatted without unnecessary precision or notation

Strings are properly escaped with minimal escape sequences:
strings = {
"quote": 'He said "Hello"',
"newline": "Line 1\nLine 2",
"unicode": "café",
"control": "tab\there"
}
canonical = canonicalize(strings, utf8=False)
# Proper JSON string escaping applied

Array element order is preserved (not sorted):
data = {
"numbers": [3, 1, 4, 1, 5],
"mixed": ["zebra", "apple", "banana"]
}
canonical = canonicalize(data, utf8=False)
# Array order maintained: {"mixed":["zebra","apple","banana"],"numbers":[3,1,4,1,5]}

from c14n import canonicalize
import hashlib
import hmac
def sign_json(data, secret_key):
    """Return a hex-encoded HMAC-SHA256 tag computed over the canonical form of ``data``.

    Args:
        data: JSON-serializable object to sign.
        secret_key (bytes): Key used for the HMAC.

    Returns:
        str: Hexadecimal HMAC-SHA256 digest of the canonicalized data.
    """
    # canonicalize() is called with its default utf8=True, so the HMAC is
    # fed the canonical form as bytes.
    mac = hmac.new(secret_key, canonicalize(data), hashlib.sha256)
    return mac.hexdigest()
def verify_json(data, signature, secret_key):
    """Check ``signature`` against a freshly computed HMAC-SHA256 of ``data``.

    Args:
        data: JSON-serializable object whose signature is being checked.
        signature (str): Hexadecimal digest to verify.
        secret_key (bytes): Key used for the HMAC.

    Returns:
        bool: True when ``signature`` matches the recomputed digest.
    """
    recomputed = hmac.new(secret_key, canonicalize(data), hashlib.sha256)
    # compare_digest is used instead of == so the comparison is resistant
    # to timing analysis.
    return hmac.compare_digest(signature, recomputed.hexdigest())
# Example usage
document = {"user": "alice", "action": "login", "timestamp": "2023-01-01T12:00:00Z"}
secret = b"my-secret-key"
signature = sign_json(document, secret)
is_valid = verify_json(document, signature, secret)

import hashlib
from c14n import canonicalize
def hash_json(data):
    """Return a deterministic SHA-256 hex digest of ``data``.

    The object is canonicalized first, so the digest depends on the
    canonical form rather than on the key order of the input dicts.

    Args:
        data: JSON-serializable object to hash.

    Returns:
        str: Hexadecimal SHA-256 digest of the canonical form.
    """
    hasher = hashlib.sha256()
    hasher.update(canonicalize(data))
    return hasher.hexdigest()
# Same data in different orders produces same hash
data1 = {"name": "Alice", "age": 30}
data2 = {"age": 30, "name": "Alice"}
hash1 = hash_json(data1)
hash2 = hash_json(data2)
print(hash1 == hash2) # True - same canonical representation

from c14n import canonicalize
def deduplicate_json(json_objects):
    """Drop objects whose canonical form has already been seen.

    Args:
        json_objects: Iterable of JSON-serializable objects.

    Returns:
        list: The first occurrence of each distinct canonical form, in the
        order the objects were encountered.
    """
    known_forms = set()
    kept = []
    for candidate in json_objects:
        form = canonicalize(candidate)
        if form in known_forms:
            # An equivalent object was already kept; skip this one.
            continue
        known_forms.add(form)
        kept.append(candidate)
    return kept
# Example with duplicate data in different order
objects = [
{"name": "Alice", "age": 30},
{"age": 30, "name": "Alice"}, # Duplicate in different order
{"name": "Bob", "age": 25}
]
unique = deduplicate_json(objects)
# Returns: [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}]

from pyld import jsonld
from c14n import canonicalize
def canonical_json_ld_hash(doc):
"""Create hash of JSON-LD document after normalization and canonicalization."""
# First normalize with JSON-LD
normalized = jsonld.normalize(doc, {
'algorithm': 'URDNA2015',
'format': 'application/n-quads'
})
# Then canonicalize the normalized form
canonical = canonicalize(normalized)
return hashlib.sha256(canonical).hexdigest()The canonicalization follows RFC 8785 specifications:
# For large objects, canonicalization creates string representation in memory
large_data = {"items": list(range(100000))}
canonical = canonicalize(large_data) # Creates large string in memory

from functools import lru_cache
@lru_cache(maxsize=1000)
def cached_canonicalize(data_str):
    """Return the canonical form of a JSON document supplied as text.

    The raw JSON string is the cache key, so the input must be passed as a
    (hashable) string rather than as a parsed object. Up to 1000 results
    are retained.

    Args:
        data_str (str): JSON text to parse and canonicalize.

    Returns:
        str: Canonical JSON representation (``utf8=False`` is passed, so a
        string rather than bytes is returned).
    """
    import json

    return canonicalize(json.loads(data_str), utf8=False)
# Use with JSON string input for caching
data_json = '{"name": "Alice", "age": 30}'
canonical = cached_canonicalize(data_json)

Canonicalization functions may raise standard Python JSON errors:
from c14n import canonicalize
import json
try:
# This will fail - functions aren't JSON serializable
invalid_data = {"func": lambda x: x}
canonical = canonicalize(invalid_data)
except TypeError as e:
print(f"Serialization error: {e}")
try:
# This will fail - circular reference
circular = {}
circular["self"] = circular
canonical = canonicalize(circular)
except ValueError as e:
print(f"Circular reference error: {e}")

The c14n module is used internally by PyLD for JSON-LD processing:
# PyLD uses canonicalization in normalization algorithms
from pyld import jsonld
doc = {"@context": {...}, "@id": "example:1", "name": "Test"}
normalized = jsonld.normalize(doc, {
'algorithm': 'URDNA2015',
'format': 'application/n-quads'
})
# Internally uses canonicalization for consistent RDF representation

Install with Tessl CLI
npx tessl i tessl/pypi-pyld