Read and write PDFs with Python, powered by qpdf
—
This section covers document metadata, XMP data, and PDF properties, including titles, authors, creation dates, and custom metadata fields. These capabilities enable comprehensive document information management and standards compliance.
Comprehensive XMP metadata management with PDF/A compliance and standards support.
class PdfMetadata:
    """
    Handler for the XMP metadata of a PDF document.

    Exposes document metadata following the XMP (Extensible Metadata Platform)
    standard, covering the Dublin Core, PDF, and custom metadata schemas.
    """
    # NOTE(review): verify these accessors against the installed pikepdf
    # release. Upstream documentation describes obtaining this object through
    # Pdf.open_metadata() and using mapping-style keys such as 'dc:title'
    # -- TODO confirm that the property accessors listed here exist.

    def __init__(self, pdf: Pdf, *, sync_docinfo: bool = True) -> None:
        """
        Build a metadata handler bound to one PDF document.

        Parameters:
        - pdf (Pdf): The document whose metadata is managed
        - sync_docinfo (bool): Keep the document info dictionary synchronized automatically

        Raises:
        DependencyError: When the required XMP libraries are not available
        """

    @property
    def pdfa_status(self) -> str:
        """
        The document's PDF/A compliance status.

        Returns:
        str: One of '1A', '1B', '2A', '2B', '2U', '3A', '3B', '3U', or an empty string when the document is not PDF/A
        """

    def load_from_docinfo(self, docinfo: Dictionary, *, delete_missing: bool = False) -> None:
        """
        Populate the metadata from a document info dictionary.

        Parameters:
        - docinfo (Dictionary): The document info dictionary to read
        - delete_missing (bool): Remove existing metadata that docinfo does not contain
        """

    def save_to_docinfo(self, docinfo: Dictionary) -> None:
        """
        Write the metadata into a document info dictionary.

        Parameters:
        - docinfo (Dictionary): The document info dictionary to update
        """

    @property
    def title(self) -> str:
        """
        The document's title.

        Returns:
        str: Title of the document
        """

    @title.setter
    def title(self, value: str) -> None:
        """Assign the document title."""

    @property
    def author(self) -> str:
        """
        The document's author.

        Returns:
        str: Author name or names
        """

    @author.setter
    def author(self, value: str) -> None:
        """Assign the document author."""

    @property
    def subject(self) -> str:
        """
        The document's subject or description.

        Returns:
        str: Subject description
        """

    @subject.setter
    def subject(self, value: str) -> None:
        """Assign the document subject."""

    @property
    def keywords(self) -> str:
        """
        The document's keywords.

        Returns:
        str: Keywords (typically comma-separated)
        """

    @keywords.setter
    def keywords(self, value: str) -> None:
        """Assign the document keywords."""

    @property
    def creator(self) -> str:
        """
        The application that created the original document.

        Returns:
        str: Name of the creating application
        """

    @creator.setter
    def creator(self, value: str) -> None:
        """Assign the document creator."""

    @property
    def producer(self) -> str:
        """
        The application that converted/produced the PDF.

        Returns:
        str: Name of the PDF producing application
        """

    @producer.setter
    def producer(self, value: str) -> None:
        """Assign the document producer."""

    @property
    def creation_date(self) -> str:
        """
        The document's creation date in ISO format.

        Returns:
        str: Creation date (ISO 8601 format)
        """

    @creation_date.setter
    def creation_date(self, value: str) -> None:
        """Assign the document creation date."""

    @property
    def modification_date(self) -> str:
        """
        The document's modification date in ISO format.

        Returns:
        str: Last modification date (ISO 8601 format)
        """

    @modification_date.setter
    def modification_date(self, value: str) -> None:
        """Assign the document modification date."""


# Direct access to PDF document information dictionary for legacy metadata.
# Accessed via pdf.docinfo property
class DocumentInfo(Dictionary):
    """
    The document information dictionary of a PDF.

    This is the legacy place for metadata, built into the PDF format itself.
    XMP metadata is preferred for modern documents; this dictionary is kept
    for compatibility.
    """
# Standard document info entries (accessed as dictionary keys):
# '/Title': Document title
# '/Author': Document author
# '/Subject': Document subject
# '/Keywords': Document keywords
# '/Creator': Creating application
# '/Producer': PDF producer application
# '/CreationDate': Creation date (PDF date format)
# '/ModDate': Modification date (PDF date format)
# '/Trapped': Trapping status (/True, /False, /Unknown)
# Specialized exceptions for metadata operations.
class DependencyError(Exception):
    """
    Signals that a library needed for metadata processing is not installed.

    XMP processing and date handling may rely on additional Python packages;
    this error is raised when one of them is missing.
    """


import pikepdf
from datetime import datetime

# Open an existing PDF. The context manager closes the file even if one of
# the statements below raises.
with pikepdf.open('document.pdf') as pdf:
    # Access document info dictionary (legacy metadata)
    docinfo = pdf.docinfo

    # Read existing metadata
    print("Current metadata:")
    print(f"Title: {docinfo.get('/Title', 'No title')}")
    print(f"Author: {docinfo.get('/Author', 'No author')}")
    print(f"Subject: {docinfo.get('/Subject', 'No subject')}")
    print(f"Keywords: {docinfo.get('/Keywords', 'No keywords')}")
    print(f"Creator: {docinfo.get('/Creator', 'No creator')}")
    print(f"Producer: {docinfo.get('/Producer', 'No producer')}")

    # Update metadata
    docinfo['/Title'] = pikepdf.String('Updated Document Title')
    docinfo['/Author'] = pikepdf.String('Jane Doe')
    docinfo['/Subject'] = pikepdf.String('Technical Documentation')
    docinfo['/Keywords'] = pikepdf.String('PDF, documentation, technical, guide')
    docinfo['/Creator'] = pikepdf.String('Python Script')

    # Set creation and modification dates.
    # PDF date strings have the form D:YYYYMMDDHHmmSS+HH'mm'. A naive datetime
    # formats %z as an empty string, so use an aware local time and reshape
    # its "+HHMM" offset into the apostrophe-delimited PDF form.
    now = datetime.now().astimezone()
    utc_offset = now.strftime("%z")
    current_date = (
        now.strftime("D:%Y%m%d%H%M%S")
        + f"{utc_offset[:3]}'{utc_offset[3:5]}'"
    )
    docinfo['/CreationDate'] = pikepdf.String(current_date)
    docinfo['/ModDate'] = pikepdf.String(current_date)

    pdf.save('updated_metadata.pdf')
import pikepdf
from datetime import datetime

# Open PDF and access XMP metadata. The context manager guarantees the file
# is closed even when an exception other than DependencyError escapes.
with pikepdf.open('document.pdf') as pdf:
    try:
        # Create XMP metadata handler
        metadata = pikepdf.PdfMetadata(pdf)

        print("XMP Metadata:")
        print(f"Title: {metadata.title}")
        print(f"Author: {metadata.author}")
        print(f"Subject: {metadata.subject}")
        print(f"Keywords: {metadata.keywords}")
        print(f"Creator: {metadata.creator}")
        print(f"Producer: {metadata.producer}")
        print(f"Creation Date: {metadata.creation_date}")
        print(f"Modification Date: {metadata.modification_date}")
        print(f"PDF/A Status: {metadata.pdfa_status}")

        # Update XMP metadata
        metadata.title = "Comprehensive PDF Guide"
        metadata.author = "Technical Writing Team"
        metadata.subject = "Complete guide to PDF operations using pikepdf"
        metadata.keywords = "PDF, Python, pikepdf, documentation, tutorial"
        metadata.creator = "Python Documentation Generator"

        # Set dates in ISO format. An aware local time is used so the
        # timestamp carries its UTC offset instead of being ambiguous.
        now = datetime.now().astimezone().isoformat()
        metadata.creation_date = now
        metadata.modification_date = now

        # Synchronize XMP with document info
        metadata.save_to_docinfo(pdf.docinfo)

        pdf.save('xmp_updated.pdf')
        print("XMP metadata updated successfully")
    except pikepdf.DependencyError:
        print("XMP processing libraries not available - using basic metadata only")

        # Fall back to basic document info
        docinfo = pdf.docinfo
        docinfo['/Title'] = pikepdf.String("Comprehensive PDF Guide")
        docinfo['/Author'] = pikepdf.String("Technical Writing Team")
        pdf.save('basic_metadata_updated.pdf')
import pikepdf
from datetime import datetime
def create_pdfa_compliant_document():
    """
    Create a one-page PDF carrying the metadata expected of a PDF/A file.

    Builds a new document with a single text-drawing content stream, fills in
    the XMP metadata, mirrors it into the document info dictionary, and saves
    the result to 'pdfa_compliant.pdf'. If the XMP libraries are unavailable a
    message is printed and nothing is saved.

    Metadata alone does not make a file PDF/A conformant; see the list of
    additional requirements inside the function body.
    """
    # The context manager closes the document even if an unexpected error
    # is raised part-way through.
    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()

        # Add minimal content.
        # NOTE: /F1 is referenced here but no font resource is attached to
        # the page in this example; a real document must define (and, for
        # PDF/A, embed) the font.
        content = b"\n".join([
            b"BT",
            b"/F1 12 Tf",
            b"100 700 Td",
            b"(PDF/A Compliant Document) Tj",
            b"ET",
        ])
        content_stream = pikepdf.Stream(pdf, content)
        page['/Contents'] = content_stream

        try:
            # Set up XMP metadata for PDF/A compliance
            metadata = pikepdf.PdfMetadata(pdf)

            # Required metadata for PDF/A
            metadata.title = "PDF/A Compliant Document"
            metadata.author = "Document Generator"
            metadata.subject = "Sample PDF/A document with complete metadata"
            metadata.keywords = "PDF/A, compliance, archival, standard"
            metadata.creator = "Python pikepdf library"
            metadata.producer = f"pikepdf {pikepdf.__version__}"

            # Set required dates (aware local time, so the offset is recorded)
            now = datetime.now().astimezone().isoformat()
            metadata.creation_date = now
            metadata.modification_date = now

            # Synchronize with document info
            metadata.save_to_docinfo(pdf.docinfo)

            # Additional PDF/A requirements would include:
            # - Embedded fonts
            # - Color profile
            # - Proper XMP packet
            # - No encryption
            # - No external dependencies
            pdf.save('pdfa_compliant.pdf')
            print("Created PDF/A compliant document with metadata")
            print(f"PDF/A Status: {metadata.pdfa_status}")
        except pikepdf.DependencyError:
            print("XMP libraries not available - cannot create full PDF/A compliance")


create_pdfa_compliant_document()
import pikepdf
from pathlib import Path
from datetime import datetime
def analyze_pdf_metadata(pdf_path):
    """
    Analyze metadata in a PDF file.

    Parameters:
    - pdf_path: Path object for the PDF (its stat() and str() are used)

    Returns:
    dict: On success, the keys 'file', 'file_size', 'pages', 'pdf_version',
    'is_encrypted', 'docinfo' and 'has_xmp', plus either 'xmp' (when XMP could
    be read) or 'xmp_error'. If the file cannot be processed at all, a dict
    with only 'file' and 'error'.
    """
    # Result key -> document info dictionary key
    docinfo_keys = (
        ('title', '/Title'),
        ('author', '/Author'),
        ('subject', '/Subject'),
        ('keywords', '/Keywords'),
        ('creator', '/Creator'),
        ('producer', '/Producer'),
        ('creation_date', '/CreationDate'),
        ('modification_date', '/ModDate'),
        ('trapped', '/Trapped'),
    )
    # Attributes copied verbatim from the XMP metadata handler
    xmp_attrs = (
        'title', 'author', 'subject', 'keywords', 'creator', 'producer',
        'creation_date', 'modification_date', 'pdfa_status',
    )

    try:
        # The with-block closes the PDF even when a later step raises; the
        # earlier explicit close() was skipped on any error after open().
        with pikepdf.open(pdf_path) as pdf:
            analysis = {
                'file': str(pdf_path),
                'file_size': pdf_path.stat().st_size,
                'pages': len(pdf.pages),
                'pdf_version': pdf.pdf_version,
                'is_encrypted': pdf.is_encrypted,
            }

            # Document info metadata
            docinfo = pdf.docinfo
            analysis['docinfo'] = {
                name: str(docinfo.get(pdf_key, ''))
                for name, pdf_key in docinfo_keys
            }

            # Try XMP metadata
            try:
                metadata = pikepdf.PdfMetadata(pdf)
                analysis['xmp'] = {
                    attr: getattr(metadata, attr) for attr in xmp_attrs
                }
                analysis['has_xmp'] = True
            except pikepdf.DependencyError:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = "XMP libraries not available"
            except Exception as e:
                # Best effort: unreadable XMP must not discard the rest of
                # the analysis, so record the reason and carry on.
                analysis['has_xmp'] = False
                analysis['xmp_error'] = str(e)

            return analysis
    except Exception as e:
        # Per-file boundary: report the failure as data so that a batch
        # report can continue with the remaining files.
        return {'file': str(pdf_path), 'error': str(e)}
def metadata_report(directory_path):
    """
    Generate a comprehensive metadata report for PDFs in a directory.

    Prints a header followed by one section per '*.pdf' file found directly
    in the directory. Files that could not be analyzed are listed with their
    error message.

    Parameters:
    - directory_path: Directory to scan (anything accepted by Path)
    """
    directory = Path(directory_path)
    # glob() yields entries in arbitrary order; sort so the report is
    # reproducible from run to run.
    pdf_files = sorted(directory.glob('*.pdf'))

    # (analysis key, printed label) pairs for the document info section
    docinfo_labels = (
        ('title', 'Title'),
        ('author', 'Author'),
        ('creator', 'Creator'),
        ('producer', 'Producer'),
        ('creation_date', 'Created'),
        ('modification_date', 'Modified'),
    )

    print(f"PDF Metadata Report for: {directory}")
    print("=" * 80)

    for pdf_file in pdf_files:
        analysis = analyze_pdf_metadata(pdf_file)

        if 'error' in analysis:
            print(f"\n❌ {pdf_file.name}: {analysis['error']}")
            continue

        print(f"\n📄 {pdf_file.name}")
        print(f" Size: {analysis['file_size']:,} bytes, "
              f"Pages: {analysis['pages']}, "
              f"Version: {analysis['pdf_version']}")
        if analysis['is_encrypted']:
            print(" 🔒 ENCRYPTED")

        # Document Info metadata
        docinfo = analysis['docinfo']
        if any(docinfo.values()):
            print(" Document Info:")
            for key, label in docinfo_labels:
                if docinfo[key]:
                    print(f" {label}: {docinfo[key]}")
        else:
            print(" 📋 No Document Info metadata")

        # XMP metadata
        if analysis['has_xmp']:
            xmp = analysis['xmp']
            if any([xmp['title'], xmp['author'], xmp['subject']]):
                print(" XMP Metadata:")
                if xmp['title']:
                    print(f" Title: {xmp['title']}")
                if xmp['author']:
                    print(f" Author: {xmp['author']}")
                if xmp['subject']:
                    print(f" Subject: {xmp['subject']}")
                if xmp['pdfa_status']:
                    print(f" PDF/A: {xmp['pdfa_status']}")
            else:
                print(" 📋 XMP present but minimal")
        elif 'xmp_error' in analysis:
            print(f" ⚠️ XMP: {analysis['xmp_error']}")
# Generate metadata report
# metadata_report('.')
import pikepdf
from pathlib import Path
from datetime import datetime
def standardize_metadata(directory_path, template_metadata):
    """
    Standardize metadata across multiple PDF files.

    Every '*.pdf' directly inside the directory is rewritten in place with
    the author, creator and producer taken from the template (falsy or
    missing template entries are left untouched), a fresh modification date,
    and a title derived from the file name when none is set.

    Parameters:
    - directory_path: Directory to scan (anything accepted by Path)
    - template_metadata (dict): May contain 'author', 'creator', 'producer'

    Returns:
    dict: 'updated' (list of paths), 'failed' (list of (path, error message))
    and 'skipped' (list of (path, reason))
    """
    directory = Path(directory_path)
    pdf_files = list(directory.glob('*.pdf'))
    results = {'updated': [], 'failed': [], 'skipped': []}

    # (template key / XMP attribute, document info key)
    template_fields = (
        ('author', '/Author'),
        ('creator', '/Creator'),
        ('producer', '/Producer'),
    )

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input is required for the argument-less
            # save() below to write back to the file that was opened. The
            # with-block closes the PDF on every path, including errors.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                # Skip encrypted files
                if pdf.is_encrypted:
                    results['skipped'].append((str(pdf_file), "Encrypted"))
                    continue

                # Update document info
                docinfo = pdf.docinfo

                # Apply template metadata
                for key, pdf_key in template_fields:
                    if template_metadata.get(key):
                        docinfo[pdf_key] = pikepdf.String(template_metadata[key])

                # Update modification date. PDF dates look like
                # D:YYYYMMDDHHmmSS+HH'mm'; a naive datetime renders %z as an
                # empty string, so take an aware local time and reshape its
                # "+HHMM" offset.
                now = datetime.now().astimezone()
                utc_offset = now.strftime("%z")
                current_date = (
                    now.strftime("D:%Y%m%d%H%M%S")
                    + f"{utc_offset[:3]}'{utc_offset[3:5]}'"
                )
                docinfo['/ModDate'] = pikepdf.String(current_date)

                # Preserve existing title if present, otherwise use filename
                if not docinfo.get('/Title'):
                    title = pdf_file.stem.replace('_', ' ').replace('-', ' ').title()
                    docinfo['/Title'] = pikepdf.String(title)

                # Try XMP update if available
                try:
                    metadata = pikepdf.PdfMetadata(pdf)
                    for key, _pdf_key in template_fields:
                        if template_metadata.get(key):
                            setattr(metadata, key, template_metadata[key])
                    metadata.modification_date = now.isoformat()
                    metadata.save_to_docinfo(docinfo)
                except pikepdf.DependencyError:
                    pass  # XMP not available, document info is sufficient

                # Save changes
                pdf.save()

            results['updated'].append(str(pdf_file))
        except Exception as e:
            # Per-file boundary: one bad file must not abort the batch.
            results['failed'].append((str(pdf_file), str(e)))

    print("Metadata standardization complete:")
    print(f" Updated: {len(results['updated'])} files")
    print(f" Failed: {len(results['failed'])} files")
    print(f" Skipped: {len(results['skipped'])} files")
    return results
# Template values that standardize_metadata applies to each PDF
template = dict(
    author='Corporate Documentation Team',
    creator='Document Management System',
    producer=f'pikepdf {pikepdf.__version__}',
)
# results = standardize_metadata('.', template)
import pikepdf
def add_custom_metadata(pdf_path, custom_fields):
    """
    Add custom metadata fields to a PDF, rewriting the file in place.

    Parameters:
    - pdf_path: Path of the PDF to modify
    - custom_fields (dict): Field name -> value; values are stored as strings
    """
    # allow_overwriting_input is required for the argument-less save() below
    # to write back to the opened file. The with-block closes the PDF even
    # if an error occurs.
    with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
        docinfo = pdf.docinfo

        # Add custom fields to document info
        for field_name, field_value in custom_fields.items():
            # Custom fields should use proper PDF name format. Accept names
            # that already carry the leading slash rather than doubling it.
            name = str(field_name)
            pdf_field_name = name if name.startswith('/') else f'/{name}'
            docinfo[pdf_field_name] = pikepdf.String(str(field_value))

        # Also try to add to XMP if available
        try:
            metadata = pikepdf.PdfMetadata(pdf)
            # Custom XMP properties would require namespace registration
            # For basic use, document info is sufficient
            metadata.save_to_docinfo(docinfo)
        except pikepdf.DependencyError:
            pass

        pdf.save()

    print(f"Added custom metadata to {pdf_path}")
# Example custom fields to attach to a document
custom_metadata = dict(
    Department='Engineering',
    Project='API Documentation',
    Version='2.1.0',
    Status='Final',
    ReviewedBy='Technical Lead',
    ApprovalDate='2024-09-10',
    DocumentID='DOC-2024-001',
    SecurityClass='Internal',
)
# add_custom_metadata('document.pdf', custom_metadata)
def extract_custom_metadata(pdf_path):
    """
    Extract and display all metadata including custom fields.

    Prints the standard document info entries that are present, followed by
    every other entry of the document info dictionary.

    Parameters:
    - pdf_path: Path of the PDF to inspect
    """
    # Standard fields, in the order they are printed
    standard_fields = ['/Title', '/Author', '/Subject', '/Keywords',
                       '/Creator', '/Producer', '/CreationDate', '/ModDate', '/Trapped']

    # The with-block closes the PDF even if reading an entry raises.
    with pikepdf.open(pdf_path) as pdf:
        docinfo = pdf.docinfo

        print(f"All metadata for: {pdf_path}")
        print("=" * 50)

        print("Standard Fields:")
        for field in standard_fields:
            if field in docinfo:
                print(f" {field[1:]}: {docinfo[field]}")

        # Custom fields (anything not in standard list), sorted so the
        # listing does not depend on the key iteration order.
        custom_fields = sorted(
            key for key in docinfo.keys() if key not in standard_fields
        )
        if custom_fields:
            print("\nCustom Fields:")
            for field in custom_fields:
                print(f" {field[1:]}: {docinfo[field]}")
        else:
            print("\nNo custom fields found")
# Extract all metadata including custom fields
# extract_custom_metadata('document.pdf')
Install with Tessl CLI
npx tessl i tessl/pypi-pikepdf