tessl/pypi-pikepdf

Read and write PDFs with Python, powered by qpdf

—

Pending

Overview

Eval results

Files

Metadata and Document Properties

Name: tessl/pypi-pikepdf
Author: tessl

This section covers document metadata, XMP data, and PDF properties, including titles, authors, creation dates, and custom metadata fields. These capabilities enable comprehensive document information management and standards compliance.

Capabilities

PdfMetadata Class

Comprehensive XMP metadata management with PDF/A compliance and standards support.

class PdfMetadata:
    """
    XMP metadata handler for PDF documents.

    Provides access to document metadata following the XMP (Extensible Metadata Platform)
    standard, with support for Dublin Core, PDF, and custom metadata schemas.

    This listing is an interface summary: every member below consists of a
    signature and a docstring only.

    NOTE(review): upstream pikepdf documentation describes obtaining XMP
    metadata through ``Pdf.open_metadata()`` (used as a context manager) and
    addressing fields by XMP key such as ``dc:title``. Confirm that the
    direct constructor and the property-style accessors listed here match
    the pikepdf version in use.
    """

    def __init__(self, pdf: Pdf, *, sync_docinfo: bool = True) -> None:
        """
        Create a metadata handler for a PDF document.

        Parameters:
        - pdf (Pdf): PDF document to manage metadata for
        - sync_docinfo (bool): Automatically synchronize with document info
          dictionary (keyword-only, defaults to True)

        Raises:
        DependencyError: If required XMP libraries are not available
        """

    @property
    def pdfa_status(self) -> str:
        """
        PDF/A compliance status of the document.

        This property is read-only in this listing (no setter is declared).

        Returns:
        str: PDF/A status ('1A', '1B', '2A', '2B', '2U', '3A', '3B', '3U', or empty if not PDF/A)
        """

    def load_from_docinfo(self, docinfo: Dictionary, *, delete_missing: bool = False) -> None:
        """
        Load metadata from a document info dictionary.

        Parameters:
        - docinfo (Dictionary): Document info dictionary to load from
        - delete_missing (bool): Delete existing metadata not found in docinfo
          (keyword-only, defaults to False)
        """

    def save_to_docinfo(self, docinfo: Dictionary) -> None:
        """
        Save metadata to a document info dictionary.

        Parameters:
        - docinfo (Dictionary): Document info dictionary to update
        """

    @property
    def title(self) -> str:
        """
        Document title.

        Returns:
        str: Title of the document
        """

    @title.setter
    def title(self, value: str) -> None:
        """
        Set document title.

        Parameters:
        - value (str): New title of the document
        """

    @property
    def author(self) -> str:
        """
        Document author.

        Returns:
        str: Author name or names
        """

    @author.setter
    def author(self, value: str) -> None:
        """
        Set document author.

        Parameters:
        - value (str): Author name or names
        """

    @property
    def subject(self) -> str:
        """
        Document subject or description.

        Returns:
        str: Subject description
        """

    @subject.setter
    def subject(self, value: str) -> None:
        """
        Set document subject.

        Parameters:
        - value (str): Subject description
        """

    @property
    def keywords(self) -> str:
        """
        Document keywords.

        Returns:
        str: Keywords (typically comma-separated)
        """

    @keywords.setter
    def keywords(self, value: str) -> None:
        """
        Set document keywords.

        Parameters:
        - value (str): Keywords (typically comma-separated)
        """

    @property
    def creator(self) -> str:
        """
        Application that created the original document.

        Returns:
        str: Name of creating application
        """

    @creator.setter
    def creator(self, value: str) -> None:
        """
        Set document creator.

        Parameters:
        - value (str): Name of creating application
        """

    @property
    def producer(self) -> str:
        """
        Application that converted/produced the PDF.

        Returns:
        str: Name of PDF producing application
        """

    @producer.setter
    def producer(self, value: str) -> None:
        """
        Set document producer.

        Parameters:
        - value (str): Name of PDF producing application
        """

    @property
    def creation_date(self) -> str:
        """
        Document creation date in ISO format.

        Returns:
        str: Creation date (ISO 8601 format)
        """

    @creation_date.setter
    def creation_date(self, value: str) -> None:
        """
        Set document creation date.

        Parameters:
        - value (str): Creation date as an ISO 8601 string
        """

    @property
    def modification_date(self) -> str:
        """
        Document modification date in ISO format.

        Returns:
        str: Last modification date (ISO 8601 format)
        """

    @modification_date.setter
    def modification_date(self, value: str) -> None:
        """
        Set document modification date.

        Parameters:
        - value (str): Last modification date as an ISO 8601 string
        """

Document Info Dictionary Access

Direct access to PDF document information dictionary for legacy metadata.

# Accessed via pdf.docinfo property
class DocumentInfo(Dictionary):
    """
    PDF document information dictionary.

    Legacy metadata storage using PDF's built-in document info dictionary.
    Modern documents should use XMP metadata, but this provides compatibility.

    Entries are read and written with dictionary syntax, using the key names
    listed below including their leading slash.

    NOTE(review): upstream pikepdf documents ``Pdf.docinfo`` as returning a
    plain ``Dictionary``; confirm whether a distinct ``DocumentInfo`` class is
    actually exported before relying on this name.
    """

    # Standard document info entries (accessed as dictionary keys):
    # '/Title': Document title
    # '/Author': Document author
    # '/Subject': Document subject
    # '/Keywords': Document keywords
    # '/Creator': Creating application
    # '/Producer': PDF producer application
    # '/CreationDate': Creation date (PDF date format)
    # '/ModDate': Modification date (PDF date format)
    # '/Trapped': Trapping status (/True, /False, /Unknown)

Metadata Exceptions

Specialized exceptions for metadata operations.

class DependencyError(Exception):
    """
    Raised when required metadata processing libraries are missing.

    Metadata operations may require additional Python packages
    for XMP processing and date handling.

    NOTE(review): upstream pikepdf describes this exception in terms of a
    third-party dependency needed to extract certain streams; confirm that
    metadata operations actually raise it in the pikepdf version in use.
    """

Usage Examples

Basic Metadata Operations

import pikepdf
from datetime import datetime

# Open the PDF as a context manager so the file is closed on every exit path,
# including when reading or writing the metadata raises.
with pikepdf.open('document.pdf') as pdf:
    # Access document info dictionary (legacy metadata)
    docinfo = pdf.docinfo

    # Read existing metadata, falling back to a placeholder for missing keys
    print("Current metadata:")
    print(f"Title: {docinfo.get('/Title', 'No title')}")
    print(f"Author: {docinfo.get('/Author', 'No author')}")
    print(f"Subject: {docinfo.get('/Subject', 'No subject')}")
    print(f"Keywords: {docinfo.get('/Keywords', 'No keywords')}")
    print(f"Creator: {docinfo.get('/Creator', 'No creator')}")
    print(f"Producer: {docinfo.get('/Producer', 'No producer')}")

    # Update metadata
    docinfo['/Title'] = pikepdf.String('Updated Document Title')
    docinfo['/Author'] = pikepdf.String('Jane Doe')
    docinfo['/Subject'] = pikepdf.String('Technical Documentation')
    docinfo['/Keywords'] = pikepdf.String('PDF, documentation, technical, guide')
    docinfo['/Creator'] = pikepdf.String('Python Script')

    # Set creation and modification dates.
    # A PDF date string is D:YYYYMMDDHHmmSS followed by the UTC offset written
    # as +HH'mm'.  strftime's %z gives "" for a naive datetime and "+HHMM" for
    # an aware one, so take an aware local timestamp and format the offset
    # explicitly.
    now = datetime.now().astimezone()
    utc_offset = now.strftime("%z")  # e.g. "+0530"
    offset_hours, offset_minutes = utc_offset[:3], utc_offset[3:5]
    current_date = f"{now.strftime('D:%Y%m%d%H%M%S')}{offset_hours}'{offset_minutes}'"
    docinfo['/CreationDate'] = pikepdf.String(current_date)
    docinfo['/ModDate'] = pikepdf.String(current_date)

    pdf.save('updated_metadata.pdf')

Working with XMP Metadata

import pikepdf
from datetime import datetime

# Open PDF and access XMP metadata.  Using the Pdf as a context manager
# guarantees the file is closed even when an error other than DependencyError
# escapes the try block below.
with pikepdf.open('document.pdf') as pdf:
    try:
        # Create XMP metadata handler
        metadata = pikepdf.PdfMetadata(pdf)

        print("XMP Metadata:")
        print(f"Title: {metadata.title}")
        print(f"Author: {metadata.author}")
        print(f"Subject: {metadata.subject}")
        print(f"Keywords: {metadata.keywords}")
        print(f"Creator: {metadata.creator}")
        print(f"Producer: {metadata.producer}")
        print(f"Creation Date: {metadata.creation_date}")
        print(f"Modification Date: {metadata.modification_date}")
        print(f"PDF/A Status: {metadata.pdfa_status}")

        # Update XMP metadata
        metadata.title = "Comprehensive PDF Guide"
        metadata.author = "Technical Writing Team"
        metadata.subject = "Complete guide to PDF operations using pikepdf"
        metadata.keywords = "PDF, Python, pikepdf, documentation, tutorial"
        metadata.creator = "Python Documentation Generator"

        # Set dates in ISO format; one timestamp keeps both fields identical
        now = datetime.now().isoformat()
        metadata.creation_date = now
        metadata.modification_date = now

        # Synchronize XMP with document info
        metadata.save_to_docinfo(pdf.docinfo)

        pdf.save('xmp_updated.pdf')
        print("XMP metadata updated successfully")

    except pikepdf.DependencyError:
        print("XMP processing libraries not available - using basic metadata only")

        # Fall back to basic document info
        docinfo = pdf.docinfo
        docinfo['/Title'] = pikepdf.String("Comprehensive PDF Guide")
        docinfo['/Author'] = pikepdf.String("Technical Writing Team")
        pdf.save('basic_metadata_updated.pdf')

PDF/A Compliance and Metadata

import pikepdf
from datetime import datetime

def create_pdfa_compliant_document():
    """Create a one-page PDF carrying the descriptive metadata PDF/A expects.

    Builds a new document with a single blank page and a short text content
    stream, fills in the XMP metadata fields, copies them to the document
    info dictionary and writes ``pdfa_compliant.pdf``.

    The output is not a validated PDF/A file: the comment block inside the
    function lists further requirements that this example does not implement.

    Returns:
        None. Progress is reported with ``print``; if the XMP handler raises
        ``pikepdf.DependencyError`` a message is printed and nothing is saved.
    """

    # Minimal page content: one line of text
    content = """
    BT
    /F1 12 Tf
    100 700 Td
    (PDF/A Compliant Document) Tj
    ET
    """

    # The context manager closes the document on every exit path, including
    # exceptions other than DependencyError raised while building or saving.
    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()
        content_stream = pikepdf.Stream(pdf, content.encode())
        page['/Contents'] = content_stream

        try:
            # Set up XMP metadata for PDF/A compliance
            metadata = pikepdf.PdfMetadata(pdf)

            # Required metadata for PDF/A
            metadata.title = "PDF/A Compliant Document"
            metadata.author = "Document Generator"
            metadata.subject = "Sample PDF/A document with complete metadata"
            metadata.keywords = "PDF/A, compliance, archival, standard"
            metadata.creator = "Python pikepdf library"
            metadata.producer = f"pikepdf {pikepdf.__version__}"

            # Set required dates; one timestamp keeps both fields identical
            now = datetime.now().isoformat()
            metadata.creation_date = now
            metadata.modification_date = now

            # Synchronize with document info
            metadata.save_to_docinfo(pdf.docinfo)

            # Additional PDF/A requirements would include:
            # - Embedded fonts
            # - Color profile
            # - Proper XMP packet
            # - No encryption
            # - No external dependencies

            pdf.save('pdfa_compliant.pdf')
            print("Created PDF/A compliant document with metadata")
            print(f"PDF/A Status: {metadata.pdfa_status}")

        except pikepdf.DependencyError:
            print("XMP libraries not available - cannot create full PDF/A compliance")

create_pdfa_compliant_document()

Metadata Analysis and Reporting

import pikepdf
from pathlib import Path
from datetime import datetime

def analyze_pdf_metadata(pdf_path):
    """Collect file facts, document-info and XMP metadata for one PDF.

    Parameters:
        pdf_path: Location of the PDF, as a ``pathlib.Path`` or a string.

    Returns:
        dict: On success, the keys ``file``, ``file_size``, ``pages``,
        ``pdf_version``, ``is_encrypted``, ``docinfo`` (a dict of strings) and
        ``has_xmp``; plus ``xmp`` when the XMP handler could be read, or
        ``xmp_error`` when it could not. If the file cannot be opened or
        analysed at all, only ``file`` and ``error`` are present.

    This function reports problems through the returned dict rather than
    raising, so a directory scan can continue past a bad file.
    """

    try:
        # Coerce so that plain string paths work for the stat() call below.
        path = Path(pdf_path)

        # The context manager closes the PDF even when analysis fails midway.
        with pikepdf.open(path) as pdf:
            analysis = {
                'file': str(pdf_path),
                'file_size': path.stat().st_size,
                'pages': len(pdf.pages),
                'pdf_version': pdf.pdf_version,
                'is_encrypted': pdf.is_encrypted
            }

            # Document info metadata; missing entries become empty strings
            docinfo = pdf.docinfo
            analysis['docinfo'] = {
                'title': str(docinfo.get('/Title', '')),
                'author': str(docinfo.get('/Author', '')),
                'subject': str(docinfo.get('/Subject', '')),
                'keywords': str(docinfo.get('/Keywords', '')),
                'creator': str(docinfo.get('/Creator', '')),
                'producer': str(docinfo.get('/Producer', '')),
                'creation_date': str(docinfo.get('/CreationDate', '')),
                'modification_date': str(docinfo.get('/ModDate', '')),
                'trapped': str(docinfo.get('/Trapped', ''))
            }

            # Try XMP metadata; failure here is recorded, not fatal
            try:
                metadata = pikepdf.PdfMetadata(pdf)
                analysis['xmp'] = {
                    'title': metadata.title,
                    'author': metadata.author,
                    'subject': metadata.subject,
                    'keywords': metadata.keywords,
                    'creator': metadata.creator,
                    'producer': metadata.producer,
                    'creation_date': metadata.creation_date,
                    'modification_date': metadata.modification_date,
                    'pdfa_status': metadata.pdfa_status
                }
                analysis['has_xmp'] = True
            except pikepdf.DependencyError:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = "XMP libraries not available"
            except Exception as e:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = str(e)

            return analysis

    except Exception as e:
        # Deliberate catch-all at the per-file boundary: the caller receives
        # the failure as data.
        return {'file': str(pdf_path), 'error': str(e)}

def metadata_report(directory_path):
    """Print a metadata summary for each ``*.pdf`` directly inside a directory.

    Each file is analysed with ``analyze_pdf_metadata``. Files that could not
    be analysed get a single error line; the others get a size/pages/version
    line, an encryption marker when applicable, the non-empty document-info
    fields, and the non-empty XMP fields (or the XMP error, if one was
    recorded).

    Parameters:
        directory_path: Directory to scan (not recursive).
    """

    # (analysis key, printed label) pairs, in the order they are printed
    docinfo_rows = (
        ('title', 'Title'),
        ('author', 'Author'),
        ('creator', 'Creator'),
        ('producer', 'Producer'),
        ('creation_date', 'Created'),
        ('modification_date', 'Modified'),
    )
    xmp_rows = (
        ('title', 'Title'),
        ('author', 'Author'),
        ('subject', 'Subject'),
        ('pdfa_status', 'PDF/A'),
    )

    root = Path(directory_path)
    targets = list(root.glob('*.pdf'))

    print(f"PDF Metadata Report for: {root}")
    print("=" * 80)

    for target in targets:
        info = analyze_pdf_metadata(target)

        # Unreadable file: one error line, then move on
        if 'error' in info:
            print(f"\n❌ {target.name}: {info['error']}")
            continue

        print(f"\n📄 {target.name}")
        size, pages, version = info['file_size'], info['pages'], info['pdf_version']
        print(f"   Size: {size:,} bytes, Pages: {pages}, Version: {version}")

        if info['is_encrypted']:
            print("   🔒 ENCRYPTED")

        # Document Info metadata
        legacy = info['docinfo']
        if not any(legacy.values()):
            print("   📋 No Document Info metadata")
        else:
            print("   Document Info:")
            for key, label in docinfo_rows:
                if legacy[key]:
                    print(f"     {label}: {legacy[key]}")

        # XMP metadata
        if not info['has_xmp']:
            if 'xmp_error' in info:
                print(f"   ⚠️  XMP: {info['xmp_error']}")
            continue

        xmp = info['xmp']
        if not any([xmp['title'], xmp['author'], xmp['subject']]):
            print("   📋 XMP present but minimal")
            continue

        print("   XMP Metadata:")
        for key, label in xmp_rows:
            if xmp[key]:
                print(f"     {label}: {xmp[key]}")

# Generate metadata report
# metadata_report('.')

Batch Metadata Operations

import pikepdf
from pathlib import Path
from datetime import datetime

def standardize_metadata(directory_path, template_metadata):
    """Apply shared author/creator/producer metadata to every PDF in a directory.

    Each ``*.pdf`` directly inside ``directory_path`` is opened, updated and
    saved back over the original file. An existing title is kept; a missing
    one is derived from the file name. The modification date is refreshed.

    Parameters:
        directory_path: Directory to scan (not recursive).
        template_metadata (dict): May contain ``author``, ``creator`` and
            ``producer``; absent or empty entries are left untouched.

    Returns:
        dict: ``updated`` (list of paths), ``failed`` (list of
        ``(path, message)`` tuples) and ``skipped`` (list of
        ``(path, reason)`` tuples).
    """

    directory = Path(directory_path)
    results = {'updated': [], 'failed': [], 'skipped': []}

    # (template key / XMP attribute, document-info key)
    shared_fields = (
        ('author', '/Author'),
        ('creator', '/Creator'),
        ('producer', '/Producer'),
    )

    # Materialise and sort up front: files are rewritten during the loop, and
    # a stable order makes the result lists reproducible.
    for pdf_file in sorted(directory.glob('*.pdf')):
        try:
            # allow_overwriting_input is required for the argument-less
            # pdf.save() below to write back to the file that was opened.
            # The context manager closes the PDF on every exit path.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                # Skip encrypted files
                if pdf.is_encrypted:
                    results['skipped'].append((str(pdf_file), "Encrypted"))
                    continue

                # Update document info
                docinfo = pdf.docinfo

                # Apply template metadata
                for template_key, docinfo_key in shared_fields:
                    value = template_metadata.get(template_key)
                    if value:
                        docinfo[docinfo_key] = pikepdf.String(value)

                # Update modification date.  One aware timestamp feeds both
                # the PDF-format date (D:YYYYMMDDHHmmSS+HH'mm') and the ISO
                # date written to XMP, so the two cannot disagree.
                now = datetime.now().astimezone()
                utc_offset = now.strftime("%z")  # e.g. "+0530"
                current_date = (
                    f"{now.strftime('D:%Y%m%d%H%M%S')}"
                    f"{utc_offset[:3]}'{utc_offset[3:5]}'"
                )
                docinfo['/ModDate'] = pikepdf.String(current_date)

                # Preserve existing title if present, otherwise use filename
                if not docinfo.get('/Title'):
                    title = pdf_file.stem.replace('_', ' ').replace('-', ' ').title()
                    docinfo['/Title'] = pikepdf.String(title)

                # Try XMP update if available
                try:
                    metadata = pikepdf.PdfMetadata(pdf)
                    for template_key, _ in shared_fields:
                        value = template_metadata.get(template_key)
                        if value:
                            setattr(metadata, template_key, value)

                    metadata.modification_date = now.isoformat()
                    metadata.save_to_docinfo(docinfo)
                except pikepdf.DependencyError:
                    pass  # XMP not available, document info is sufficient

                # Save changes back to the original file
                pdf.save()

            results['updated'].append(str(pdf_file))

        except Exception as e:
            # Per-file boundary: record the failure and keep processing.
            results['failed'].append((str(pdf_file), str(e)))

    print("Metadata standardization complete:")
    print(f"  Updated: {len(results['updated'])} files")
    print(f"  Failed: {len(results['failed'])} files")
    print(f"  Skipped: {len(results['skipped'])} files")

    return results

# Standardize metadata with template
# Values shared by every document processed with standardize_metadata
template = dict(
    author='Corporate Documentation Team',
    creator='Document Management System',
    producer=f'pikepdf {pikepdf.__version__}',
)

# results = standardize_metadata('.', template)

Custom Metadata Fields

import pikepdf

def add_custom_metadata(pdf_path, custom_fields):
    """Add custom metadata fields to a PDF's document info dictionary.

    The file at ``pdf_path`` is updated in place.

    Parameters:
        pdf_path: Path of the PDF to modify.
        custom_fields (dict): Field name -> value. Names are turned into
            document-info keys by prefixing a slash (a slash already supplied
            by the caller is not doubled); values are stored as PDF strings
            via ``str()``.
    """

    # allow_overwriting_input is required for the argument-less pdf.save()
    # below to write back to the file that was opened.  The context manager
    # closes the PDF on every exit path.
    with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
        docinfo = pdf.docinfo

        # Add custom fields to document info
        for field_name, field_value in custom_fields.items():
            # Custom fields should use proper PDF name format
            pdf_field_name = '/' + str(field_name).lstrip('/')
            docinfo[pdf_field_name] = pikepdf.String(str(field_value))

        # Also try to add to XMP if available
        try:
            metadata = pikepdf.PdfMetadata(pdf)

            # Custom XMP properties would require namespace registration
            # For basic use, document info is sufficient
            metadata.save_to_docinfo(docinfo)

        except pikepdf.DependencyError:
            pass  # XMP not available; the document info entries still apply

        pdf.save()

    print(f"Added custom metadata to {pdf_path}")

# Add custom metadata
# Example custom fields; each key becomes a document-info entry name
custom_metadata = dict(
    Department='Engineering',
    Project='API Documentation',
    Version='2.1.0',
    Status='Final',
    ReviewedBy='Technical Lead',
    ApprovalDate='2024-09-10',
    DocumentID='DOC-2024-001',
    SecurityClass='Internal',
)

# add_custom_metadata('document.pdf', custom_metadata)

def extract_custom_metadata(pdf_path):
    """Print every document-info entry of a PDF, standard fields first.

    Standard fields are printed in a fixed order when present; every other
    key found in the document info dictionary is then listed as a custom
    field. Key names are shown without their leading slash.

    Parameters:
        pdf_path: Path of the PDF to inspect.
    """

    # Standard fields, in the order they are printed
    standard_fields = ['/Title', '/Author', '/Subject', '/Keywords',
                       '/Creator', '/Producer', '/CreationDate', '/ModDate', '/Trapped']

    # The context manager closes the PDF even if a lookup or print raises.
    with pikepdf.open(pdf_path) as pdf:
        docinfo = pdf.docinfo

        print(f"All metadata for: {pdf_path}")
        print("=" * 50)

        print("Standard Fields:")
        for field in standard_fields:
            if field in docinfo:
                print(f"  {field[1:]}: {docinfo[field]}")

        # Custom fields (anything not in standard list)
        custom_fields = [key for key in docinfo.keys() if key not in standard_fields]

        if custom_fields:
            print("\nCustom Fields:")
            for field in custom_fields:
                print(f"  {field[1:]}: {docinfo[field]}")
        else:
            print("\nNo custom fields found")

# Extract all metadata including custom fields  
# extract_custom_metadata('document.pdf')

Install with Tessl CLI