tessl/pypi-pypdf

A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Metadata

Name: tessl/pypi-pypdf
Author: tessl

Access and manipulation of PDF metadata, document properties, XMP information, and custom document attributes. pypdf provides comprehensive metadata handling for both reading existing information and setting new properties.

Capabilities

Document Information

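The DocumentInformation class provides access to the standard PDF metadata fields. Each field is exposed as a pair of properties: a processed one (for example, dates are returned as datetime objects) and a matching `_raw` one that returns the raw value (for dates, the raw date string).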

class DocumentInformation:
    """Accessors for the standard PDF document-information fields.

    Each field is exposed as a pair of properties: ``<field>`` returns the
    processed value and ``<field>_raw`` returns the raw value. Every
    property is annotated as possibly returning ``None``
    (presumably when the field is absent from the document — confirm).
    Only getters are shown in this listing.
    """

    @property
    def title(self) -> str | None:
        """Return the document title in processed form, or ``None``."""

    @property
    def title_raw(self) -> str | None:
        """Return the raw, unprocessed document title, or ``None``."""

    @property
    def author(self) -> str | None:
        """Return the document author in processed form, or ``None``."""

    @property
    def author_raw(self) -> str | None:
        """Return the raw, unprocessed document author, or ``None``."""

    @property
    def subject(self) -> str | None:
        """Return the document subject in processed form, or ``None``."""

    @property
    def subject_raw(self) -> str | None:
        """Return the raw, unprocessed document subject, or ``None``."""

    @property
    def creator(self) -> str | None:
        """Return the creating application in processed form, or ``None``."""

    @property
    def creator_raw(self) -> str | None:
        """Return the raw, unprocessed creating application, or ``None``."""

    @property
    def producer(self) -> str | None:
        """Return the PDF producer in processed form, or ``None``."""

    @property
    def producer_raw(self) -> str | None:
        """Return the raw, unprocessed PDF producer, or ``None``."""

    @property
    def creation_date(self) -> datetime | None:
        """Return the creation date as a ``datetime`` object, or ``None``."""

    @property
    def creation_date_raw(self) -> str | None:
        """Return the creation date as the raw date string, or ``None``."""

    @property
    def modification_date(self) -> datetime | None:
        """Return the modification date as a ``datetime`` object, or ``None``."""

    @property
    def modification_date_raw(self) -> str | None:
        """Return the modification date as the raw date string, or ``None``."""

    @property
    def keywords(self) -> str | None:
        """Return the document keywords in processed form, or ``None``."""

    @property
    def keywords_raw(self) -> str | None:
        """Return the raw, unprocessed document keywords, or ``None``."""

XMP Metadata

Extended metadata support through XMP (Extensible Metadata Platform) for advanced metadata handling.

class XmpInformation:
    """XMP metadata information class for advanced metadata handling.

    Both lookups shown here are keyed by the ``about`` URI of the described
    resource and by an XML namespace.
    """

    def get_element(self, about_uri: str, namespace: str, name: str):
        """
        Get an XMP metadata element.

        Args:
            about_uri: URI identifying the resource the metadata describes
            namespace: XML namespace of the element
                (for example "http://purl.org/dc/elements/1.1/" for Dublin Core)
            name: Element name within that namespace (for example "title")

        Returns:
            Element value or None

        NOTE(review): upstream pypdf appears to annotate this method as
        returning an iterator of matching XML nodes rather than a single
        value — confirm against the installed version before relying on
        the return shape described above.
        """

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> list:
        """
        Get all nodes in a specific namespace.

        Args:
            about_uri: URI identifying the resource the metadata describes
            namespace: XML namespace whose nodes are wanted

        Returns:
            List of nodes in the namespace

        NOTE(review): upstream pypdf appears to annotate this method as
        returning an iterator, not a list — confirm before indexing or
        calling ``len()`` on the result.
        """

Usage Examples

Reading Basic Metadata

from pypdf import PdfReader

reader = PdfReader("document.pdf")
metadata = reader.metadata

# Handle the "nothing to show" case first, then list every standard field.
if not metadata:
    print("No metadata available")
else:
    # Descriptive text fields
    print(f"Title: {metadata.title}")
    print(f"Author: {metadata.author}")
    print(f"Subject: {metadata.subject}")
    # Producing software
    print(f"Creator: {metadata.creator}")
    print(f"Producer: {metadata.producer}")
    # Dates are shown in their processed (datetime) form
    print(f"Creation Date: {metadata.creation_date}")
    print(f"Modification Date: {metadata.modification_date}")
    print(f"Keywords: {metadata.keywords}")

Reading Raw Metadata

from pypdf import PdfReader

def show_processed_and_raw(metadata):
    """Print title and author, first in processed form and then in raw form."""
    print("Processed values:")
    print(f"  Title: {metadata.title}")
    print(f"  Author: {metadata.author}")

    print("\nRaw values:")
    print(f"  Title: {metadata.title_raw}")
    print(f"  Author: {metadata.author_raw}")


reader = PdfReader("document.pdf")
metadata = reader.metadata

# Nothing is printed when the document carries no metadata
if metadata:
    show_processed_and_raw(metadata)

Writing Metadata

from pypdf import PdfReader, PdfWriter
from datetime import datetime, timezone

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy all pages
for page in reader.pages:
    writer.add_page(page)

# PDF dates are strings of the form D:YYYYMMDDHHmmSS+HH'mm'. Metadata values
# are stored as text, so a datetime object passed directly would be written
# in its str() form ("2024-01-31 12:00:00.123456"), which is not a valid PDF
# date. Format the timestamp explicitly, and only once so that the creation
# and modification dates agree.
now = datetime.now(timezone.utc).strftime("D:%Y%m%d%H%M%S+00'00'")

# Set metadata
writer.add_metadata({
    "/Title": "Updated Document Title",
    "/Author": "John Doe",
    "/Subject": "Updated document subject",
    "/Creator": "My Application",
    "/Producer": "pypdf",
    "/Keywords": "PDF, metadata, pypdf",
    "/CreationDate": now,
    "/ModDate": now,
})

with open("output_with_metadata.pdf", "wb") as output:
    writer.write(output)

Copying and Modifying Metadata

from pypdf import PdfReader, PdfWriter
from datetime import datetime, timezone

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy pages
for page in reader.pages:
    writer.add_page(page)

# Get existing metadata
existing_metadata = reader.metadata

# Create updated metadata dictionary
new_metadata = {}
if existing_metadata:
    # Copy existing metadata (only the fields that actually have a value)
    if existing_metadata.title:
        new_metadata["/Title"] = existing_metadata.title
    if existing_metadata.author:
        new_metadata["/Author"] = existing_metadata.author
    if existing_metadata.subject:
        new_metadata["/Subject"] = existing_metadata.subject
    if existing_metadata.creator:
        new_metadata["/Creator"] = existing_metadata.creator
    if existing_metadata.keywords:
        new_metadata["/Keywords"] = existing_metadata.keywords
    # Keep the original creation date. The raw value is reused as-is so the
    # stored date string is carried over unchanged instead of being
    # re-formatted.
    if existing_metadata.creation_date_raw:
        new_metadata["/CreationDate"] = existing_metadata.creation_date_raw

# Update specific fields
new_metadata["/Producer"] = "pypdf 6.0.0"
# PDF dates are strings of the form D:YYYYMMDDHHmmSS+HH'mm'; a datetime
# object passed directly would be stored in its str() form, which is not a
# valid PDF date.
new_metadata["/ModDate"] = datetime.now(timezone.utc).strftime(
    "D:%Y%m%d%H%M%S+00'00'"
)

# Add custom metadata
new_metadata["/Custom"] = "Custom metadata value"

writer.add_metadata(new_metadata)

with open("updated_metadata.pdf", "wb") as output:
    writer.write(output)

Working with XMP Metadata

from pypdf import PdfReader

reader = PdfReader("document_with_xmp.pdf")

# Look the XMP packet up once and reuse it below
xmp = reader.xmp_metadata

# Check if XMP metadata exists
if xmp:
    print("XMP metadata found")

    try:
        # Read the Dublin Core elements (namespace
        # http://purl.org/dc/elements/1.1/) through the dedicated dc_*
        # properties, which return ready-to-print Python values.
        # get_element() yields XML nodes lazily, so formatting its result
        # directly would only print a generator object.
        title = xmp.dc_title              # mapping of language -> title text
        creator = xmp.dc_creator          # list of creator names
        description = xmp.dc_description  # mapping of language -> description

        print(f"DC Title: {title}")
        print(f"DC Creator: {creator}")
        print(f"DC Description: {description}")

    except Exception as e:
        # Best-effort example: report the problem instead of aborting
        print(f"Error reading XMP metadata: {e}")

else:
    print("No XMP metadata found")

Metadata Extraction Report

from pypdf import PdfReader
from datetime import datetime
import json

def extract_metadata_report(pdf_path: str) -> dict:
    """
    Build a metadata report for a single PDF file.

    The report always contains the keys "file_path", "extraction_time",
    "basic_metadata", "raw_metadata", "xmp_metadata" and "document_info".
    If anything goes wrong while reading the file, the message is stored
    under "error" and the sections filled in so far are kept.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Dictionary containing all metadata information
    """
    # Processed text fields, in the order they appear in the report
    text_fields = ("title", "author", "subject", "creator", "producer")
    raw_fields = (
        "title_raw",
        "author_raw",
        "subject_raw",
        "creator_raw",
        "producer_raw",
        "creation_date_raw",
        "modification_date_raw",
        "keywords_raw",
    )

    def iso_or_none(moment):
        # Dates are reported as ISO 8601 text; a missing date stays None
        return moment.isoformat() if moment else None

    report = {
        "file_path": pdf_path,
        "extraction_time": datetime.now().isoformat(),
        "basic_metadata": {},
        "raw_metadata": {},
        "xmp_metadata": {},
        "document_info": {},
    }

    try:
        reader = PdfReader(pdf_path)

        # General facts about the document itself
        report["document_info"] = {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "pdf_header": reader.pdf_header,
        }

        info = reader.metadata
        if info:
            # Processed values; each section is assigned only once complete
            processed = {field: getattr(info, field) for field in text_fields}
            processed["creation_date"] = iso_or_none(info.creation_date)
            processed["modification_date"] = iso_or_none(info.modification_date)
            processed["keywords"] = info.keywords
            report["basic_metadata"] = processed

            # Raw counterparts of the same fields
            report["raw_metadata"] = {
                field: getattr(info, field) for field in raw_fields
            }

        # Only the presence of XMP metadata is recorded. Parsing it would
        # require a more specific implementation based on the actual XMP
        # structure in the document.
        report["xmp_metadata"]["present"] = bool(reader.xmp_metadata)

    except Exception as e:
        report["error"] = str(e)

    return report

# Build the report for one file and print it as indented JSON
report = extract_metadata_report("document.pdf")
report_json = json.dumps(report, indent=2)
print(report_json)

Batch Metadata Processing

from pypdf import PdfReader, PdfWriter
from pathlib import Path
import csv
from datetime import datetime

def extract_metadata_to_csv(pdf_directory: str, csv_output: str):
    """
    Collect the metadata of every "*.pdf" file in a directory into one CSV.

    Files that cannot be processed are reported on stdout and skipped.
    The CSV is only written when at least one file was processed.

    Args:
        pdf_directory: Directory containing PDF files
        csv_output: Output CSV file path
    """
    # Metadata columns, in output order; each is left empty when the
    # document has no metadata
    info_fields = (
        "title",
        "author",
        "subject",
        "creator",
        "producer",
        "creation_date",
        "modification_date",
        "keywords",
    )

    rows = []

    for pdf_path in Path(pdf_directory).glob("*.pdf"):
        try:
            reader = PdfReader(str(pdf_path))
            info = reader.metadata

            row = {"filename": pdf_path.name}
            for field in info_fields:
                row[field] = getattr(info, field) if info else ""
            row["page_count"] = len(reader.pages)
            row["is_encrypted"] = reader.is_encrypted
            row["pdf_version"] = reader.pdf_header
        except Exception as e:
            print(f"Error processing {pdf_path.name}: {e}")
        else:
            rows.append(row)

    # Nothing collected: leave without creating the output file
    if not rows:
        return

    # Write to CSV, taking the column order from the first record
    with open(csv_output, 'w', newline='', encoding='utf-8') as csvfile:
        sheet = csv.DictWriter(csvfile, fieldnames=rows[0].keys())
        sheet.writeheader()
        sheet.writerows(rows)

    print(f"Metadata extracted to {csv_output}")
    print(f"Processed {len(rows)} PDF files")

# Scan the "pdf_collection/" directory and write one CSV row per PDF
extract_metadata_to_csv(
    pdf_directory="pdf_collection/",
    csv_output="pdf_metadata.csv",
)

Setting Custom Metadata Fields

from pypdf import PdfReader, PdfWriter
from datetime import datetime, timezone


def to_pdf_date(moment: datetime) -> str:
    """Format *moment* as a PDF date string (``D:YYYYMMDDHHmmSS+HH'mm'``).

    The UTC-offset suffix is appended only for timezone-aware datetimes;
    naive datetimes are written without an offset.
    """
    stamp = moment.strftime("D:%Y%m%d%H%M%S")
    offset = moment.strftime("%z")  # "" when naive, otherwise like "+0100"
    if offset:
        stamp += f"{offset[:3]}'{offset[3:5]}'"
    return stamp


reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy pages
for page in reader.pages:
    writer.add_page(page)

# Metadata values are stored as text, so datetime objects are converted to
# PDF date strings up front; passed directly they would be written in their
# str() form, which is not a valid PDF date. One timestamp is shared so the
# creation and modification dates agree.
now = to_pdf_date(datetime.now(timezone.utc))

# Set comprehensive metadata with custom fields
metadata = {
    # Standard fields
    "/Title": "My Document",
    "/Author": "Jane Smith",
    "/Subject": "Important Document",
    "/Creator": "My Application v2.0",
    "/Producer": "pypdf 6.0.0",
    "/Keywords": "important, document, processing",
    "/CreationDate": now,
    "/ModDate": now,

    # Custom fields
    "/Department": "Engineering",
    "/ProjectCode": "PROJ-2024-001",
    "/Classification": "Internal",
    "/ReviewDate": to_pdf_date(datetime(2024, 12, 31)),
    "/Version": "1.0",
    "/ApprovedBy": "Manager Name",
}

writer.add_metadata(metadata)

with open("document_with_custom_metadata.pdf", "wb") as output:
    writer.write(output)

Install with Tessl CLI