A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Access and manipulation of PDF metadata, document properties, XMP information, and custom document attributes. pypdf provides comprehensive metadata handling for both reading existing information and setting new properties.
The DocumentInformation class provides access to standard PDF metadata fields with both processed and raw value access.
class DocumentInformation:
@property
def title(self) -> str | None:
"""Get the document title (processed)."""
@property
def title_raw(self) -> str | None:
"""Get the raw document title."""
@property
def author(self) -> str | None:
"""Get the document author (processed)."""
@property
def author_raw(self) -> str | None:
"""Get the raw document author."""
@property
def subject(self) -> str | None:
"""Get the document subject (processed)."""
@property
def subject_raw(self) -> str | None:
"""Get the raw document subject."""
@property
def creator(self) -> str | None:
"""Get the creating application (processed)."""
@property
def creator_raw(self) -> str | None:
"""Get the raw creating application."""
@property
def producer(self) -> str | None:
"""Get the PDF producer (processed)."""
@property
def producer_raw(self) -> str | None:
"""Get the raw PDF producer."""
@property
def creation_date(self) -> datetime | None:
"""Get the creation date as datetime object."""
@property
def creation_date_raw(self) -> str | None:
"""Get the raw creation date string."""
@property
def modification_date(self) -> datetime | None:
"""Get the modification date as datetime object."""
@property
def modification_date_raw(self) -> str | None:
"""Get the raw modification date string."""
@property
def keywords(self) -> str | None:
"""Get the document keywords (processed)."""
@property
def keywords_raw(self) -> str | None:
"""Get the raw document keywords."""
Extended metadata support through XMP (Extensible Metadata Platform) for advanced metadata handling.
class XmpInformation:
"""XMP metadata information class for advanced metadata handling."""
def get_element(self, about_uri: str, namespace: str, name: str):
"""
Get an XMP metadata element.
Args:
about_uri: URI identifying the resource
namespace: XML namespace
name: Element name
Returns:
Iterator over the matching attribute and element nodes (empty when nothing matches)
"""
def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> list:
"""
Get all nodes in a specific namespace.
Args:
about_uri: URI identifying the resource
namespace: XML namespace
Returns:
List of nodes in the namespace
"""
from pypdf import PdfReader
reader = PdfReader("document.pdf")
metadata = reader.metadata
if metadata:
print(f"Title: {metadata.title}")
print(f"Author: {metadata.author}")
print(f"Subject: {metadata.subject}")
print(f"Creator: {metadata.creator}")
print(f"Producer: {metadata.producer}")
print(f"Creation Date: {metadata.creation_date}")
print(f"Modification Date: {metadata.modification_date}")
print(f"Keywords: {metadata.keywords}")
else:
    print("No metadata available")
from pypdf import PdfReader
reader = PdfReader("document.pdf")
metadata = reader.metadata
if metadata:
# Compare processed vs raw values
print("Processed values:")
print(f" Title: {metadata.title}")
print(f" Author: {metadata.author}")
print("\nRaw values:")
print(f" Title: {metadata.title_raw}")
print(f" Author: {metadata.author_raw}")
from pypdf import PdfReader, PdfWriter
from datetime import datetime
reader = PdfReader("input.pdf")
writer = PdfWriter()
# Copy all pages
for page in reader.pages:
writer.add_page(page)
# Set metadata
# Dates are pre-formatted as PDF date strings (D:YYYYMMDDHHmmSS), which is
# how pypdf's metadata guide writes them. A raw datetime object is expected
# to be stored through str(), which is not a parseable PDF date
# (TODO confirm against the pypdf version in use).
pdf_now = datetime.now().strftime("D:%Y%m%d%H%M%S")
writer.add_metadata({
    "/Title": "Updated Document Title",
    "/Author": "John Doe",
    "/Subject": "Updated document subject",
    "/Creator": "My Application",
    "/Producer": "pypdf",
    "/Keywords": "PDF, metadata, pypdf",
    "/CreationDate": pdf_now,
    "/ModDate": pdf_now,
})
with open("output_with_metadata.pdf", "wb") as output:
    writer.write(output)
from pypdf import PdfReader, PdfWriter
from datetime import datetime
reader = PdfReader("input.pdf")
writer = PdfWriter()
# Copy pages
for page in reader.pages:
writer.add_page(page)
# Get existing metadata
existing_metadata = reader.metadata
# Create updated metadata dictionary
new_metadata = {}
if existing_metadata:
    # Copy the standard text fields that are present and non-empty
    for key, value in (
        ("/Title", existing_metadata.title),
        ("/Author", existing_metadata.author),
        ("/Subject", existing_metadata.subject),
        ("/Creator", existing_metadata.creator),
        ("/Keywords", existing_metadata.keywords),
    ):
        if value:
            new_metadata[key] = value
    # Keep the original creation date; the raw value is already a PDF date
    # string, so it can be written back unchanged.
    if existing_metadata.creation_date_raw:
        new_metadata["/CreationDate"] = existing_metadata.creation_date_raw
# Update specific fields
new_metadata["/Producer"] = "pypdf 6.0.0"
# Write the date as a PDF date string (D:YYYYMMDDHHmmSS) rather than a
# datetime object, whose str() form is not a valid PDF date.
new_metadata["/ModDate"] = datetime.now().strftime("D:%Y%m%d%H%M%S")
# Add custom metadata
new_metadata["/Custom"] = "Custom metadata value"
writer.add_metadata(new_metadata)
with open("updated_metadata.pdf", "wb") as output:
    writer.write(output)
from pypdf import PdfReader
reader = PdfReader("document_with_xmp.pdf")
# Check if XMP metadata exists
xmp = reader.xmp_metadata  # read the XMP packet once and reuse it below
if xmp:
    print("XMP metadata found")
    # Get Dublin Core elements.
    # get_element() yields raw XML nodes, so printing its result would only
    # show a generator object. The dc_* properties return the parsed values
    # (language-keyed mapping for title/description, list for creator;
    # exact shapes to be confirmed against the installed pypdf version).
    try:
        title = xmp.dc_title
        creator = xmp.dc_creator
        description = xmp.dc_description
        print(f"DC Title: {title}")
        print(f"DC Creator: {creator}")
        print(f"DC Description: {description}")
    except Exception as e:
        # Best-effort example: report the problem instead of aborting
        print(f"Error reading XMP metadata: {e}")
else:
    print("No XMP metadata found")
from pypdf import PdfReader
from datetime import datetime
import json
def extract_metadata_report(pdf_path: str) -> dict:
    """
    Extract comprehensive metadata report from a PDF.

    The function does not raise for unreadable files: any exception hit while
    opening or reading the PDF is recorded under the ``"error"`` key and the
    sections gathered up to that point are returned unchanged.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Dictionary with the keys ``file_path``, ``extraction_time``,
        ``basic_metadata``, ``raw_metadata``, ``xmp_metadata`` and
        ``document_info``, plus ``error`` when reading failed.
    """

    def iso_or_none(info, attr: str) -> str | None:
        # The parsed date properties are evaluated on access; a malformed
        # date string is expected to surface as ValueError (TODO confirm for
        # the pypdf version in use). Treat that as "no parsed date" so one
        # bad field does not discard the rest of the report; the raw string
        # is still reported under "raw_metadata".
        try:
            value = getattr(info, attr)
        except ValueError:
            return None
        return value.isoformat() if value else None

    report = {
        "file_path": pdf_path,
        "extraction_time": datetime.now().isoformat(),
        "basic_metadata": {},
        "raw_metadata": {},
        "xmp_metadata": {},
        "document_info": {},
    }
    try:
        reader = PdfReader(pdf_path)

        # Basic document information
        report["document_info"] = {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "pdf_header": reader.pdf_header,
        }

        # Standard metadata (looked up once and reused)
        metadata = reader.metadata
        if metadata:
            # Raw metadata first: these values need no parsing
            report["raw_metadata"] = {
                "title_raw": metadata.title_raw,
                "author_raw": metadata.author_raw,
                "subject_raw": metadata.subject_raw,
                "creator_raw": metadata.creator_raw,
                "producer_raw": metadata.producer_raw,
                "creation_date_raw": metadata.creation_date_raw,
                "modification_date_raw": metadata.modification_date_raw,
                "keywords_raw": metadata.keywords_raw,
            }
            # Processed metadata
            report["basic_metadata"] = {
                "title": metadata.title,
                "author": metadata.author,
                "subject": metadata.subject,
                "creator": metadata.creator,
                "producer": metadata.producer,
                "creation_date": iso_or_none(metadata, "creation_date"),
                "modification_date": iso_or_none(metadata, "modification_date"),
                "keywords": metadata.keywords,
            }

        # XMP metadata
        # Note: XMP parsing would require more specific implementation
        # based on the actual XMP structure in the document
        report["xmp_metadata"]["present"] = bool(reader.xmp_metadata)
    except Exception as e:
        # Top-level boundary of the report: keep what was collected and
        # record the failure instead of propagating it.
        report["error"] = str(e)
    return report
# Generate metadata report
report = extract_metadata_report("document.pdf")
print(json.dumps(report, indent=2))
from pypdf import PdfReader, PdfWriter
from pathlib import Path
import csv
from datetime import datetime
def extract_metadata_to_csv(pdf_directory: str, csv_output: str):
    """
    Extract metadata from all PDFs in a directory to CSV.

    Files that cannot be read are reported on stdout and skipped. The CSV is
    written only when at least one PDF was processed successfully; the number
    of processed files is always printed.

    Args:
        pdf_directory: Directory containing PDF files
        csv_output: Output CSV file path
    """
    # Document-information attributes copied verbatim into each row
    info_fields = (
        "title",
        "author",
        "subject",
        "creator",
        "producer",
        "creation_date",
        "modification_date",
        "keywords",
    )
    metadata_records = []
    # Sort the matches: glob order is filesystem-dependent, and a stable row
    # order keeps the CSV reproducible between runs.
    for pdf_path in sorted(Path(pdf_directory).glob("*.pdf")):
        try:
            reader = PdfReader(str(pdf_path))
            metadata = reader.metadata
            record = {"filename": pdf_path.name}
            for field in info_fields:
                # Documents without an info dictionary get empty cells
                record[field] = getattr(metadata, field) if metadata else ""
            record["page_count"] = len(reader.pages)
            record["is_encrypted"] = reader.is_encrypted
            record["pdf_version"] = reader.pdf_header
            metadata_records.append(record)
        except Exception as e:
            # Best-effort batch job: report the file and carry on
            print(f"Error processing {pdf_path.name}: {e}")

    # Write to CSV
    if metadata_records:
        with open(csv_output, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=list(metadata_records[0]))
            writer.writeheader()
            writer.writerows(metadata_records)
        # Only claim success when a file was actually written
        print(f"Metadata extracted to {csv_output}")
    print(f"Processed {len(metadata_records)} PDF files")
# Extract metadata from all PDFs to CSV
extract_metadata_to_csv("pdf_collection/", "pdf_metadata.csv")
from pypdf import PdfReader, PdfWriter
from datetime import datetime
reader = PdfReader("input.pdf")
writer = PdfWriter()
# Copy pages
for page in reader.pages:
writer.add_page(page)
# Set comprehensive metadata with custom fields
# Every date is written as a PDF date string (D:YYYYMMDDHHmmSS). A datetime
# object would be stored through str(), which is not a valid PDF date
# (TODO confirm against the pypdf version in use).
pdf_date_format = "D:%Y%m%d%H%M%S"
pdf_now = datetime.now().strftime(pdf_date_format)
metadata = {
    # Standard fields
    "/Title": "My Document",
    "/Author": "Jane Smith",
    "/Subject": "Important Document",
    "/Creator": "My Application v2.0",
    "/Producer": "pypdf 6.0.0",
    "/Keywords": "important, document, processing",
    "/CreationDate": pdf_now,
    "/ModDate": pdf_now,
    # Custom fields
    "/Department": "Engineering",
    "/ProjectCode": "PROJ-2024-001",
    "/Classification": "Internal",
    "/ReviewDate": datetime(2024, 12, 31).strftime(pdf_date_format),
    "/Version": "1.0",
    "/ApprovedBy": "Manager Name",
}
writer.add_metadata(metadata)
with open("document_with_custom_metadata.pdf", "wb") as output:
    writer.write(output)
Install with Tessl CLI
npx tessl i tessl/pypi-pypdf