Read and write PDFs with Python, powered by qpdf
—
Embedded file management including attachment, extraction, and metadata handling for portfolio PDFs and file attachments. These capabilities enable comprehensive file embedding and management within PDF documents.
Individual file attachment specifications with metadata and content management.
class AttachedFileSpec:
    """
    PDF attached file specification for embedded files.

    Represents a single file embedded within a PDF document,
    including its content, metadata, and relationship to the document.
    """

    # NOTE(review): this is a signature-only reference listing; every method
    # body below is just its docstring. Upstream pikepdf appears to return an
    # AttachedFile object from get_file() and to expose size, dates and the
    # checksum on that object rather than on the spec itself. Confirm against
    # the installed pikepdf version before relying on these signatures.

    @staticmethod
    def from_filepath(pdf: Pdf, path: str, *, description: str = '',
                      relationship: str = '/Unspecified') -> AttachedFileSpec:
        """
        Create an attached file specification from a file path.

        Reads the file from disk and creates a complete attachment
        specification with appropriate metadata and content encoding.

        Parameters:
        - pdf (Pdf): PDF document to attach the file to
        - path (str): Path to the file to attach
        - description (str): Human-readable description of the file
        - relationship (str): Relationship to the document
          ('/Source', '/Data', '/Alternative', '/Supplement', '/Unspecified')

        Returns:
        AttachedFileSpec: Attached file specification ready for embedding

        Raises:
        FileNotFoundError: If the specified file doesn't exist
        IOError: If the file cannot be read
        """

    def get_file(self) -> bytes:
        """
        Retrieve the attached file's content as bytes.

        Extracts and decodes the embedded file data from the PDF.

        Returns:
        bytes: Complete file content

        Raises:
        DataDecodingError: If file data cannot be decoded
        """

    def get_all_filenames(self) -> dict[str, str]:
        """
        Get all filename variants for this attachment.

        PDF attachments can have multiple filename variants for
        different platforms and character encodings.

        Returns:
        dict[str, str]: Mapping of filename types to actual filenames
        Keys: 'F', 'UF', 'DOS', 'Mac', 'Unix'
        """

    @property
    def filename(self) -> str:
        """
        Primary filename for the attached file.

        Returns the most appropriate filename, preferring Unicode
        filenames when available.

        Returns:
        str: Filename of the attached file
        """

    @property
    def description(self) -> str:
        """
        Human-readable description of the attached file.

        Returns:
        str: File description or empty string if none provided
        """

    @property
    def relationship(self) -> str:
        """
        Relationship of this file to the PDF document.

        Common values:
        - '/Source': Original source file for the PDF
        - '/Data': Data file related to the PDF content
        - '/Alternative': Alternative representation
        - '/Supplement': Supplementary file
        - '/Unspecified': Relationship not specified

        Returns:
        str: Relationship type as PDF name
        """

    @property
    def size(self) -> int:
        """
        Size of the attached file in bytes.

        Returns:
        int: File size, or -1 if size is unknown
        """

    @property
    def creation_date(self) -> str:
        """
        Creation date of the attached file.

        Returns:
        str: Creation date in PDF date format, or empty if unknown
        """

    @property
    def modification_date(self) -> str:
        """
        Last modification date of the attached file.

        Returns:
        str: Modification date in PDF date format, or empty if unknown
        """

    @property
    def checksum(self) -> str:
        """
        MD5 checksum of the attached file content.

        Used for integrity verification of the embedded file.

        Returns:
        str: Hex-encoded MD5 hash, or empty if not available
        """

Collection interface for managing all attachments in a PDF document.
class Attachments:
    """
    Mapping interface for PDF attachments collection.

    Provides dictionary-like access to all embedded files in a PDF,
    with methods for adding, removing, and iterating attachments.
    Implements MutableMapping[str, AttachedFileSpec] interface.
    """

    # NOTE(review): signature-only reference listing; every method body below
    # is just its docstring, so nothing here is executable behavior.

    def __len__(self) -> int:
        """
        Number of attached files in the PDF.

        Returns:
        int: Count of embedded files
        """

    def __iter__(self) -> Iterator[str]:
        """
        Iterate over attachment names.

        Yields:
        str: Filename/key for each attached file
        """

    def __getitem__(self, key: str) -> AttachedFileSpec:
        """
        Get an attached file by name.

        Parameters:
        - key (str): Attachment filename or key

        Returns:
        AttachedFileSpec: Attached file specification

        Raises:
        KeyError: If attachment with specified key doesn't exist
        """

    def __setitem__(self, key: str, value: AttachedFileSpec) -> None:
        """
        Add or replace an attached file.

        Parameters:
        - key (str): Attachment name/key
        - value (AttachedFileSpec): File specification to attach
        """

    def __delitem__(self, key: str) -> None:
        """
        Remove an attached file.

        Parameters:
        - key (str): Attachment name/key to remove

        Raises:
        KeyError: If attachment doesn't exist
        """

    def __contains__(self, key: str) -> bool:
        """
        Check if an attachment exists.

        Parameters:
        - key (str): Attachment name/key to check

        Returns:
        bool: True if attachment exists
        """

    def keys(self):
        """
        Get all attachment names.

        Returns:
        KeysView: View of all attachment keys
        """

    def values(self):
        """
        Get all attachment specifications.

        Returns:
        ValuesView: View of all AttachedFileSpec objects
        """

    def items(self):
        """
        Get all attachment name-specification pairs.

        Returns:
        ItemsView: View of (key, AttachedFileSpec) pairs
        """

    def clear(self) -> None:
        """Remove all attachments from the PDF."""
import pikepdf
from pathlib import Path
# Open an existing PDF. The context manager closes the file even when
# attaching or saving raises, which the explicit pdf.close() did not guarantee.
with pikepdf.open('document.pdf') as pdf:
    # Access the attachments collection
    attachments = pdf.attachments

    # Attach a single file from disk, skipping it quietly when it is absent
    document_file = Path('source_document.docx')
    if document_file.exists():
        # Create attachment specification
        attachment = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            str(document_file),
            description="Original Word document source",
            relationship='/Source'
        )
        # Add to PDF
        attachments['source_document.docx'] = attachment
        print(f"Attached: {document_file.name}")

    # Attach multiple files; each entry is (filename, description, relationship)
    files_to_attach = [
        ('data.csv', 'Supporting data file', '/Data'),
        ('image.png', 'Illustration used in document', '/Supplement'),
        ('readme.txt', 'Instructions and notes', '/Unspecified')
    ]
    for filename, description, relationship in files_to_attach:
        file_path = Path(filename)
        if file_path.exists():
            attachment = pikepdf.AttachedFileSpec.from_filepath(
                pdf,
                str(file_path),
                description=description,
                relationship=relationship
            )
            attachments[filename] = attachment
            print(f"Attached: {filename} ({description})")

    print(f"Total attachments: {len(attachments)}")

    # Save PDF with attachments (must happen before the PDF is closed)
    pdf.save('document_with_attachments.pdf')
import pikepdf
from pathlib import Path
def extract_all_attachments(pdf_path, output_dir):
    """Extract all attached files from a PDF into ``output_dir``.

    Each attachment's metadata is printed, its content is written to disk under
    a sanitized file name, and its MD5 checksum is verified when one is stored.
    A failure on one attachment is reported and does not stop the others.

    Parameters:
    - pdf_path: Path of the PDF to read
    - output_dir: Directory to write extracted files to (created if missing,
      including parent directories)

    Returns:
    list[str]: Paths of the files written; empty when the PDF has no
    attachments (the previous version returned None in that case).
    """
    import hashlib  # only needed here; imported once rather than per attachment

    extracted_files = []

    # The context manager guarantees the PDF is closed on every exit path.
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments
        if len(attachments) == 0:
            print("No attachments found in PDF")
            return extracted_files

        # Create output directory
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"Found {len(attachments)} attachments:")
        for name, attachment in attachments.items():
            try:
                # Get file info
                filename = attachment.filename or name
                description = attachment.description
                size = attachment.size
                relationship = attachment.relationship
                print(f"\n📎 {filename}")
                print(f" Description: {description}")
                print(f" Size: {size:,} bytes" if size >= 0 else " Size: Unknown")
                print(f" Relationship: {relationship}")
                print(f" Created: {attachment.creation_date}")
                print(f" Modified: {attachment.modification_date}")

                # Extract file content
                file_data = attachment.get_file()

                # Drop path separators and other unsafe characters. A name that
                # ends up empty or made only of dots ('', '.', '..') would point
                # at a directory, so fall back to a fixed name.
                safe_filename = "".join(c for c in filename if c.isalnum() or c in '.-_')
                if not safe_filename.strip('.'):
                    safe_filename = 'attachment'
                output_file = output_path / safe_filename

                # Handle filename conflicts. The stem and suffix are taken once
                # from the base name so retries yield name_1, name_2, ... rather
                # than the compounding name_1_2_3.
                stem, suffix = output_file.stem, output_file.suffix
                counter = 1
                while output_file.exists():
                    output_file = output_path / f"{stem}_{counter}{suffix}"
                    counter += 1

                with open(output_file, 'wb') as f:
                    f.write(file_data)
                extracted_files.append(str(output_file))
                print(f" ✓ Extracted to: {output_file}")

                # Verify checksum if available
                expected_checksum = attachment.checksum
                if expected_checksum:
                    actual_checksum = hashlib.md5(file_data).hexdigest().upper()
                    expected_checksum = expected_checksum.upper()
                    if actual_checksum == expected_checksum:
                        print(f" ✓ Checksum verified: {actual_checksum}")
                    else:
                        print(f" ⚠️ Checksum mismatch: expected {expected_checksum}, got {actual_checksum}")
            except Exception as e:
                # Deliberate best-effort: report and continue with the next attachment.
                print(f" ❌ Error extracting {name}: {e}")

    print(f"\nExtracted {len(extracted_files)} files to {output_dir}")
    return extracted_files
# Extract attachments into ./extracted_files; `extracted` holds the function's return value
extracted = extract_all_attachments('document_with_attachments.pdf', 'extracted_files')
import pikepdf
from datetime import datetime
def update_attachment_metadata(pdf_path):
    """Print the current metadata of every attachment in a PDF.

    Despite its name, this function only reads and prints; it does not modify
    or save the PDF.

    Parameters:
    - pdf_path: Path of the PDF to inspect
    """
    # The context manager closes the PDF even if reading an attachment raises.
    with pikepdf.open(pdf_path) as pdf:
        for name, attachment in pdf.attachments.items():
            print(f"Attachment: {name}")

            # Get all filename variants
            filenames = attachment.get_all_filenames()
            print(f" Filename variants: {filenames}")

            # Display current metadata
            print(f" Current description: '{attachment.description}'")
            print(f" Current relationship: {attachment.relationship}")
            # A negative size means "unknown"; avoid printing it as "-1 bytes".
            size = attachment.size
            print(f" File size: {size:,} bytes" if size >= 0 else " File size: Unknown")
            print(f" Creation date: {attachment.creation_date}")
            print(f" Modification date: {attachment.modification_date}")
            print(f" Checksum: {attachment.checksum}")

            # NOTE(review): the original example states that changing attachment
            # metadata requires recreating the attachment. That is not verified
            # here; confirm against the pikepdf documentation.
def _escape_pdf_text(text):
    """Escape backslashes and parentheses so text is safe inside a PDF literal string."""
    return text.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')


def create_portfolio_pdf(file_list, output_path):
    """Create a PDF with a cover page listing the files and the files attached.

    Parameters:
    - file_list: Iterable of (file_path, description) pairs
    - output_path: Where to save the resulting PDF

    Files that do not exist are reported and skipped, so the cover page lists
    and counts only the files that are actually attached.

    NOTE(review): no /Collection entry is written to the document catalog, so
    viewers may present these as ordinary attachments rather than a portfolio
    view. Confirm whether a true portfolio is required.
    """
    relationship_by_suffix = {
        '.docx': '/Source', '.doc': '/Source', '.odt': '/Source',
        '.csv': '/Data', '.xlsx': '/Data', '.json': '/Data',
        '.png': '/Supplement', '.jpg': '/Supplement',
        '.jpeg': '/Supplement', '.gif': '/Supplement',
    }

    # Resolve the files up front so the cover page matches what gets attached.
    present = []
    for file_path, description in file_list:
        file_path_obj = Path(file_path)
        if file_path_obj.exists():
            present.append((file_path_obj, description))
        else:
            print(f"Skipped missing file: {file_path_obj}")

    # Build the cover-page content stream. Td moves RELATIVE to the start of
    # the current line, so after the initial absolute placement every later
    # move is an offset (the original repeated absolute coordinates, which
    # pushed the text off the page).
    lines = [
        "BT",
        "/F1 24 Tf",
        "100 700 Td",
        "(PDF Portfolio) Tj",
        "/F1 12 Tf",
        "0 -50 Td",  # down to y = 650
        f"(This PDF contains {len(present)} attached files:) Tj",
    ]
    offset = -30  # first entry at y = 620, later entries 20 units apart
    for index, (file_path_obj, _description) in enumerate(present, start=1):
        # File names may contain '(' , ')' or '\', which must be escaped.
        # NOTE(review): non-ASCII names are UTF-8 encoded below and will likely
        # not render correctly with this simple font setup; confirm if needed.
        file_name = _escape_pdf_text(file_path_obj.name)
        lines.append(f"0 {offset} Td")
        lines.append(f"({index}. {file_name}) Tj")
        offset = -20
    lines.append("ET")
    content = "\n".join(lines)

    # The context manager closes the PDF even if attaching or saving fails.
    with pikepdf.new() as pdf:
        # Add a cover page
        page = pdf.add_blank_page()

        # The content stream selects /F1, so the page must define that font.
        page['/Resources'] = pikepdf.Dictionary(
            Font=pikepdf.Dictionary(
                F1=pikepdf.Dictionary(
                    Type=pikepdf.Name('/Font'),
                    Subtype=pikepdf.Name('/Type1'),
                    BaseFont=pikepdf.Name('/Helvetica'),
                )
            )
        )
        page['/Contents'] = pikepdf.Stream(pdf, content.encode())

        # Add files as attachments
        attachments = pdf.attachments
        for file_path_obj, description in present:
            # Determine relationship based on file type
            relationship = relationship_by_suffix.get(
                file_path_obj.suffix.lower(), '/Unspecified'
            )
            attachment = pikepdf.AttachedFileSpec.from_filepath(
                pdf,
                str(file_path_obj),
                description=description,
                relationship=relationship
            )
            attachments[file_path_obj.name] = attachment
            print(f"Added to portfolio: {file_path_obj.name}")

        # Save portfolio
        pdf.save(output_path)

    print(f"Created portfolio PDF: {output_path}")
# Create a portfolio with multiple files
# Each entry is a (file path, description) pair, the shape create_portfolio_pdf unpacks
portfolio_files = [
    ('project_report.pdf', 'Main project report'),
    ('data_analysis.csv', 'Raw data and analysis'),
    ('chart.png', 'Key findings visualization'),
    ('source_code.py', 'Analysis script'),
    ('readme.txt', 'Project documentation')
]
# create_portfolio_pdf(portfolio_files, 'project_portfolio.pdf')
import pikepdf
from pathlib import Path
import hashlib
def analyze_pdf_attachments(pdf_path):
    """Collect statistics and per-file details for a PDF's attachments.

    Parameters:
    - pdf_path: Path of the PDF to analyze

    Returns:
    dict with keys:
    - 'total_attachments': number of attachments in the PDF
    - 'total_size': sum of the extracted sizes of successfully analyzed files
    - 'file_types': count per lower-cased extension ('(no extension)' if none)
    - 'relationships': count per relationship value
    - 'files': one detail dict per successfully analyzed attachment

    An attachment that fails to analyze is reported and left out of every
    tally, so the counts always agree with the 'files' list.
    """
    # The context manager closes the PDF on every exit path, including errors.
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments
        analysis = {
            'total_attachments': len(attachments),
            'total_size': 0,
            'file_types': {},
            'relationships': {},
            'files': []
        }
        if analysis['total_attachments'] == 0:
            print(f"No attachments found in {pdf_path}")
            return analysis

        file_types = analysis['file_types']
        relationships = analysis['relationships']

        for name, attachment in attachments.items():
            try:
                # Basic file info; a negative reported size means "unknown"
                # and is recorded as 0.
                filename = attachment.filename or name
                reported_size = attachment.size
                size = reported_size if reported_size >= 0 else 0

                # Extract file for analysis
                file_data = attachment.get_file()
                actual_size = len(file_data)

                file_extension = Path(filename).suffix.lower()
                relationship = attachment.relationship

                # Calculate checksums
                md5_hash = hashlib.md5(file_data).hexdigest().upper()
                sha256_hash = hashlib.sha256(file_data).hexdigest().upper()
                reported_checksum = attachment.checksum

                file_info = {
                    'name': filename,
                    'attachment_key': name,
                    'description': attachment.description,
                    'size_reported': size,
                    'size_actual': actual_size,
                    'size_match': size == actual_size,
                    'relationship': relationship,
                    'creation_date': attachment.creation_date,
                    'modification_date': attachment.modification_date,
                    'checksum_reported': reported_checksum,
                    'checksum_md5': md5_hash,
                    'checksum_sha256': sha256_hash,
                    # None means "no stored checksum to compare against"
                    'checksum_verified': reported_checksum.upper() == md5_hash if reported_checksum else None,
                    'file_extension': file_extension,
                    'filenames_variants': attachment.get_all_filenames()
                }
            except Exception as e:
                # Deliberate best-effort: report and continue with the next attachment.
                print(f"Error analyzing attachment '{name}': {e}")
                continue

            # Update the tallies only once everything above succeeded, so a
            # failure partway through cannot leave counts without a file entry.
            type_key = file_extension or '(no extension)'
            file_types[type_key] = file_types.get(type_key, 0) + 1
            relationships[relationship] = relationships.get(relationship, 0) + 1
            analysis['files'].append(file_info)
            analysis['total_size'] += actual_size

    return analysis
def print_attachment_report(analysis):
    """Write a human-readable report of an attachment analysis to stdout.

    Parameters:
    - analysis: dict with 'total_attachments', 'total_size', 'file_types',
      'relationships' and 'files' (a list of per-file detail dicts)
    """
    def emit_counts(heading, counts):
        # Print a heading and the sorted label/count pairs; nothing when empty.
        if not counts:
            return
        print(heading)
        for label, total in sorted(counts.items()):
            print(f" {label}: {total} files")

    rule_width = 50

    print("PDF Attachment Analysis Report")
    print("=" * rule_width)
    print(f"Total Attachments: {analysis['total_attachments']}")
    total_bytes = analysis['total_size']
    megabytes = total_bytes / 1024 / 1024
    print(f"Total Size: {total_bytes:,} bytes ({megabytes:.2f} MB)")

    emit_counts("\nFile Types:", analysis['file_types'])
    emit_counts("\nFile Relationships:", analysis['relationships'])

    print("\nDetailed File Information:")
    print("-" * rule_width)

    for entry in analysis['files']:
        print(f"\n📎 {entry['name']}")
        print(f" Key: {entry['attachment_key']}")
        print(f" Description: {entry['description']}")

        # The reported size is shown only when it differs from the actual size.
        size_text = f" Size: {entry['size_actual']:,} bytes"
        if not entry['size_match']:
            size_text += f" (reported: {entry['size_reported']:,})"
        print(size_text)

        print(f" Type: {entry['file_extension']}")
        print(f" Relationship: {entry['relationship']}")
        print(f" Created: {entry['creation_date']}")
        print(f" Modified: {entry['modification_date']}")

        # Show the verification result when a checksum was stored, else the computed MD5.
        reported = entry['checksum_reported']
        if reported:
            if entry['checksum_verified']:
                status = "✓ Verified"
            else:
                status = "❌ Failed"
            print(f" Checksum: {status} ({reported})")
        else:
            print(f" MD5: {entry['checksum_md5']}")

        # Filename variants are listed only when there is more than one.
        variants = entry['filenames_variants']
        if len(variants) > 1:
            print(f" Filename variants: {variants}")
# Analyze attachments
pdf_path = 'document_with_attachments.pdf'
# Run the analysis only when the example PDF is present
if Path(pdf_path).exists():
    analysis = analyze_pdf_attachments(pdf_path)
    print_attachment_report(analysis)
import pikepdf
from pathlib import Path
def add_attachments_to_directory(directory_path, attachment_dir):
    """Add the same set of attachments to all PDFs in a directory.

    Every regular file directly inside ``attachment_dir`` is attached to each
    ``*.pdf`` directly inside ``directory_path``. PDFs that already have
    attachments are skipped. Each modified PDF is saved over its original file.

    Parameters:
    - directory_path: Directory containing the PDFs to modify
    - attachment_dir: Directory containing the files to attach
    """
    directory = Path(directory_path)

    # Get list of files to attach (regular files only)
    attachment_files = [f for f in Path(attachment_dir).glob('*') if f.is_file()]
    if not attachment_files:
        print(f"No files found in {attachment_dir}")
        return

    # Snapshot the PDF list before any file in the directory is rewritten.
    pdf_files = list(directory.glob('*.pdf'))
    results = {'success': [], 'failed': []}

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input=True is required to save back over the
            # file that was opened; the context manager closes the PDF on
            # every path, including the early `continue` and any exception.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments

                # Skip if already has attachments
                if len(attachments) > 0:
                    print(f"Skipping {pdf_file.name} - already has attachments")
                    continue

                # Add each attachment file; one failure does not stop the rest
                attachments_added = 0
                for attach_file in attachment_files:
                    try:
                        attachment = pikepdf.AttachedFileSpec.from_filepath(
                            pdf,
                            str(attach_file),
                            description=f"Standard attachment: {attach_file.name}",
                            relationship='/Supplement'
                        )
                        attachments[attach_file.name] = attachment
                        attachments_added += 1
                    except Exception as e:
                        print(f"Failed to attach {attach_file.name} to {pdf_file.name}: {e}")

                # Save if any attachments were added
                if attachments_added > 0:
                    pdf.save(pdf_file)
                    results['success'].append((pdf_file.name, attachments_added))
                    print(f"Added {attachments_added} attachments to {pdf_file.name}")
        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nBulk attachment complete:")
    print(f" Success: {len(results['success'])} PDFs")
    print(f" Failed: {len(results['failed'])} PDFs")
def remove_all_attachments(directory_path):
    """Remove all attachments from PDFs in a directory.

    Each ``*.pdf`` directly inside ``directory_path`` that has attachments is
    cleared and saved over its original file. PDFs without attachments are
    opened and counted but not rewritten.

    Parameters:
    - directory_path: Directory containing the PDFs to process
    """
    directory = Path(directory_path)
    # Snapshot the PDF list before any file in the directory is rewritten.
    pdf_files = list(directory.glob('*.pdf'))
    results = {'processed': 0, 'attachments_removed': 0, 'failed': []}

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input=True is required to save back over the
            # file that was opened; the context manager closes the PDF even
            # when clearing or saving raises.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments
                attachment_count = len(attachments)
                if attachment_count > 0:
                    # Clear all attachments
                    attachments.clear()
                    pdf.save(pdf_file)
                    results['attachments_removed'] += attachment_count
                    print(f"Removed {attachment_count} attachments from {pdf_file.name}")
                # Counts every PDF handled without error, with or without attachments.
                results['processed'] += 1
        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nAttachment removal complete:")
    print(f" PDFs processed: {results['processed']}")
    print(f" Attachments removed: {results['attachments_removed']}")
    print(f" Failed: {len(results['failed'])} PDFs")
# Example usage (commented out to avoid file operations)
# add_attachments_to_directory('./pdfs', './standard_attachments')
# remove_all_attachments('./pdfs')
Install with Tessl CLI
npx tessl i tessl/pypi-pikepdf