Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing
—
Management of embedded file attachments within PDF documents. The PdfAttachment class provides comprehensive access to file attachment metadata, data extraction, and modification capabilities.
Access and enumerate file attachments within PDF documents.
# Document-level attachment methods
def count_attachments(self) -> int:
    """
    Get total number of file attachments in document.
    Returns:
    int: Attachment count. The examples in this guide pass each index in
    range(count) to get_attachment().
    """
def get_attachment(self, index: int) -> PdfAttachment:
    """
    Get attachment by index.
    Parameters:
    - index: int, attachment index (0-based)
    Returns:
    PdfAttachment: Attachment object
    NOTE(review): the behavior for an out-of-range index is not described
    in this guide - confirm against the pypdfium2 API reference.
    """
def new_attachment(self, name: str) -> PdfAttachment:
    """
    Create new file attachment.
    Parameters:
    - name: str, attachment filename
    Returns:
    PdfAttachment: New attachment object. Only the name is given here; the
    file-attachment example in this guide fills it with set_data() and then
    saves the document directly, without any separate "add" call.
    """
def del_attachment(self, index: int):
    """
    Delete attachment by index.
    Parameters:
    - index: int, attachment index to delete
    NOTE(review): presumably the indices of later attachments change after
    a deletion - confirm before deleting inside a forward loop.
    """

Basic attachment operations:
import pypdfium2 as pdfium
pdf = pdfium.PdfDocument("document.pdf")
# Check for attachments
attachment_count = pdf.count_attachments()
print(f"Document has {attachment_count} attachments")
if attachment_count > 0:
# Process each attachment
for i in range(attachment_count):
attachment = pdf.get_attachment(i)
name = attachment.get_name()
print(f"Attachment {i}: {name}")Access attachment metadata and parent document reference.
class PdfAttachment:
    """
    Handle to one file attachment of a PDF document, as returned by the
    document-level get_attachment() and new_attachment() methods.
    """

    @property
    def raw(self) -> FPDF_ATTACHMENT:
        """Raw PDFium attachment handle for low-level operations."""

    @property
    def pdf(self) -> PdfDocument:
        """Parent document containing this attachment."""

Extract and modify attachment file data.
def get_name(self) -> str:
    """
    Get attachment filename.
    Returns:
    str: Original filename of the attached file. The name is stored in the
    PDF; the extraction example in this guide filters it before using it
    as a path on disk.
    """
def get_data(self) -> ctypes.Array:
    """
    Get attachment file data.
    Returns:
    ctypes.Array: Raw file data as ctypes array. The examples in this guide
    use len() on it for the size in bytes and bytes() to obtain a value
    that can be written to a file.
    """
def set_data(self, data):
    """
    Set attachment file data.
    Parameters:
    - data: bytes or ctypes array containing new file data
    NOTE(review): whether previously stored data is removed from the file
    or merely replaced is not described here - confirm against the
    pypdfium2 API reference.
    """

File data operations:
pdf = pdfium.PdfDocument("document.pdf")
for i in range(pdf.count_attachments()):
attachment = pdf.get_attachment(i)
# Get attachment information
filename = attachment.get_name()
file_data = attachment.get_data()
print(f"Attachment: {filename}")
print(f"Size: {len(file_data)} bytes")
# Extract attachment to file
output_path = f"extracted_{filename}"
with open(output_path, "wb") as f:
f.write(bytes(file_data))
print(f"Extracted to: {output_path}")Access and modify attachment metadata including custom properties.
def has_key(self, key: str) -> bool:
    """
    Check if metadata key exists.
    Parameters:
    - key: str, metadata key name (for example "CreationDate" or "ModDate",
      as used by the examples in this guide)
    Returns:
    bool: True if key exists, False otherwise
    """
def get_value_type(self, key: str) -> int:
    """
    Get metadata value type.
    Parameters:
    - key: str, metadata key name
    Returns:
    int: PDFium value type constant
    NOTE(review): presumably one of PDFium's FPDF_OBJECT_* constants -
    confirm against the pypdfium2 API reference.
    """
def get_str_value(self, key: str) -> str:
    """
    Get string metadata value.
    Parameters:
    - key: str, metadata key name
    Returns:
    str: Metadata value as string, empty if key doesn't exist. An empty
    result alone therefore does not tell a missing key from an empty
    value; use has_key() to distinguish the two.
    """
def set_str_value(self, key: str, value: str):
    """
    Set string metadata value.
    Parameters:
    - key: str, metadata key name
    - value: str, metadata value to set
    NOTE(review): the file-attachment example in this guide calls this only
    after set_data(); whether that order is required is not stated here -
    confirm against the pypdfium2 API reference.
    """

Metadata operations:
pdf = pdfium.PdfDocument("document.pdf")
attachment = pdf.get_attachment(0)
# Common metadata keys
metadata_keys = [
"Title", # File title/description
"Author", # File author
"Subject", # File subject
"Keywords", # File keywords
"Creator", # Creating application
"Producer", # PDF producer
"CreationDate", # Creation date
"ModDate" # Modification date
]
print(f"Attachment: {attachment.get_name()}")
print("Metadata:")
for key in metadata_keys:
if attachment.has_key(key):
value = attachment.get_str_value(key)
value_type = attachment.get_value_type(key)
print(f" {key}: {value} (type: {value_type})")
# Set custom metadata
attachment.set_str_value("CustomField", "Custom Value")
attachment.set_str_value("ExtractedBy", "pypdfium2")
# Verify changes
if attachment.has_key("CustomField"):
custom_value = attachment.get_str_value("CustomField")
print(f"Custom field: {custom_value}")Add new file attachments to PDF documents.
def add_file_attachment(pdf, file_path, attachment_name=None):
    """
    Add file as attachment to PDF document.
    Parameters:
    - pdf: document object providing new_attachment(name)
    - file_path: path of the file whose content becomes the attachment data
    - attachment_name: name to store the attachment under; defaults to the
      base name of file_path
    Returns:
    The attachment object returned by pdf.new_attachment()
    Raises:
    OSError if file_path cannot be read; the document is left untouched in
    that case.
    """
    import os
    import time

    def _pdf_date(timestamp):
        # PDF date string in UTC: D:YYYYMMDDHHmmSS followed by "Z".
        return time.strftime("D:%Y%m%d%H%M%SZ", time.gmtime(timestamp))

    # Use filename if no attachment name provided
    if attachment_name is None:
        attachment_name = os.path.basename(file_path)

    # Read the file BEFORE touching the document, so that a missing or
    # unreadable file does not leave an empty attachment behind.
    with open(file_path, "rb") as f:
        file_data = f.read()
        file_stat = os.fstat(f.fileno())

    attachment = pdf.new_attachment(attachment_name)
    attachment.set_data(file_data)

    # Set metadata from the real file timestamps instead of a fixed
    # placeholder date. A creation time is not available on every platform;
    # fall back to the modification time where it is missing.
    created = getattr(file_stat, "st_birthtime", file_stat.st_mtime)
    attachment.set_str_value("Title", attachment_name)
    attachment.set_str_value("CreationDate", _pdf_date(created))
    attachment.set_str_value("ModDate", _pdf_date(file_stat.st_mtime))

    print(f"Added attachment: {attachment_name} ({len(file_data)} bytes)")
    return attachment
# Usage
pdf = pdfium.PdfDocument("document.pdf")
# Add a text file as attachment
add_file_attachment(pdf, "readme.txt", "README")
# Add an image as attachment
add_file_attachment(pdf, "chart.png", "Chart Image")
# Save document with new attachments
pdf.save("document_with_attachments.pdf")Analyze and report on document attachments.
def analyze_attachments(pdf):
    """
    Comprehensive attachment analysis.
    Prints, for every attachment of pdf, its name, size, any non-empty
    metadata values and a basic file-type guess, followed by a summary
    (count, total size, number of files per extension).
    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)
    Returns:
    None; the report is written to standard output.
    """
    import os
    from collections import Counter

    # Leading bytes that identify a file type, checked in this order.
    signatures = (
        (b'\xFF\xD8\xFF', "JPEG image"),
        (b'\x89PNG', "PNG image"),
        (b'%PDF', "PDF document"),
        (b'PK', "ZIP archive or Office document"),
    )
    metadata_keys = ("Title", "Author", "Subject", "CreationDate", "ModDate")

    count = pdf.count_attachments()
    if count == 0:
        print("Document contains no attachments")
        return
    print(f"Document contains {count} attachment(s)")

    total_size = 0
    file_types = Counter()
    for i in range(count):
        attachment = pdf.get_attachment(i)

        # Basic information
        name = attachment.get_name()
        data = attachment.get_data()
        size = len(data)
        total_size += size

        # File extension analysis. splitext() does not treat the leading
        # dot of a dotfile (".gitignore") as an extension, and a bare
        # trailing dot ("notes.") leaves nothing after stripping, so both
        # are counted as 'no_ext'.
        ext = os.path.splitext(name)[1].lstrip('.').lower() or 'no_ext'
        file_types[ext] += 1

        print(f"\nAttachment {i+1}: {name}")
        print(f" Size: {size:,} bytes ({size/1024:.1f} KB)")

        # Analyze metadata: keep only keys that exist and hold a non-empty value
        found = [
            (key, attachment.get_str_value(key))
            for key in metadata_keys
            if attachment.has_key(key)
        ]
        found = [(key, value) for key, value in found if value]
        if found:
            print(" Metadata:")
            for key, value in found:
                print(f" {key}: {value}")
        else:
            print(" No metadata found")

        # File type detection (basic): first matching signature wins
        file_signature = bytes(data[:16])
        for magic, label in signatures:
            if file_signature.startswith(magic):
                print(f" Detected: {label}")
                break

    # Summary
    print("\nSummary:")
    print(f" Total attachments: {count}")
    print(f" Total size: {total_size:,} bytes ({total_size/1024:.1f} KB)")
    print(f" File types: {dict(file_types)}")
# Usage
pdf = pdfium.PdfDocument("document.pdf")
analyze_attachments(pdf)Process multiple attachments efficiently.
def extract_all_attachments(pdf, output_dir):
    """
    Extract all attachments to specified directory.
    The directory is created if needed. Attachment names are reduced to
    letters, digits and the characters "._- " before being used as file
    names, and an existing file is never overwritten: a numeric suffix is
    inserted before the extension instead.
    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)
    - output_dir: directory to write the extracted files into
    Returns:
    tuple: (extracted, failed) counts; (0, 0) when the document has no
    attachments.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)
    count = pdf.count_attachments()
    if count == 0:
        print("No attachments to extract")
        # Same shape as the normal return, so callers can always unpack it
        return 0, 0

    extracted = 0
    failed = 0
    for i in range(count):
        # Best effort: one broken attachment must not stop the others
        try:
            attachment = pdf.get_attachment(i)
            name = attachment.get_name()
            data = attachment.get_data()

            # Sanitize filename. Names made only of dots/spaces (".", "..")
            # are not usable file names, so they get the fallback as well.
            safe_name = "".join(c for c in name if c.isalnum() or c in "._- ")
            if not safe_name.strip(". "):
                safe_name = f"attachment_{i}"
            output_path = os.path.join(output_dir, safe_name)

            # Handle filename conflicts. The split is done on the file name
            # only, so a dot in output_dir can never be taken for the
            # start of the extension.
            stem, suffix = os.path.splitext(safe_name)
            counter = 1
            while os.path.exists(output_path):
                output_path = os.path.join(output_dir, f"{stem}_{counter}{suffix}")
                counter += 1

            # Write file
            with open(output_path, "wb") as f:
                f.write(bytes(data))
            print(f"Extracted: {name} -> {output_path}")
            extracted += 1
        except Exception as e:
            print(f"Failed to extract attachment {i}: {e}")
            failed += 1

    print(f"\nExtraction complete: {extracted} successful, {failed} failed")
    return extracted, failed
# Usage
pdf = pdfium.PdfDocument("document.pdf")
extract_all_attachments(pdf, "extracted_attachments")Handle attachment security and validation.
def validate_attachments(pdf, max_size_mb=10, allowed_extensions=None):
    """
    Validate attachments for security and size constraints.
    Every attachment is checked for size, for its file extension and, for
    the image/PDF types listed below, for a matching file header. All
    problems found are printed.
    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)
    - max_size_mb: largest accepted attachment size in MiB
    - allowed_extensions: iterable of accepted extensions; case and a
      missing leading dot are ignored ("pdf", ".PDF" and ".pdf" are
      equivalent), "" accepts names without extension. Defaults to common
      text, PDF, image and Word extensions.
    Returns:
    bool: True if no issue was found, False otherwise
    """
    if allowed_extensions is None:
        allowed_extensions = {'.txt', '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.doc', '.docx'}

    def _normalize(extension):
        # Lower-case and ensure a leading dot, so that it compares equal to
        # the extension derived from the attachment name below.
        extension = extension.lower()
        if extension and not extension.startswith('.'):
            extension = '.' + extension
        return extension

    allowed = {_normalize(extension) for extension in allowed_extensions}

    # extension -> (label used in the message, accepted leading bytes)
    signatures = {
        '.jpg': ("JPEG", (b'\xFF\xD8\xFF',)),
        '.jpeg': ("JPEG", (b'\xFF\xD8\xFF',)),
        '.png': ("PNG", (b'\x89PNG',)),
        '.pdf': ("PDF", (b'%PDF',)),
        '.gif': ("GIF", (b'GIF87a', b'GIF89a')),
    }

    count = pdf.count_attachments()
    issues = []
    for i in range(count):
        attachment = pdf.get_attachment(i)
        name = attachment.get_name()
        data = attachment.get_data()
        size_mb = len(data) / (1024 * 1024)

        # Size check
        if size_mb > max_size_mb:
            issues.append(f"Attachment {i} '{name}': Size {size_mb:.1f}MB exceeds limit {max_size_mb}MB")

        # Extension check
        ext = '.' + name.rpartition('.')[2].lower() if '.' in name else ''
        if ext not in allowed:
            issues.append(f"Attachment {i} '{name}': Extension '{ext}' not allowed")

        # Basic content validation: the header must match the claimed type
        if ext in signatures:
            label, magics = signatures[ext]
            if not bytes(data[:16]).startswith(magics):
                issues.append(f"Attachment {i} '{name}': {label} header mismatch")

    if issues:
        print("Attachment validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False

    print(f"All {count} attachments passed validation")
    return True
# Usage
pdf = pdfium.PdfDocument("document.pdf")
is_valid = validate_attachments(pdf, max_size_mb=5)


def backup_attachments(pdf, backup_path):
    """
    Create backup of all attachments as ZIP file.
    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)
    - backup_path: path of the ZIP archive to create
    Returns:
    bool: True if an archive was written, False if the document has no
    attachments (no file is created in that case)
    """
    import posixpath
    import zipfile

    count = pdf.count_attachments()
    if count == 0:
        return False

    used_names = set()
    with zipfile.ZipFile(backup_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for i in range(count):
            attachment = pdf.get_attachment(i)
            data = bytes(attachment.get_data())

            # The attachment name comes from inside the PDF. Store only its
            # last path component (either separator style) so that the
            # archive never contains absolute or "../" member names.
            base = posixpath.basename(attachment.get_name().replace('\\', '/'))
            if base in ('', '.', '..'):
                base = f"attachment_{i}"

            # Two attachments may share a name; give later ones a numeric
            # suffix instead of writing duplicate archive members.
            stem, suffix = posixpath.splitext(base)
            arcname = base
            counter = 1
            while arcname in used_names:
                arcname = f"{stem}_{counter}{suffix}"
                counter += 1
            used_names.add(arcname)

            zf.writestr(arcname, data)

    print(f"Backed up {count} attachments to {backup_path}")
    return True
# Usage
pdf = pdfium.PdfDocument("document.pdf")
backup_attachments(pdf, "attachments_backup.zip")Install with Tessl CLI
npx tessl i tessl/pypi-pypdfium2