tessl/pypi-pypdfium2

Python bindings to PDFium for comprehensive PDF manipulation, rendering, and processing

—

Pending

Overview

Eval results

Files

File Attachments

Name: tessl/pypi-pypdfium2
Author: tessl

Manage file attachments embedded in PDF documents. The PdfAttachment class gives access to an attachment's name, file data, and metadata, and supports modifying its data and metadata.

Capabilities

Attachment Access

Access and enumerate file attachments within PDF documents.

# Document-level attachment methods
def count_attachments(self) -> int:
    """
    Get total number of file attachments in document.

    Returns:
    int: Number of attachments; get_attachment() takes 0-based indices,
    so the usage examples iterate range(count_attachments())
    """

def get_attachment(self, index: int) -> PdfAttachment:
    """
    Get attachment by index.

    Parameters:
    - index: int, attachment index (0-based)

    Returns:
    PdfAttachment: Attachment object

    NOTE(review): behavior for an out-of-range index is not shown in this
    summary — presumably an error is raised; confirm against the library
    documentation.
    """

def new_attachment(self, name: str) -> PdfAttachment:
    """
    Create new file attachment.

    Parameters:
    - name: str, attachment filename

    Returns:
    PdfAttachment: New attachment object, initially without file data

    NOTE(review): this summary previously said the returned object is
    "not yet added to document", but the usage example in this document
    saves right after new_attachment()/set_data() with no separate add
    step, and the upstream pypdfium2 documentation describes this call as
    adding the attachment to the document — confirm.
    """

def del_attachment(self, index: int) -> None:
    """
    Delete attachment by index.

    Parameters:
    - index: int, attachment index to delete (presumably 0-based like
      get_attachment — confirm)

    NOTE(review): deleting presumably shifts the indices of the following
    attachments, so re-query count_attachments() before further index-based
    access — confirm against the library documentation.
    """

Basic attachment operations:

import pypdfium2 as pdfium

pdf = pdfium.PdfDocument("document.pdf")

# Report how many embedded files the document carries
attachment_count = pdf.count_attachments()
print(f"Document has {attachment_count} attachments")

# range(0) is empty, so a document without attachments simply skips the loop
for index in range(attachment_count):
    # Look each attachment up by its 0-based index and show its name
    print(f"Attachment {index}: {pdf.get_attachment(index).get_name()}")

Attachment Properties

Access an attachment's raw PDFium handle and its parent document.

class PdfAttachment:
    """
    Helper object for one embedded file attachment of a PDF document.

    Instances are returned by the document-level get_attachment() and
    new_attachment() methods.
    """

    @property
    def raw(self) -> FPDF_ATTACHMENT:
        """Raw PDFium attachment handle for low-level operations."""
    
    @property
    def pdf(self) -> PdfDocument:
        """Parent document containing this attachment."""

File Data Management

Extract and modify attachment file data.

def get_name(self) -> str:
    """
    Get attachment filename.

    Returns:
    str: Original filename of the attached file

    NOTE(review): the name is stored in the PDF and should be treated as
    untrusted input before it is used to build a filesystem path.
    """

def get_data(self) -> ctypes.Array:
    """
    Get attachment file data.

    Returns:
    ctypes.Array: Raw file data as ctypes array; the usage examples take
    len() of it for the size and convert it with bytes(...) before writing
    """

def set_data(self, data) -> None:
    """
    Set attachment file data.

    Parameters:
    - data: bytes or ctypes array containing new file data

    NOTE(review): whether data previously stored in the attachment is
    removed from the file or merely unlinked is not shown here — confirm
    against the library documentation.
    """

File data operations:

import os

pdf = pdfium.PdfDocument("document.pdf")

for i in range(pdf.count_attachments()):
    attachment = pdf.get_attachment(i)

    # Get attachment information
    filename = attachment.get_name()
    file_data = attachment.get_data()

    print(f"Attachment: {filename}")
    print(f"Size: {len(file_data)} bytes")

    # The attachment name comes from the PDF and is untrusted: keep only its
    # final path component (treating "\" like "/") so that separators cannot
    # redirect the write or make open() fail on a missing directory.
    safe_name = os.path.basename(filename.replace("\\", "/")) or f"attachment_{i}"

    # Extract attachment to file
    output_path = f"extracted_{safe_name}"
    with open(output_path, "wb") as f:
        # get_data() returns a ctypes array; convert it to bytes for writing
        f.write(bytes(file_data))

    print(f"Extracted to: {output_path}")

Metadata Management

Access and modify attachment metadata including custom properties.

def has_key(self, key: str) -> bool:
    """
    Check if metadata key exists.

    Parameters:
    - key: str, metadata key name

    Returns:
    bool: True if key exists, False otherwise

    NOTE(review): presumably this looks the key up in the embedded file's
    params dictionary — confirm against the library documentation.
    """

def get_value_type(self, key: str) -> int:
    """
    Get metadata value type.

    Parameters:
    - key: str, metadata key name

    Returns:
    int: PDFium value type constant (presumably one of the FPDF_OBJECT_*
    values — confirm)
    """

def get_str_value(self, key: str) -> str:
    """
    Get string metadata value.

    Parameters:
    - key: str, metadata key name

    Returns:
    str: Metadata value as string, empty if key doesn't exist

    NOTE(review): an empty string is presumably also returned when the
    value exists but is not a string-like type, so use has_key() and
    get_value_type() to tell the cases apart — confirm.
    """

def set_str_value(self, key: str, value: str) -> None:
    """
    Set string metadata value.

    Parameters:
    - key: str, metadata key name
    - value: str, metadata value to set

    NOTE(review): the change presumably only reaches disk once the parent
    document is saved — confirm.
    """

Metadata operations:

pdf = pdfium.PdfDocument("document.pdf")
# Index 0 is used directly, so the document is expected to have an attachment
attachment = pdf.get_attachment(0)

# Keys probed on the attachment, in reporting order
# NOTE(review): several of these look like document-information keys;
# confirm which of them are actually stored for embedded files.
metadata_keys = [
    "Title",
    "Author",
    "Subject",
    "Keywords",
    "Creator",
    "Producer",
    "CreationDate",
    "ModDate",
]

print(f"Attachment: {attachment.get_name()}")
print("Metadata:")

for key in metadata_keys:
    # Keys that are absent are skipped silently
    if not attachment.has_key(key):
        continue
    text = attachment.get_str_value(key)
    kind = attachment.get_value_type(key)
    print(f"  {key}: {text} (type: {kind})")

# Store two custom string entries
for custom_key, custom_text in (
    ("CustomField", "Custom Value"),
    ("ExtractedBy", "pypdfium2"),
):
    attachment.set_str_value(custom_key, custom_text)

# Read one entry back to confirm that it was stored
if attachment.has_key("CustomField"):
    print(f"Custom field: {attachment.get_str_value('CustomField')}")

Creating New Attachments

Add new file attachments to PDF documents.

def add_file_attachment(pdf, file_path, attachment_name=None):
    """
    Add a file from disk to a PDF document as an embedded attachment.

    Parameters:
    - pdf: document object providing new_attachment(name)
    - file_path: path of the file whose bytes are embedded
    - attachment_name: name stored in the PDF; defaults to the base name
      of file_path

    Returns:
    The attachment object returned by pdf.new_attachment(), with its data
    and the Title, CreationDate and ModDate values set

    Raises:
    OSError: if file_path cannot be read; in that case no attachment is
    created
    """
    import os
    import time

    def pdf_date(timestamp):
        # PDF date string in UTC, e.g. "D:20240101120000Z"
        return time.strftime("D:%Y%m%d%H%M%SZ", time.gmtime(timestamp))

    # Use filename if no attachment name provided
    if attachment_name is None:
        attachment_name = os.path.basename(file_path)

    # Read the file before touching the document, so that an unreadable
    # file does not leave an empty attachment behind
    with open(file_path, "rb") as f:
        file_data = f.read()
    file_stat = os.stat(file_path)

    # Create new attachment and store the file content
    attachment = pdf.new_attachment(attachment_name)
    attachment.set_data(file_data)

    # Record the file's real timestamps instead of a fixed placeholder date.
    # A creation time is not available on every platform; fall back to the
    # modification time where it is missing.
    created = getattr(file_stat, "st_birthtime", file_stat.st_mtime)
    attachment.set_str_value("Title", attachment_name)
    attachment.set_str_value("CreationDate", pdf_date(created))
    attachment.set_str_value("ModDate", pdf_date(file_stat.st_mtime))

    print(f"Added attachment: {attachment_name} ({len(file_data)} bytes)")

    return attachment

# Usage
pdf = pdfium.PdfDocument("document.pdf")

# Embed a text file and then an image, each under an explicit attachment name
for source_path, display_name in (
    ("readme.txt", "README"),
    ("chart.png", "Chart Image"),
):
    add_file_attachment(pdf, source_path, display_name)

# Write the document, including the new attachments, to a separate file
pdf.save("document_with_attachments.pdf")

Attachment Analysis

Analyze and report on document attachments.

def analyze_attachments(pdf):
    """
    Print a report about every attachment of a document.

    For each attachment the report lists its name, its size, the non-empty
    values found among a fixed set of metadata keys, and a file type guessed
    from the leading bytes. A summary with the attachment count, the total
    size and a per-extension tally follows. Everything goes to stdout and
    nothing is returned.

    Parameters:
    - pdf: document object providing count_attachments() and get_attachment()
    """
    # Message to print for a file that starts with the given magic bytes;
    # the first matching entry wins
    known_signatures = (
        (b'\xFF\xD8\xFF', "  Detected: JPEG image"),
        (b'\x89PNG', "  Detected: PNG image"),
        (b'%PDF', "  Detected: PDF document"),
        (b'PK', "  Detected: ZIP archive or Office document"),
    )
    reported_keys = ["Title", "Author", "Subject", "CreationDate", "ModDate"]

    count = pdf.count_attachments()
    if count == 0:
        print("Document contains no attachments")
        return

    print(f"Document contains {count} attachment(s)")

    size_sum = 0
    ext_counts = {}

    for index in range(count):
        entry = pdf.get_attachment(index)
        name = entry.get_name()
        payload = entry.get_data()
        nbytes = len(payload)
        size_sum += nbytes

        # Tally by the lower-cased text after the last dot of the name
        if '.' in name:
            suffix = name.rsplit('.', 1)[-1].lower()
        else:
            suffix = 'no_ext'
        ext_counts[suffix] = ext_counts.get(suffix, 0) + 1

        print(f"\nAttachment {index+1}: {name}")
        print(f"  Size: {nbytes:,} bytes ({nbytes/1024:.1f} KB)")

        # Only keys that exist and carry a non-empty value are listed; the
        # heading is printed once, right before the first such value
        heading_shown = False
        for key in reported_keys:
            if not entry.has_key(key):
                continue
            text = entry.get_str_value(key)
            if not text:
                continue
            if not heading_shown:
                print("  Metadata:")
                heading_shown = True
            print(f"    {key}: {text}")

        if not heading_shown:
            print("  No metadata found")

        # Basic type detection on the first 16 bytes; unknown types print nothing
        head = bytes(payload[:16])
        for magic, message in known_signatures:
            if head.startswith(magic):
                print(message)
                break

    print("\nSummary:")
    print(f"  Total attachments: {count}")
    print(f"  Total size: {size_sum:,} bytes ({size_sum/1024:.1f} KB)")
    print(f"  File types: {dict(ext_counts)}")

# Usage: open a document from disk and print its attachment report
analyze_attachments(pdfium.PdfDocument("document.pdf"))

Batch Attachment Processing

Process multiple attachments efficiently.

def extract_all_attachments(pdf, output_dir):
    """
    Extract all attachments to specified directory.

    Attachment names are reduced to letters, digits and the characters
    "._- " before they are used as file names. A name that already exists
    in output_dir gets a numeric suffix before its extension instead of
    overwriting the existing file. A failure on one attachment is reported
    and counted, and extraction continues with the next one.

    Parameters:
    - pdf: document object providing count_attachments() and get_attachment()
    - output_dir: target directory, created if it does not exist

    Returns:
    tuple: (extracted, failed) counts; (0, 0) when there are no attachments
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    count = pdf.count_attachments()
    if count == 0:
        print("No attachments to extract")
        # Same shape as the normal return, so callers can always unpack it
        return 0, 0

    extracted = 0
    failed = 0

    for i in range(count):
        # Best effort per attachment: one bad entry must not stop the batch
        try:
            attachment = pdf.get_attachment(i)
            name = attachment.get_name()
            data = attachment.get_data()

            # Sanitize filename; names made only of dots/spaces (".", "..")
            # are not usable as file names and get the fallback as well
            safe_name = "".join(c for c in name if c.isalnum() or c in "._- ")
            if not safe_name.strip(". "):
                safe_name = f"attachment_{i}"

            output_path = os.path.join(output_dir, safe_name)

            # Handle filename conflicts. The extension is split off the file
            # name only, so a dot inside output_dir cannot be mistaken for it
            stem, ext = os.path.splitext(safe_name)
            counter = 1
            while os.path.exists(output_path):
                output_path = os.path.join(output_dir, f"{stem}_{counter}{ext}")
                counter += 1

            # Write file
            with open(output_path, "wb") as f:
                f.write(bytes(data))

            print(f"Extracted: {name} -> {output_path}")
            extracted += 1

        except Exception as e:
            print(f"Failed to extract attachment {i}: {e}")
            failed += 1

    print(f"\nExtraction complete: {extracted} successful, {failed} failed")
    return extracted, failed

# Usage: write every attachment of the document into "extracted_attachments"
extract_all_attachments(pdfium.PdfDocument("document.pdf"), "extracted_attachments")

Attachment Security

Handle attachment security and validation.

def validate_attachments(pdf, max_size_mb=10, allowed_extensions=None):
    """
    Validate attachments for security and size constraints.

    Each attachment is checked against a size limit, an extension
    whitelist, and, for JPEG, PNG and PDF extensions, the expected leading
    bytes of the file. All findings are printed.

    Parameters:
    - pdf: document object providing count_attachments() and get_attachment()
    - max_size_mb: largest accepted attachment size in MiB
    - allowed_extensions: iterable of accepted extensions, compared
      case-insensitively, with or without the leading dot; an empty string
      admits names without an extension. Defaults to a set of common
      document and image extensions.

    Returns:
    bool: True if no issue was found, False otherwise
    """
    if allowed_extensions is None:
        # '.jpeg' is listed alongside '.jpg' because the header check below
        # treats both as JPEG
        allowed_extensions = {'.txt', '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.doc', '.docx'}
    else:
        if isinstance(allowed_extensions, str):
            # A lone string means one extension, not a collection of characters
            allowed_extensions = [allowed_extensions]
        # The extension taken from each name is lower-cased and dot-prefixed,
        # so bring the caller's entries into the same form
        normalized = set()
        for entry in allowed_extensions:
            entry = entry.lower()
            if entry and not entry.startswith('.'):
                entry = '.' + entry
            normalized.add(entry)
        allowed_extensions = normalized

    count = pdf.count_attachments()
    issues = []

    for i in range(count):
        attachment = pdf.get_attachment(i)
        name = attachment.get_name()
        data = attachment.get_data()
        size_mb = len(data) / (1024 * 1024)

        # Size check
        if size_mb > max_size_mb:
            issues.append(f"Attachment {i} '{name}': Size {size_mb:.1f}MB exceeds limit {max_size_mb}MB")

        # Extension check
        ext = '.' + name.split('.')[-1].lower() if '.' in name else ''
        if ext not in allowed_extensions:
            issues.append(f"Attachment {i} '{name}': Extension '{ext}' not allowed")

        # Basic content validation: the leading bytes must match the
        # type that the extension claims
        file_data = bytes(data[:16])
        if ext in ('.jpg', '.jpeg') and not file_data.startswith(b'\xFF\xD8\xFF'):
            issues.append(f"Attachment {i} '{name}': JPEG header mismatch")
        elif ext == '.png' and not file_data.startswith(b'\x89PNG'):
            issues.append(f"Attachment {i} '{name}': PNG header mismatch")
        elif ext == '.pdf' and not file_data.startswith(b'%PDF'):
            issues.append(f"Attachment {i} '{name}': PDF header mismatch")

    if issues:
        print("Attachment validation issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False

    print(f"All {count} attachments passed validation")
    return True

# Usage: validate with a 5 MB size limit and the default extension whitelist
pdf = pdfium.PdfDocument("document.pdf")
is_valid = validate_attachments(pdf, 5)

Common Attachment Operations

Attachment Backup

def backup_attachments(pdf, backup_path):
    """
    Create backup of all attachments as ZIP file.

    Only names and file contents are stored. Attachments that share a name
    get a numeric suffix before the extension, so that every attachment
    ends up as a distinct archive entry.

    Parameters:
    - pdf: document object providing count_attachments() and get_attachment()
    - backup_path: path of the ZIP file to write

    Returns:
    bool: False if the document has no attachments (no file is written),
    True once the archive has been written
    """
    import os
    import zipfile

    count = pdf.count_attachments()
    if count == 0:
        return False

    used_names = set()

    with zipfile.ZipFile(backup_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for i in range(count):
            attachment = pdf.get_attachment(i)
            name = attachment.get_name()
            data = bytes(attachment.get_data())

            # A ZIP archive may hold duplicate names, but most tools would
            # then extract only one of them, so make each entry name unique
            entry_name = name
            stem, ext = os.path.splitext(name)
            suffix = 1
            while entry_name in used_names:
                entry_name = f"{stem}_{suffix}{ext}"
                suffix += 1
            used_names.add(entry_name)

            zf.writestr(entry_name, data)

    print(f"Backed up {count} attachments to {backup_path}")
    return True

# Usage: store all attachments of the document in one ZIP archive
backup_attachments(pdfium.PdfDocument("document.pdf"), "attachments_backup.zip")

Install with Tessl CLI