tessl/pypi-pikepdf

Read and write PDFs with Python, powered by qpdf

—

Pending

Overview

Eval results

Files

File Attachments

Name: tessl/pypi-pikepdf
Author: tessl

Manage files embedded in a PDF: attach files from disk, extract them again, and read their metadata (filename, description, relationship to the document, size, dates, checksum). This covers both ordinary file attachments and portfolio-style PDFs that bundle several files.

Capabilities

AttachedFileSpec Class

Individual file attachment specifications with metadata and content management.

class AttachedFileSpec:
    """
    PDF attached file specification for embedded files.

    Represents a single file embedded within a PDF document,
    including its content, metadata, and relationship to the document.

    This is an API summary: every member below is a docstring-only stub,
    not an implementation.

    NOTE(review): several members documented here (``size``,
    ``creation_date``, ``modification_date``, ``checksum`` and the
    ``bytes`` return of ``get_file``) should be checked against the
    upstream pikepdf reference. Upstream appears to expose the payload
    and its size/date/MD5 metadata on a separate ``AttachedFile`` object
    returned by ``get_file()`` -- TODO confirm.
    """

    @staticmethod
    def from_filepath(pdf: Pdf, path: str, *, description: str = '', 
                     relationship: str = '/Unspecified') -> AttachedFileSpec:
        """
        Create an attached file specification from a file path.

        Reads the file from disk and creates a complete attachment
        specification with appropriate metadata and content encoding.
        The result is not part of the document until it is stored in the
        PDF's attachments collection.

        Parameters:
        - pdf (Pdf): PDF document to attach the file to
        - path (str): Path to the file to attach
        - description (str): Human-readable description of the file
          (keyword-only; defaults to an empty string)
        - relationship (str): Relationship to the document, keyword-only
          ('/Source', '/Data', '/Alternative', '/Supplement', '/Unspecified');
          defaults to '/Unspecified'

        Returns:
        AttachedFileSpec: Attached file specification ready for embedding

        Raises:
        FileNotFoundError: If the specified file doesn't exist
        IOError: If the file cannot be read
        """

    def get_file(self) -> bytes:
        """
        Retrieve the attached file's content as bytes.

        Extracts and decodes the embedded file data from the PDF.

        NOTE(review): upstream pikepdf appears to return an
        ``AttachedFile`` object here (content via ``read_bytes()``)
        rather than raw bytes -- confirm before relying on the
        ``bytes`` annotation.

        Returns:
        bytes: Complete file content

        Raises:
        DataDecodingError: If file data cannot be decoded
        """

    def get_all_filenames(self) -> dict[str, str]:
        """
        Get all filename variants for this attachment.

        PDF attachments can have multiple filename variants for
        different platforms and character encodings.

        Returns:
        dict[str, str]: Mapping of filename types to actual filenames
                       Keys: 'F', 'UF', 'DOS', 'Mac', 'Unix'
        """

    @property
    def filename(self) -> str:
        """
        Primary filename for the attached file.

        Returns the most appropriate filename, preferring Unicode
        filenames when available.

        Returns:
        str: Filename of the attached file
        """

    @property
    def description(self) -> str:
        """
        Human-readable description of the attached file.

        Returns:
        str: File description or empty string if none provided
        """

    @property
    def relationship(self) -> str:
        """
        Relationship of this file to the PDF document.

        Common values:
        - '/Source': Original source file for the PDF
        - '/Data': Data file related to the PDF content
        - '/Alternative': Alternative representation
        - '/Supplement': Supplementary file
        - '/Unspecified': Relationship not specified

        Returns:
        str: Relationship type as PDF name
        """

    @property
    def size(self) -> int:
        """
        Size of the attached file in bytes.

        Callers must treat a negative value as "unknown" rather than as
        a real size (for example, do not add it to a running total).

        Returns:
        int: File size, or -1 if size is unknown
        """

    @property
    def creation_date(self) -> str:
        """
        Creation date of the attached file.

        Returns:
        str: Creation date in PDF date format, or empty if unknown
        """

    @property
    def modification_date(self) -> str:
        """
        Last modification date of the attached file.

        Returns:
        str: Modification date in PDF date format, or empty if unknown
        """

    @property
    def checksum(self) -> str:
        """
        MD5 checksum of the attached file content.

        Used for integrity verification of the embedded file. MD5 only
        detects accidental corruption; it is not a security guarantee.

        Returns:
        str: Hex-encoded MD5 hash, or empty if not available
        """

Attachments Class

Collection interface for managing all attachments in a PDF document.

class Attachments:
    """
    Mapping interface for PDF attachments collection.

    Provides dictionary-like access to all embedded files in a PDF,
    with methods for adding, removing, and iterating attachments.
    Keys are attachment names (str); values are AttachedFileSpec objects.

    Implements MutableMapping[str, AttachedFileSpec] interface.

    This is an API summary: every member below is a docstring-only stub,
    not an implementation.
    """

    def __len__(self) -> int:
        """
        Number of attached files in the PDF.

        Returns:
        int: Count of embedded files
        """

    def __iter__(self) -> Iterator[str]:
        """
        Iterate over attachment names.

        Yields:
        str: Filename/key for each attached file
        """

    def __getitem__(self, key: str) -> AttachedFileSpec:
        """
        Get an attached file by name.

        Parameters:
        - key (str): Attachment filename or key

        Returns:
        AttachedFileSpec: Attached file specification

        Raises:
        KeyError: If attachment with specified key doesn't exist
        """

    def __setitem__(self, key: str, value: AttachedFileSpec) -> None:
        """
        Add or replace an attached file.

        An existing attachment stored under the same key is replaced.

        Parameters:
        - key (str): Attachment name/key
        - value (AttachedFileSpec): File specification to attach
        """

    def __delitem__(self, key: str) -> None:
        """
        Remove an attached file.

        Parameters:
        - key (str): Attachment name/key to remove

        Raises:
        KeyError: If attachment doesn't exist
        """

    def __contains__(self, key: str) -> bool:
        """
        Check if an attachment exists.

        Parameters:
        - key (str): Attachment name/key to check

        Returns:
        bool: True if attachment exists
        """

    def keys(self) -> 'KeysView[str]':
        """
        Get all attachment names.

        Returns:
        KeysView: View of all attachment keys
        """

    def values(self) -> 'ValuesView[AttachedFileSpec]':
        """
        Get all attachment specifications.

        Returns:
        ValuesView: View of all AttachedFileSpec objects
        """

    def items(self) -> 'ItemsView[str, AttachedFileSpec]':
        """
        Get all attachment name-specification pairs.

        Returns:
        ItemsView: View of (key, AttachedFileSpec) pairs
        """

    def clear(self) -> None:
        """Remove all attachments from the PDF."""

Usage Examples

Adding File Attachments

import pikepdf
from pathlib import Path

# Open the PDF inside a context manager so the file handle is released
# even if attaching or saving raises part-way through.
with pikepdf.open('document.pdf') as pdf:
    # Access the attachments collection (a mapping of name -> AttachedFileSpec)
    attachments = pdf.attachments

    # Attach a single file from disk
    document_file = Path('source_document.docx')
    if document_file.exists():
        # Create attachment specification
        attachment = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            str(document_file),
            description="Original Word document source",
            relationship='/Source',
        )

        # Storing the spec under a key is what actually embeds it in the PDF
        attachments['source_document.docx'] = attachment
        print(f"Attached: {document_file.name}")

    # Attach multiple files: (filename, description, relationship)
    files_to_attach = [
        ('data.csv', 'Supporting data file', '/Data'),
        ('image.png', 'Illustration used in document', '/Supplement'),
        ('readme.txt', 'Instructions and notes', '/Unspecified'),
    ]

    for filename, description, relationship in files_to_attach:
        file_path = Path(filename)
        # Missing files are skipped rather than treated as an error
        if not file_path.exists():
            continue
        attachments[filename] = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            str(file_path),
            description=description,
            relationship=relationship,
        )
        print(f"Attached: {filename} ({description})")

    print(f"Total attachments: {len(attachments)}")

    # Save under a new name so the input document is left untouched
    pdf.save('document_with_attachments.pdf')

Extracting Attached Files

import pikepdf
from pathlib import Path

def extract_all_attachments(pdf_path, output_dir):
    """Extract all attached files from a PDF.

    Each attachment's metadata is printed, its content is written below
    ``output_dir`` under a sanitised filename, and the stored checksum
    (when present) is compared with the MD5 of the extracted data.
    A failure on one attachment is reported and does not stop the others.

    Parameters:
    - pdf_path: Path of the PDF to read
    - output_dir: Directory to write the extracted files into; created
      (including missing parents) if it does not exist

    Returns:
    list[str]: Paths of the files written. Empty when the PDF has no
    attachments or none could be extracted.
    """
    # Imported locally because this snippet's import block does not provide it
    import hashlib

    extracted_files = []

    # The context manager closes the PDF on every exit path, including errors
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments

        if len(attachments) == 0:
            print("No attachments found in PDF")
            return extracted_files

        # Create output directory (parents too, so nested targets work)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"Found {len(attachments)} attachments:")

        for name, attachment in attachments.items():
            try:
                # Get file info
                filename = attachment.filename or name
                description = attachment.description
                size = attachment.size
                relationship = attachment.relationship

                print(f"\n📎 {filename}")
                print(f"   Description: {description}")
                print(f"   Size: {size:,} bytes" if size >= 0 else "   Size: Unknown")
                print(f"   Relationship: {relationship}")
                print(f"   Created: {attachment.creation_date}")
                print(f"   Modified: {attachment.modification_date}")

                # Extract file content
                file_data = attachment.get_file()

                # Keep only characters that are safe in a filename; this also
                # drops path separators, so the file cannot escape output_dir.
                safe_filename = "".join(c for c in filename if c.isalnum() or c in '.-_')
                # An empty or dots-only name ('', '.', '..') would resolve to a
                # directory instead of a file, so substitute a neutral name.
                if not safe_filename.strip('.'):
                    safe_filename = 'attachment'
                output_file = output_path / safe_filename

                # Handle filename conflicts. Stem and suffix are taken once from
                # the base name so retries give name_1, name_2, ... and not
                # name_1_2_3.
                stem = output_file.stem
                suffix = output_file.suffix
                counter = 1
                while output_file.exists():
                    output_file = output_path / f"{stem}_{counter}{suffix}"
                    counter += 1

                with open(output_file, 'wb') as f:
                    f.write(file_data)

                extracted_files.append(str(output_file))
                print(f"   ✓ Extracted to: {output_file}")

                # Verify checksum if available
                expected_checksum = attachment.checksum
                if expected_checksum:
                    actual_checksum = hashlib.md5(file_data).hexdigest().upper()
                    expected_checksum = expected_checksum.upper()

                    if actual_checksum == expected_checksum:
                        print(f"   ✓ Checksum verified: {actual_checksum}")
                    else:
                        print(f"   ⚠️  Checksum mismatch: expected {expected_checksum}, got {actual_checksum}")

            except Exception as e:
                # Deliberate best-effort: report and continue with the next file
                print(f"   ❌ Error extracting {name}: {e}")

    print(f"\nExtracted {len(extracted_files)} files to {output_dir}")
    return extracted_files

# Extract every attachment of the PDF into ./extracted_files;
# `extracted` receives the function's return value.
extracted = extract_all_attachments('document_with_attachments.pdf', 'extracted_files')

Managing Attachment Metadata

import pikepdf
from datetime import datetime

def update_attachment_metadata(pdf_path):
    """Print the current metadata of every attachment in a PDF.

    Despite its name, this function only reads and displays metadata;
    it changes nothing and does not save the document.

    Parameters:
    - pdf_path: Path of the PDF to inspect
    """
    # The context manager closes the PDF even if reading an attachment raises
    with pikepdf.open(pdf_path) as pdf:
        for name, attachment in pdf.attachments.items():
            print(f"Attachment: {name}")

            # Get all filename variants
            filenames = attachment.get_all_filenames()
            print(f"  Filename variants: {filenames}")

            # Display current metadata
            print(f"  Current description: '{attachment.description}'")
            print(f"  Current relationship: {attachment.relationship}")
            # A negative size means "unknown"; do not print it as a byte count
            size = attachment.size
            print(f"  File size: {size:,} bytes" if size >= 0 else "  File size: Unknown")
            print(f"  Creation date: {attachment.creation_date}")
            print(f"  Modification date: {attachment.modification_date}")
            print(f"  Checksum: {attachment.checksum}")

            # Note: with the read-only properties documented for
            # AttachedFileSpec, changing metadata means building a new
            # specification and storing it under the same key.
            # NOTE(review): whether this is a limit of the PDF format or only
            # of this API is not established here -- confirm upstream.

def _escape_pdf_text(text):
    """Escape the characters that are special inside a PDF literal string."""
    return text.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')


def create_portfolio_pdf(file_list, output_path):
    """Create a PDF portfolio with multiple attached files.

    Builds a new one-page PDF whose cover page lists the attached files,
    embeds each existing file as an attachment, and saves the result.
    Files that do not exist are skipped and are not listed on the cover.

    Parameters:
    - file_list: Iterable of (file_path, description) pairs
    - output_path: Where to save the resulting PDF
    """
    # Imported locally because this snippet's import block does not provide Path
    from pathlib import Path

    # Relationship recorded for each attachment, chosen by file extension
    relationship_by_suffix = {
        '.docx': '/Source', '.doc': '/Source', '.odt': '/Source',
        '.csv': '/Data', '.xlsx': '/Data', '.json': '/Data',
        '.png': '/Supplement', '.jpg': '/Supplement',
        '.jpeg': '/Supplement', '.gif': '/Supplement',
    }

    # Resolve the list once so the cover page and the attachments agree,
    # and so a one-shot iterable is not consumed twice.
    entries = [(Path(file_path), description) for file_path, description in file_list]
    entries = [(path, description) for path, description in entries if path.exists()]

    # The context manager closes the PDF even if attaching or saving fails
    with pikepdf.new() as pdf:
        # Add a cover page
        page = pdf.add_blank_page()

        # Td moves relative to the start of the current line, so only the
        # first Td carries absolute page coordinates; later ones are offsets.
        lines = [
            "BT",
            "/F1 24 Tf",
            "100 700 Td",
            "(PDF Portfolio) Tj",
            "/F1 12 Tf",
            "0 -50 Td",
            f"(This PDF contains {len(entries)} attached files:) Tj",
        ]
        offset = -30  # gap below the summary line; later entries use -20
        for index, (path, _description) in enumerate(entries, start=1):
            lines.append(f"0 {offset} Td")
            # Names may contain '(', ')' or '\', which would otherwise end or
            # corrupt the literal string.
            lines.append(f"({index}. {_escape_pdf_text(path.name)}) Tj")
            offset = -20
        lines.append("ET")
        content = "\n".join(lines)

        # The content stream refers to /F1, so the page must declare that
        # font; Helvetica is a standard font and needs no embedded font file.
        font = pdf.make_indirect(pikepdf.Dictionary(
            Type=pikepdf.Name.Font,
            Subtype=pikepdf.Name.Type1,
            BaseFont=pikepdf.Name.Helvetica,
        ))
        page['/Resources'] = pikepdf.Dictionary(Font=pikepdf.Dictionary(F1=font))
        page['/Contents'] = pikepdf.Stream(pdf, content.encode())

        # Add files as attachments
        attachments = pdf.attachments

        for path, description in entries:
            relationship = relationship_by_suffix.get(path.suffix.lower(), '/Unspecified')

            # Create attachment
            attachments[path.name] = pikepdf.AttachedFileSpec.from_filepath(
                pdf,
                str(path),
                description=description,
                relationship=relationship,
            )
            print(f"Added to portfolio: {path.name}")

        # Save portfolio
        pdf.save(output_path)

    print(f"Created portfolio PDF: {output_path}")

# Files to bundle into the portfolio, as (file_path, description) pairs --
# the shape create_portfolio_pdf expects for its file_list argument.
portfolio_files = [
    ('project_report.pdf', 'Main project report'),
    ('data_analysis.csv', 'Raw data and analysis'),
    ('chart.png', 'Key findings visualization'),
    ('source_code.py', 'Analysis script'),
    ('readme.txt', 'Project documentation')
]

# Left commented out so the example does not touch the filesystem;
# uncomment to build the portfolio:
# create_portfolio_pdf(portfolio_files, 'project_portfolio.pdf')

Attachment Analysis and Reporting

import pikepdf
from pathlib import Path
import hashlib

def analyze_pdf_attachments(pdf_path):
    """Comprehensive analysis of PDF attachments.

    Extracts every attachment, hashes its content, and collects
    per-file details plus totals by file extension and relationship.
    An attachment that fails to analyse is reported and left out of
    ``files`` and ``total_size`` (it still counts in ``total_attachments``).

    Parameters:
    - pdf_path: Path of the PDF to analyse

    Returns:
    dict: Keys 'total_attachments', 'total_size' (bytes actually
    extracted), 'file_types' and 'relationships' (value -> count), and
    'files' (one detail dict per analysed attachment).
    """
    # The context manager closes the PDF on every exit path, including errors
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments

        analysis = {
            'total_attachments': len(attachments),
            'total_size': 0,
            'file_types': {},
            'relationships': {},
            'files': []
        }

        if analysis['total_attachments'] == 0:
            print(f"No attachments found in {pdf_path}")
            return analysis

        file_types = analysis['file_types']
        relationships = analysis['relationships']

        for name, attachment in attachments.items():
            try:
                # Basic file info
                filename = attachment.filename or name
                reported_size = attachment.size
                # A negative size means the PDF does not record one
                size_known = reported_size >= 0
                size = reported_size if size_known else 0

                # Extract file for analysis
                file_data = attachment.get_file()
                actual_size = len(file_data)

                # File type analysis
                file_extension = Path(filename).suffix.lower()
                type_key = file_extension or '(no extension)'
                file_types[type_key] = file_types.get(type_key, 0) + 1

                # Relationship analysis
                relationship = attachment.relationship
                relationships[relationship] = relationships.get(relationship, 0) + 1

                # Calculate checksums
                md5_hash = hashlib.md5(file_data).hexdigest().upper()
                sha256_hash = hashlib.sha256(file_data).hexdigest().upper()

                reported_checksum = attachment.checksum

                # File details
                file_info = {
                    'name': filename,
                    'attachment_key': name,
                    'description': attachment.description,
                    'size_reported': size,
                    'size_actual': actual_size,
                    # An unknown reported size cannot contradict the real one,
                    # so it is not flagged as a mismatch.
                    'size_match': size == actual_size if size_known else True,
                    'relationship': relationship,
                    'creation_date': attachment.creation_date,
                    'modification_date': attachment.modification_date,
                    'checksum_reported': reported_checksum,
                    'checksum_md5': md5_hash,
                    'checksum_sha256': sha256_hash,
                    # None means there was no stored checksum to verify against
                    'checksum_verified': reported_checksum.upper() == md5_hash if reported_checksum else None,
                    'file_extension': file_extension,
                    'filenames_variants': attachment.get_all_filenames()
                }

                analysis['files'].append(file_info)
                analysis['total_size'] += actual_size

            except Exception as e:
                # Deliberate best-effort: report and continue with the next file
                print(f"Error analyzing attachment '{name}': {e}")

    return analysis

def print_attachment_report(analysis):
    """Write a human-readable attachment report to standard output.

    Parameters:
    - analysis (dict): Must provide 'total_attachments', 'total_size',
      'file_types', 'relationships' and 'files'. Each entry of 'files'
      is a dict with the keys read below ('name', 'attachment_key',
      'description', 'size_actual', 'size_match', 'size_reported',
      'file_extension', 'relationship', 'creation_date',
      'modification_date', 'checksum_reported', 'checksum_verified',
      'checksum_md5', 'filenames_variants').
    """
    rule_width = 50

    print("PDF Attachment Analysis Report")
    print("=" * rule_width)

    print(f"Total Attachments: {analysis['total_attachments']}")
    total_bytes = analysis['total_size']
    print(f"Total Size: {total_bytes:,} bytes ({total_bytes / 1024 / 1024:.2f} MB)")

    # Both summary tables share one layout; an empty table is omitted entirely.
    for heading, table_key in (
        ("\nFile Types:", 'file_types'),
        ("\nFile Relationships:", 'relationships'),
    ):
        counts = analysis[table_key]
        if counts:
            print(heading)
            for label, tally in sorted(counts.items()):
                print(f"  {label}: {tally} files")

    print("\nDetailed File Information:")
    print("-" * rule_width)

    for entry in analysis['files']:
        print(f"\n📎 {entry['name']}")
        print(f"   Key: {entry['attachment_key']}")
        print(f"   Description: {entry['description']}")

        # The reported size is appended only when it differs from the real one
        size_line = f"   Size: {entry['size_actual']:,} bytes"
        if not entry['size_match']:
            size_line += f" (reported: {entry['size_reported']:,})"
        print(size_line)

        print(f"   Type: {entry['file_extension']}")
        print(f"   Relationship: {entry['relationship']}")
        print(f"   Created: {entry['creation_date']}")
        print(f"   Modified: {entry['modification_date']}")

        # With a stored checksum show the verification result; otherwise
        # fall back to the MD5 computed from the extracted data.
        if not entry['checksum_reported']:
            print(f"   MD5: {entry['checksum_md5']}")
        else:
            outcome = "✓ Verified" if entry['checksum_verified'] else "❌ Failed"
            print(f"   Checksum: {outcome} ({entry['checksum_reported']})")

        # A single filename variant adds nothing beyond the name shown above
        name_variants = entry['filenames_variants']
        if len(name_variants) > 1:
            print(f"   Filename variants: {name_variants}")

# Analyze the attachments and print the report, but only when the PDF is
# actually present, so the example is a no-op on a clean checkout.
pdf_path = 'document_with_attachments.pdf'
if Path(pdf_path).exists():
    analysis = analyze_pdf_attachments(pdf_path)
    print_attachment_report(analysis)

Bulk Attachment Operations

import pikepdf
from pathlib import Path

def add_attachments_to_directory(directory_path, attachment_dir):
    """Add the same set of attachments to all PDFs in a directory.

    Every regular file directly inside ``attachment_dir`` is attached to
    each ``*.pdf`` directly inside ``directory_path``. PDFs that already
    have attachments are skipped. Modified PDFs are saved in place, and a
    summary is printed at the end.

    Parameters:
    - directory_path: Directory containing the PDFs to modify
    - attachment_dir: Directory containing the files to attach
    """
    directory = Path(directory_path)
    attachment_path = Path(attachment_dir)

    # Get list of files to attach (sub-directories are ignored)
    attachment_files = [f for f in attachment_path.glob('*') if f.is_file()]

    if not attachment_files:
        print(f"No files found in {attachment_dir}")
        return

    # Get list of PDFs to process
    pdf_files = list(directory.glob('*.pdf'))

    results = {'success': [], 'failed': []}

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input is required for the argument-less save()
            # below, which writes back to the file that was opened. The
            # context manager closes the PDF even when something raises.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments

                # Skip if already has attachments
                if len(attachments) > 0:
                    print(f"Skipping {pdf_file.name} - already has attachments")
                    continue

                # Add each attachment file
                attachments_added = 0
                for attach_file in attachment_files:
                    try:
                        attachment = pikepdf.AttachedFileSpec.from_filepath(
                            pdf,
                            str(attach_file),
                            description=f"Standard attachment: {attach_file.name}",
                            relationship='/Supplement'
                        )
                        attachments[attach_file.name] = attachment
                        attachments_added += 1

                    except Exception as e:
                        # Best-effort: one bad file must not block the others
                        print(f"Failed to attach {attach_file.name} to {pdf_file.name}: {e}")

                # Save if any attachments were added
                if attachments_added > 0:
                    pdf.save()
                    results['success'].append((pdf_file.name, attachments_added))
                    print(f"Added {attachments_added} attachments to {pdf_file.name}")

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print(f"\nBulk attachment complete:")
    print(f"  Success: {len(results['success'])} PDFs")
    print(f"  Failed: {len(results['failed'])} PDFs")

def remove_all_attachments(directory_path):
    """Remove all attachments from PDFs in a directory.

    Each ``*.pdf`` directly inside ``directory_path`` that has
    attachments is cleared and saved in place. PDFs without attachments
    are left unmodified. A summary is printed at the end.

    Parameters:
    - directory_path: Directory containing the PDFs to clean
    """
    directory = Path(directory_path)
    pdf_files = list(directory.glob('*.pdf'))

    results = {'processed': 0, 'attachments_removed': 0, 'failed': []}

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input is required for the argument-less save()
            # below, which writes back to the file that was opened. The
            # context manager closes the PDF even when something raises.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments

                attachment_count = len(attachments)

                if attachment_count > 0:
                    # Clear all attachments
                    attachments.clear()
                    pdf.save()

                    results['attachments_removed'] += attachment_count
                    print(f"Removed {attachment_count} attachments from {pdf_file.name}")

                results['processed'] += 1

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print(f"\nAttachment removal complete:")
    print(f"  PDFs processed: {results['processed']}")
    print(f"  Attachments removed: {results['attachments_removed']}")
    print(f"  Failed: {len(results['failed'])} PDFs")

# Example usage (commented out to avoid file operations)
# add_attachments_to_directory('./pdfs', './standard_attachments')
# remove_all_attachments('./pdfs')

Install with Tessl CLI