Tessl Tile for pypi/pikepdf@9.10.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

advanced.md attachments.md content-streams.md core-operations.md encryption.md forms.md images.md index.md metadata.md objects.md outlines.md pages.md

attachments.mddocs/

0
# File Attachments
1

2
Embedded file management including attachment, extraction, and metadata handling for portfolio PDFs and file attachments. These capabilities enable comprehensive file embedding and management within PDF documents.
3

4
## Capabilities
5

6
### AttachedFileSpec Class
7

8
Individual file attachment specifications with metadata and content management.
9

10
```python { .api }
11
class AttachedFileSpec:
    """
    PDF file specification for an embedded (attached) file.

    A file specification holds the name, description and relationship of an
    attachment and refers to the embedded file stream that carries the data.
    Size, dates, MIME type and checksum belong to the ``AttachedFile``
    returned by ``get_file()``, not to the specification itself.
    """

    @staticmethod
    def from_filepath(pdf: "Pdf", path: "str | Path", *, description: str = '',
                      relationship: "Name | None" = ...) -> "AttachedFileSpec":
        """
        Create an attached file specification from a file path.

        Reads the file from disk and records file-system creation and
        modification dates plus a MIME type guessed from the extension.

        Parameters:
        - pdf (Pdf): PDF document to attach the file to
        - path (str | Path): Path to the file to attach
        - description (str): Human-readable description of the file
        - relationship (Name | None): Relationship to the document, e.g.
                            Name.Source, Name.Data, Name.Alternative,
                            Name.Supplement; defaults to Name.Unspecified

        Returns:
        AttachedFileSpec: Specification ready to be stored in pdf.attachments

        Raises:
        FileNotFoundError: If the specified file doesn't exist
        OSError: If the file cannot be read
        """

    def get_file(self, name: "Name | None" = None) -> "AttachedFile":
        """
        Return the embedded file object for this specification.

        Parameters:
        - name (Name | None): Which filename entry to fetch; None selects the
                              primary (usually the only) embedded file

        Returns:
        AttachedFile: Embedded file; call read_bytes() on it for the content
        """

    def get_all_filenames(self) -> dict:
        """
        Get all filename variants for this attachment.

        Multiple filenames are a holdover from pre-Unicode PDFs; modern files
        normally carry a single Unicode name.

        Returns:
        dict: Plain Python dict mapping the file-specification entry
              (e.g. /UF, /F, /Unix, /DOS, /Mac) to the filename stored there
        """

    @property
    def filename(self) -> str:
        """
        Primary filename for the attached file.

        Prefers the Unicode filename when several variants are present.

        Returns:
        str: Filename of the attached file
        """

    @property
    def description(self) -> str:
        """
        Human-readable description of the attached file.

        Returns:
        str: File description or empty string if none provided
        """

    @property
    def relationship(self) -> "Name | None":
        """
        Relationship of this file to the PDF document.

        Common values:
        - Name.Source: Original source file for the PDF
        - Name.Data: Data file related to the PDF content
        - Name.Alternative: Alternative representation
        - Name.Supplement: Supplementary file
        - Name.Unspecified: Relationship not specified

        Returns:
        Name | None: Relationship as a PDF name, or None if not recorded
        """


class AttachedFile:
    """
    Embedded file stream referenced by an AttachedFileSpec.

    Instances are obtained from AttachedFileSpec.get_file(); they are not
    normally constructed directly.
    """

    def read_bytes(self) -> bytes:
        """
        Read and decode the embedded file.

        Returns:
        bytes: Complete file content
        """

    @property
    def size(self) -> int:
        """
        Size of the embedded file in bytes, as recorded in the PDF.

        Returns:
        int: Recorded file size
        """

    @property
    def mime_type(self) -> str:
        """
        MIME type recorded for the embedded file.

        Returns:
        str: MIME type, or empty string if none recorded
        """

    @property
    def creation_date(self) -> "datetime":
        """
        Creation date recorded for the embedded file.

        Returns:
        datetime: Recorded creation date
        """

    @property
    def mod_date(self) -> "datetime":
        """
        Last modification date recorded for the embedded file.

        Returns:
        datetime: Recorded modification date
        """

    @property
    def md5(self) -> bytes:
        """
        MD5 checksum recorded for the embedded file content.

        Returns:
        bytes: Raw (binary) MD5 digest; compare with
               hashlib.md5(data).digest() or call .hex() for display
        """
142
```
143

144
### Attachments Class
145

146
Collection interface for managing all attachments in a PDF document.
147

148
```python { .api }
149
class Attachments:
    """
    Mapping interface for PDF attachments collection.

    Provides dictionary-like access to all embedded files in a PDF,
    with methods for adding, removing, and iterating attachments.
    Obtained from ``pdf.attachments``.

    Implements MutableMapping[str, AttachedFileSpec] interface.
    """

    def __len__(self) -> int:
        """
        Number of attached files in the PDF.

        Returns:
        int: Count of embedded files
        """

    def __iter__(self) -> "Iterator[str]":
        """
        Iterate over attachment names.

        Yields:
        str: Key for each attached file
        """

    def __getitem__(self, key: str) -> "AttachedFileSpec":
        """
        Get an attached file by name.

        Parameters:
        - key (str): Attachment key

        Returns:
        AttachedFileSpec: Attached file specification

        Raises:
        KeyError: If attachment with specified key doesn't exist
        """

    def __setitem__(self, key: str, value: "AttachedFileSpec") -> None:
        """
        Add or replace an attached file.

        Parameters:
        - key (str): Attachment key
        - value (AttachedFileSpec): File specification to attach
        """

    def __delitem__(self, key: str) -> None:
        """
        Remove an attached file.

        Parameters:
        - key (str): Attachment key to remove

        Raises:
        KeyError: If attachment doesn't exist
        """

    def __contains__(self, key: str) -> bool:
        """
        Check if an attachment exists.

        Parameters:
        - key (str): Attachment key to check

        Returns:
        bool: True if attachment exists
        """

    def keys(self) -> "KeysView[str]":
        """
        Get all attachment names.

        Returns:
        KeysView: View of all attachment keys
        """

    def values(self) -> "ValuesView[AttachedFileSpec]":
        """
        Get all attachment specifications.

        Returns:
        ValuesView: View of all AttachedFileSpec objects
        """

    def items(self) -> "ItemsView[str, AttachedFileSpec]":
        """
        Get all attachment name-specification pairs.

        Returns:
        ItemsView: View of (key, AttachedFileSpec) pairs
        """

    def clear(self) -> None:
        """Remove all attachments from the PDF."""
246
```
247

248
## Usage Examples
249

250
### Adding File Attachments
251

252
```python
253
import pikepdf
254
from pathlib import Path
255

256
# Open the PDF in a context manager so it is closed even if attaching or
# saving raises.
with pikepdf.open('document.pdf') as pdf:
    # Access the attachments collection
    attachments = pdf.attachments

    # Attach a file from disk
    document_file = Path('source_document.docx')
    if document_file.exists():
        # Relationships are PDF names, so pass pikepdf.Name values rather
        # than plain strings.
        attachments['source_document.docx'] = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            document_file,
            description="Original Word document source",
            relationship=pikepdf.Name.Source,
        )
        print(f"Attached: {document_file.name}")

    # Attach multiple files
    files_to_attach = [
        ('data.csv', 'Supporting data file', pikepdf.Name.Data),
        ('image.png', 'Illustration used in document', pikepdf.Name.Supplement),
        ('readme.txt', 'Instructions and notes', pikepdf.Name.Unspecified),
    ]

    for filename, description, relationship in files_to_attach:
        file_path = Path(filename)
        if not file_path.exists():
            continue
        attachments[filename] = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            file_path,
            description=description,
            relationship=relationship,
        )
        print(f"Attached: {filename} ({description})")

    print(f"Total attachments: {len(attachments)}")

    # Save PDF with attachments
    pdf.save('document_with_attachments.pdf')
301
```
302

303
### Extracting Attached Files
304

305
```python
306
import pikepdf
307
from pathlib import Path
308

309
def extract_all_attachments(pdf_path, output_dir):
    """
    Extract all attached files from a PDF.

    Parameters:
    - pdf_path: Path of the PDF to read
    - output_dir: Directory to write the attachments to (created if missing)

    Returns:
    list[str]: Paths of the files written; empty if the PDF has no
               attachments or none could be extracted
    """
    import hashlib

    extracted_files = []

    # The context manager closes the PDF even if extraction raises.
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments

        if len(attachments) == 0:
            print("No attachments found in PDF")
            return extracted_files

        # Create output directory
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"Found {len(attachments)} attachments:")

        for name, spec in attachments.items():
            # Best effort: one broken attachment must not stop the others.
            try:
                filename = spec.filename or name

                # get_file() returns an AttachedFile; the bytes, dates and
                # checksum live on that object, not on the specification.
                embedded = spec.get_file()
                file_data = embedded.read_bytes()

                print(f"\n📎 {filename}")
                print(f"   Description: {spec.description}")
                print(f"   Size: {len(file_data):,} bytes")
                print(f"   Relationship: {spec.relationship}")
                print(f"   Created: {embedded.creation_date}")
                print(f"   Modified: {embedded.mod_date}")

                output_file = _unique_output_path(output_path, filename)
                output_file.write_bytes(file_data)

                extracted_files.append(str(output_file))
                print(f"   ✓ Extracted to: {output_file}")

                # Verify checksum if available (stored as a raw MD5 digest)
                expected = embedded.md5
                if expected:
                    actual = hashlib.md5(file_data).digest()
                    if actual == expected:
                        print(f"   ✓ Checksum verified: {actual.hex().upper()}")
                    else:
                        print(
                            "   ⚠️  Checksum mismatch: expected "
                            f"{expected.hex().upper()}, got {actual.hex().upper()}"
                        )

            except Exception as e:
                print(f"   ❌ Error extracting {name}: {e}")

    print(f"\nExtracted {len(extracted_files)} files to {output_dir}")
    return extracted_files


def _unique_output_path(output_dir, filename):
    """Return a not-yet-existing path in *output_dir* for a sanitized *filename*."""
    # Keep a conservative character set so an attachment name cannot contain
    # path separators and escape output_dir.
    safe_name = "".join(c for c in filename if c.isalnum() or c in '.-_')
    if not safe_name.strip('.'):
        # Nothing usable survived (empty, "." or ".."); fall back to a fixed name.
        safe_name = 'attachment'

    candidate = output_dir / safe_name
    # Take stem and suffix once so repeated conflicts give name_1, name_2, ...
    # instead of compounding into name_1_2.
    stem, suffix = candidate.stem, candidate.suffix
    counter = 1
    while candidate.exists():
        candidate = output_dir / f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate
382

383
# Extract attachments
384
extracted = extract_all_attachments('document_with_attachments.pdf', 'extracted_files')
385
```
386

387
### Managing Attachment Metadata
388

389
```python
390
import pikepdf
391
from datetime import datetime
392

393
def update_attachment_metadata(pdf_path):
    """
    Print the metadata currently stored for every attachment in a PDF.

    Despite the name, this function only reports metadata; it does not
    modify or save the document.

    Parameters:
    - pdf_path: Path of the PDF to inspect
    """
    # The context manager closes the PDF even if reading an attachment fails.
    with pikepdf.open(pdf_path) as pdf:
        for name, spec in pdf.attachments.items():
            print(f"Attachment: {name}")

            # Get all filename variants
            filenames = spec.get_all_filenames()
            print(f"  Filename variants: {filenames}")

            # Description and relationship live on the specification; size,
            # dates and checksum live on the embedded file it refers to.
            embedded = spec.get_file()

            # Display current metadata
            print(f"  Current description: '{spec.description}'")
            print(f"  Current relationship: {spec.relationship}")
            print(f"  File size: {embedded.size:,} bytes")
            print(f"  Creation date: {embedded.creation_date}")
            print(f"  Modification date: {embedded.mod_date}")
            print(f"  Checksum: {embedded.md5.hex()}")

            # NOTE(review): to change metadata, the simplest portable route is
            # to build a new AttachedFileSpec and assign it under the same
            # key; confirm against the pikepdf docs which properties can be
            # set in place.
418

419
def create_portfolio_pdf(file_list, output_path):
    """
    Create a PDF with a cover page and multiple attached files.

    Parameters:
    - file_list: Iterable of (file_path, description) pairs to attach;
                 paths that do not exist are skipped
    - output_path: Where to save the resulting PDF
    """
    # This snippet's imports do not include Path, so import it locally.
    from pathlib import Path

    def pdf_text(text):
        """Escape *text* for use inside a PDF literal string ``( ... )``."""
        return text.replace('\\', r'\\').replace('(', r'\(').replace(')', r'\)')

    # Relationship by file type; anything else is attached as Unspecified.
    relationship_by_suffix = {
        **dict.fromkeys(('.docx', '.doc', '.odt'), pikepdf.Name.Source),
        **dict.fromkeys(('.csv', '.xlsx', '.json'), pikepdf.Name.Data),
        **dict.fromkeys(('.png', '.jpg', '.jpeg', '.gif'), pikepdf.Name.Supplement),
    }

    # Resolve the files up front so the cover page lists exactly what is attached.
    present = []
    for file_path, description in file_list:
        path = Path(file_path)
        if path.exists():
            present.append((path, description))
        else:
            print(f"Skipping missing file: {path}")

    # Td moves relative to the start of the current line, so every move
    # after the first is an offset, not an absolute page position.
    lines = [
        "BT",
        "/F1 24 Tf",
        "100 700 Td",
        "(PDF Portfolio) Tj",
        "/F1 12 Tf",
        "0 -50 Td",
        f"(This PDF contains {len(present)} attached files:) Tj",
    ]
    for index, (path, _description) in enumerate(present, start=1):
        lines.append("0 -30 Td" if index == 1 else "0 -20 Td")
        lines.append(f"({index}. {pdf_text(path.name)}) Tj")
    lines.append("ET")

    # The context manager closes the PDF even if attaching or saving raises.
    with pikepdf.new() as pdf:
        # Add a cover page
        page = pdf.add_blank_page()

        # The content stream refers to /F1, so declare that font in the
        # page resources; without it viewers cannot render the text.
        page['/Resources'] = pikepdf.Dictionary(
            Font=pikepdf.Dictionary(
                F1=pikepdf.Dictionary(
                    Type=pikepdf.Name.Font,
                    Subtype=pikepdf.Name.Type1,
                    BaseFont=pikepdf.Name.Helvetica,
                )
            )
        )
        page['/Contents'] = pikepdf.Stream(pdf, "\n".join(lines).encode())

        # Add files as attachments
        attachments = pdf.attachments
        for path, description in present:
            attachments[path.name] = pikepdf.AttachedFileSpec.from_filepath(
                pdf,
                path,
                description=description,
                relationship=relationship_by_suffix.get(
                    path.suffix.lower(), pikepdf.Name.Unspecified
                ),
            )
            print(f"Added to portfolio: {path.name}")

        # Save portfolio
        pdf.save(output_path)

    print(f"Created portfolio PDF: {output_path}")
488

489
# Create a portfolio with multiple files: (file path, description) pairs
portfolio_files = [
    ('project_report.pdf', 'Main project report'),
    ('data_analysis.csv', 'Raw data and analysis'),
    ('chart.png', 'Key findings visualization'),
    ('source_code.py', 'Analysis script'),
    ('readme.txt', 'Project documentation'),
]

# Left commented out so the example does not touch the file system:
# create_portfolio_pdf(portfolio_files, 'project_portfolio.pdf')
499
```
500

501
### Attachment Analysis and Reporting
502

503
```python
504
import pikepdf
505
from pathlib import Path
506
import hashlib
507

508
def analyze_pdf_attachments(pdf_path):
    """
    Comprehensive analysis of PDF attachments.

    Parameters:
    - pdf_path: Path of the PDF to analyze

    Returns:
    dict: Keys 'total_attachments', 'total_size' (bytes actually read),
          'file_types' and 'relationships' (label -> count) and 'files'
          (one detail dict per attachment that could be analyzed)
    """
    analysis = {
        'total_attachments': 0,
        'total_size': 0,
        'file_types': {},
        'relationships': {},
        'files': []
    }

    # The context manager closes the PDF even if analysis raises.
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments
        analysis['total_attachments'] = len(attachments)

        if analysis['total_attachments'] == 0:
            print(f"No attachments found in {pdf_path}")
            return analysis

        for name, spec in attachments.items():
            # Best effort: one broken attachment must not stop the others.
            try:
                filename = spec.filename or name

                # Size, dates and checksum live on the embedded file object,
                # not on the specification.
                embedded = spec.get_file()
                file_data = embedded.read_bytes()
                actual_size = len(file_data)
                reported_size = embedded.size if embedded.size >= 0 else 0

                file_extension = Path(filename).suffix.lower()
                type_label = file_extension or '(no extension)'

                # Stringify the PDF name so report keys sort and print uniformly.
                relationship = str(spec.relationship)

                # Calculate checksums; the stored MD5 is a raw digest.
                md5_hash = hashlib.md5(file_data).hexdigest().upper()
                sha256_hash = hashlib.sha256(file_data).hexdigest().upper()
                reported_md5 = embedded.md5.hex().upper() if embedded.md5 else ''

                file_info = {
                    'name': filename,
                    'attachment_key': name,
                    'description': spec.description,
                    'size_reported': reported_size,
                    'size_actual': actual_size,
                    'size_match': reported_size == actual_size,
                    'relationship': relationship,
                    'creation_date': embedded.creation_date,
                    'modification_date': embedded.mod_date,
                    'checksum_reported': reported_md5,
                    'checksum_md5': md5_hash,
                    'checksum_sha256': sha256_hash,
                    'checksum_verified': reported_md5 == md5_hash if reported_md5 else None,
                    'file_extension': file_extension,
                    'filenames_variants': spec.get_all_filenames()
                }
            except Exception as e:
                print(f"Error analyzing attachment '{name}': {e}")
                continue

            # Update the aggregates only once the attachment was fully read,
            # so a failure cannot leave the counters and 'files' out of step.
            types = analysis['file_types']
            types[type_label] = types.get(type_label, 0) + 1
            relationships = analysis['relationships']
            relationships[relationship] = relationships.get(relationship, 0) + 1
            analysis['files'].append(file_info)
            analysis['total_size'] += actual_size

    return analysis
579

580
def print_attachment_report(analysis):
    """
    Print formatted attachment analysis report.

    Parameters:
    - analysis (dict): Mapping with the keys 'total_attachments',
      'total_size', 'file_types', 'relationships' and 'files', where each
      entry of 'files' is a per-attachment detail dict
    """
    print("PDF Attachment Analysis Report")
    print("=" * 50)

    total_size = analysis['total_size']
    print(f"Total Attachments: {analysis['total_attachments']}")
    print(f"Total Size: {total_size:,} bytes ({total_size / 1024 / 1024:.2f} MB)")

    # Both summary sections share one layout: heading, then sorted counts.
    for heading, key in (("File Types", 'file_types'),
                         ("File Relationships", 'relationships')):
        counts = analysis[key]
        if counts:
            print(f"\n{heading}:")
            for label, count in sorted(counts.items()):
                print(f"  {label}: {count} files")

    print("\nDetailed File Information:")
    print("-" * 50)

    for file_info in analysis['files']:
        print(f"\n📎 {file_info['name']}")
        print(f"   Key: {file_info['attachment_key']}")
        print(f"   Description: {file_info['description']}")

        # Mention the recorded size only when it disagrees with the data read.
        size_line = f"   Size: {file_info['size_actual']:,} bytes"
        if not file_info['size_match']:
            size_line += f" (reported: {file_info['size_reported']:,})"
        print(size_line)

        # Show a placeholder rather than a blank for extension-less files.
        print(f"   Type: {file_info['file_extension'] or '(no extension)'}")
        print(f"   Relationship: {file_info['relationship']}")
        print(f"   Created: {file_info['creation_date']}")
        print(f"   Modified: {file_info['modification_date']}")

        # Checksum verification
        if file_info['checksum_reported']:
            status = "✓ Verified" if file_info['checksum_verified'] else "❌ Failed"
            print(f"   Checksum: {status} ({file_info['checksum_reported']})")
        else:
            print(f"   MD5: {file_info['checksum_md5']}")

        # Filename variants
        variants = file_info['filenames_variants']
        if len(variants) > 1:
            print(f"   Filename variants: {variants}")
629

630
# Analyze attachments (only when the example PDF is actually present)
pdf_path = 'document_with_attachments.pdf'
if Path(pdf_path).exists():
    analysis = analyze_pdf_attachments(pdf_path)
    print_attachment_report(analysis)
635
```
636

637
### Bulk Attachment Operations
638

639
```python
640
import pikepdf
641
from pathlib import Path
642

643
def add_attachments_to_directory(directory_path, attachment_dir):
    """
    Add the same set of attachments to all PDFs in a directory.

    PDFs that already contain attachments are skipped. Modified PDFs are
    saved in place, overwriting the original file.

    Parameters:
    - directory_path: Directory containing the ``*.pdf`` files to process
    - attachment_dir: Directory whose regular files are attached to each PDF

    Returns:
    dict: {'success': [(pdf name, attachments added), ...],
           'failed': [(pdf name, error message), ...]}
    """
    results = {'success': [], 'failed': []}

    # Get list of files to attach
    attachment_files = [f for f in Path(attachment_dir).glob('*') if f.is_file()]
    if not attachment_files:
        print(f"No files found in {attachment_dir}")
        return results

    for pdf_file in Path(directory_path).glob('*.pdf'):
        try:
            # allow_overwriting_input=True is required for the argument-less
            # pdf.save() below to write back to the original file.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments

                # Skip if already has attachments
                if len(attachments) > 0:
                    print(f"Skipping {pdf_file.name} - already has attachments")
                    continue

                # Add each attachment file; one unreadable file must not
                # prevent the remaining ones from being attached.
                attachments_added = 0
                for attach_file in attachment_files:
                    try:
                        attachments[attach_file.name] = pikepdf.AttachedFileSpec.from_filepath(
                            pdf,
                            attach_file,
                            description=f"Standard attachment: {attach_file.name}",
                            relationship=pikepdf.Name.Supplement,
                        )
                        attachments_added += 1
                    except Exception as e:
                        print(f"Failed to attach {attach_file.name} to {pdf_file.name}: {e}")

                # Save if any attachments were added
                if attachments_added > 0:
                    pdf.save()
                    results['success'].append((pdf_file.name, attachments_added))
                    print(f"Added {attachments_added} attachments to {pdf_file.name}")

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nBulk attachment complete:")
    print(f"  Success: {len(results['success'])} PDFs")
    print(f"  Failed: {len(results['failed'])} PDFs")
    return results
704

705
def remove_all_attachments(directory_path):
    """
    Remove all attachments from PDFs in a directory.

    Modified PDFs are saved in place, overwriting the original file.

    Parameters:
    - directory_path: Directory containing the ``*.pdf`` files to process

    Returns:
    dict: {'processed': number of PDFs opened successfully,
           'attachments_removed': total attachments removed,
           'failed': [(pdf name, error message), ...]}
    """
    results = {'processed': 0, 'attachments_removed': 0, 'failed': []}

    for pdf_file in Path(directory_path).glob('*.pdf'):
        try:
            # allow_overwriting_input=True is required for the argument-less
            # pdf.save() below to write back to the original file.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments
                attachment_count = len(attachments)

                if attachment_count > 0:
                    # Clear all attachments
                    attachments.clear()
                    pdf.save()

                    results['attachments_removed'] += attachment_count
                    print(f"Removed {attachment_count} attachments from {pdf_file.name}")

                results['processed'] += 1

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nAttachment removal complete:")
    print(f"  PDFs processed: {results['processed']}")
    print(f"  Attachments removed: {results['attachments_removed']}")
    print(f"  Failed: {len(results['failed'])} PDFs")
    return results
739

740
# Example usage (commented out to avoid file operations)
741
# add_attachments_to_directory('./pdfs', './standard_attachments')
742
# remove_all_attachments('./pdfs')
743
```

Version

Tile

Files

docs/attachments.md

attachments.mddocs/