# File Attachments

Embedded file management including attachment, extraction, and metadata handling for portfolio PDFs and file attachments. These capabilities enable comprehensive file embedding and management within PDF documents.

## Capabilities

### AttachedFileSpec Class

Individual file attachment specifications with metadata and content management.

```python { .api }
class AttachedFileSpec:
    """
    PDF attached file specification for embedded files.

    Represents a single file embedded within a PDF document,
    including its content, metadata, and relationship to the document.
    """

    # NOTE(review): upstream pikepdf documents get_file() as returning an
    # AttachedFile object (read via read_bytes()) that carries the size,
    # date and checksum metadata -- confirm this summary against the
    # installed pikepdf version before relying on it.

    @staticmethod
    def from_filepath(pdf: "Pdf", path: str, *, description: str = '',
                      relationship: str = '/Unspecified') -> "AttachedFileSpec":
        """
        Create an attached file specification from a file path.

        Reads the file from disk and creates a complete attachment
        specification with appropriate metadata and content encoding.

        Parameters:
        - pdf (Pdf): PDF document to attach the file to
        - path (str): Path to the file to attach
        - description (str): Human-readable description of the file
        - relationship (str): Relationship to the document
            ('/Source', '/Data', '/Alternative', '/Supplement', '/Unspecified')

        Returns:
            AttachedFileSpec: Attached file specification ready for embedding

        Raises:
            FileNotFoundError: If the specified file doesn't exist
            IOError: If the file cannot be read
        """

    def get_file(self) -> bytes:
        """
        Retrieve the attached file's content as bytes.

        Extracts and decodes the embedded file data from the PDF.

        Returns:
            bytes: Complete file content

        Raises:
            DataDecodingError: If file data cannot be decoded
        """

    def get_all_filenames(self) -> dict[str, str]:
        """
        Get all filename variants for this attachment.

        PDF attachments can have multiple filename variants for
        different platforms and character encodings.

        Returns:
            dict[str, str]: Mapping of filename types to actual filenames
                Keys: 'F', 'UF', 'DOS', 'Mac', 'Unix'
        """

    @property
    def filename(self) -> str:
        """
        Primary filename for the attached file.

        Returns the most appropriate filename, preferring Unicode
        filenames when available.

        Returns:
            str: Filename of the attached file
        """

    @property
    def description(self) -> str:
        """
        Human-readable description of the attached file.

        Returns:
            str: File description or empty string if none provided
        """

    @property
    def relationship(self) -> str:
        """
        Relationship of this file to the PDF document.

        Common values:
        - '/Source': Original source file for the PDF
        - '/Data': Data file related to the PDF content
        - '/Alternative': Alternative representation
        - '/Supplement': Supplementary file
        - '/Unspecified': Relationship not specified

        Returns:
            str: Relationship type as PDF name
        """

    @property
    def size(self) -> int:
        """
        Size of the attached file in bytes.

        Returns:
            int: File size, or -1 if size is unknown
        """

    @property
    def creation_date(self) -> str:
        """
        Creation date of the attached file.

        Returns:
            str: Creation date in PDF date format, or empty if unknown
        """

    @property
    def modification_date(self) -> str:
        """
        Last modification date of the attached file.

        Returns:
            str: Modification date in PDF date format, or empty if unknown
        """

    @property
    def checksum(self) -> str:
        """
        MD5 checksum of the attached file content.

        Used for integrity verification of the embedded file.

        Returns:
            str: Hex-encoded MD5 hash, or empty if not available
        """
```

### Attachments Class

Collection interface for managing all attachments in a PDF document.

```python { .api }
class Attachments:
    """
    Mapping interface for PDF attachments collection.

    Provides dictionary-like access to all embedded files in a PDF,
    with methods for adding, removing, and iterating attachments.

    Implements MutableMapping[str, AttachedFileSpec] interface.
    """

    def __len__(self) -> int:
        """
        Number of attached files in the PDF.

        Returns:
            int: Count of embedded files
        """

    def __iter__(self) -> "Iterator[str]":
        """
        Iterate over attachment names.

        Yields:
            str: Filename/key for each attached file
        """

    def __getitem__(self, key: str) -> "AttachedFileSpec":
        """
        Get an attached file by name.

        Parameters:
        - key (str): Attachment filename or key

        Returns:
            AttachedFileSpec: Attached file specification

        Raises:
            KeyError: If attachment with specified key doesn't exist
        """

    def __setitem__(self, key: str, value: "AttachedFileSpec") -> None:
        """
        Add or replace an attached file.

        Parameters:
        - key (str): Attachment name/key
        - value (AttachedFileSpec): File specification to attach
        """

    def __delitem__(self, key: str) -> None:
        """
        Remove an attached file.

        Parameters:
        - key (str): Attachment name/key to remove

        Raises:
            KeyError: If attachment doesn't exist
        """

    def __contains__(self, key: str) -> bool:
        """
        Check if an attachment exists.

        Parameters:
        - key (str): Attachment name/key to check

        Returns:
            bool: True if attachment exists
        """

    def keys(self) -> "KeysView[str]":
        """
        Get all attachment names.

        Returns:
            KeysView: View of all attachment keys
        """

    def values(self) -> "ValuesView[AttachedFileSpec]":
        """
        Get all attachment specifications.

        Returns:
            ValuesView: View of all AttachedFileSpec objects
        """

    def items(self) -> "ItemsView[str, AttachedFileSpec]":
        """
        Get all attachment name-specification pairs.

        Returns:
            ItemsView: View of (key, AttachedFileSpec) pairs
        """

    def clear(self) -> None:
        """Remove all attachments from the PDF."""
```

## Usage Examples

### Adding File Attachments

```python
import pikepdf
from pathlib import Path

# Open the PDF; the context manager closes it even if attaching or saving fails
with pikepdf.open('document.pdf') as pdf:
    # Access the attachments collection
    attachments = pdf.attachments

    # Attach a file from disk
    document_file = Path('source_document.docx')
    if document_file.exists():
        # Create attachment specification
        attachment = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            str(document_file),
            description="Original Word document source",
            relationship='/Source',
        )

        # Add to PDF
        attachments['source_document.docx'] = attachment
        print(f"Attached: {document_file.name}")

    # Attach multiple files: (filename, description, relationship)
    files_to_attach = [
        ('data.csv', 'Supporting data file', '/Data'),
        ('image.png', 'Illustration used in document', '/Supplement'),
        ('readme.txt', 'Instructions and notes', '/Unspecified'),
    ]

    for filename, description, relationship in files_to_attach:
        file_path = Path(filename)
        # Files that are not present on disk are skipped silently
        if file_path.exists():
            attachment = pikepdf.AttachedFileSpec.from_filepath(
                pdf,
                str(file_path),
                description=description,
                relationship=relationship,
            )
            attachments[filename] = attachment
            print(f"Attached: {filename} ({description})")

    print(f"Total attachments: {len(attachments)}")

    # Save PDF with attachments (written to a new file, not the input)
    pdf.save('document_with_attachments.pdf')
```

### Extracting Attached Files

```python
import pikepdf
from pathlib import Path

def extract_all_attachments(pdf_path, output_dir):
    """Extract all attached files from a PDF into a directory.

    Prints each attachment's metadata, writes its content under
    ``output_dir`` (created if needed) and, when the attachment reports a
    checksum, compares it with the MD5 of the extracted bytes. A failure on
    one attachment is reported and does not stop the remaining ones.

    Parameters:
    - pdf_path: Path of the PDF to read
    - output_dir: Directory that receives the extracted files

    Returns:
        list[str]: Paths of the files written (empty if the PDF has no
        attachments)
    """
    import hashlib  # local import: this snippet's header does not import it

    extracted_files = []

    # The context manager closes the PDF on every exit path, errors included
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments

        if len(attachments) == 0:
            print("No attachments found in PDF")
            return extracted_files

        # Create output directory (including missing parents)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"Found {len(attachments)} attachments:")

        for name, attachment in attachments.items():
            try:
                # Get file info
                filename = attachment.filename or name
                description = attachment.description
                size = attachment.size
                relationship = attachment.relationship

                print(f"\n📎 {filename}")
                print(f" Description: {description}")
                print(f" Size: {size:,} bytes" if size >= 0 else " Size: Unknown")
                print(f" Relationship: {relationship}")
                print(f" Created: {attachment.creation_date}")
                print(f" Modified: {attachment.modification_date}")

                # Extract file content
                file_data = attachment.get_file()

                # Save to disk under a sanitised, non-clashing name
                output_file = _unique_output_path(output_path, filename)
                output_file.write_bytes(file_data)

                extracted_files.append(str(output_file))
                print(f" ✓ Extracted to: {output_file}")

                # Verify checksum if available
                reported_checksum = attachment.checksum
                if reported_checksum:
                    actual_checksum = hashlib.md5(file_data).hexdigest().upper()
                    expected_checksum = reported_checksum.upper()

                    if actual_checksum == expected_checksum:
                        print(f" ✓ Checksum verified: {actual_checksum}")
                    else:
                        print(f" ⚠️ Checksum mismatch: expected {expected_checksum}, got {actual_checksum}")

            except Exception as e:
                # Best effort: report the failure and continue with the next file
                print(f" ✗ Error extracting {name}: {e}")

    print(f"\nExtracted {len(extracted_files)} files to {output_dir}")
    return extracted_files


def _unique_output_path(directory, filename):
    """Return a not-yet-existing path in *directory* for a sanitised *filename*."""
    # Keep only characters that are safe in a file name; this also removes
    # path separators, so the result cannot escape *directory*.
    safe_name = "".join(c for c in filename if c.isalnum() or c in '.-_')
    if not safe_name.strip('.'):
        # Nothing usable survived (empty, "." or ".."): use a neutral name
        safe_name = 'attachment'

    candidate = directory / safe_name
    # Split once so repeated conflicts give name_1, name_2, ... not name_1_2
    stem, suffix = candidate.stem, candidate.suffix
    counter = 1
    while candidate.exists():
        candidate = directory / f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate
# Extract attachments from the PDF produced by the previous example
extracted = extract_all_attachments('document_with_attachments.pdf', 'extracted_files')
```

### Managing Attachment Metadata

```python
import pikepdf
from datetime import datetime

def update_attachment_metadata(pdf_path):
    """Print the current metadata of every attachment in a PDF.

    Despite its name, this example only reports metadata: it does not
    modify or save the document (see the note at the end of the loop).

    Parameters:
    - pdf_path: Path of the PDF to inspect
    """
    # The context manager closes the PDF even if reading metadata raises
    with pikepdf.open(pdf_path) as pdf:
        for name, attachment in pdf.attachments.items():
            print(f"Attachment: {name}")

            # Get all filename variants
            filenames = attachment.get_all_filenames()
            print(f" Filename variants: {filenames}")

            # Display current metadata
            print(f" Current description: '{attachment.description}'")
            print(f" Current relationship: {attachment.relationship}")
            # size is documented as -1 when unknown; do not print "-1 bytes"
            size = attachment.size
            print(f" File size: {size:,} bytes" if size >= 0 else " File size: Unknown")
            print(f" Creation date: {attachment.creation_date}")
            print(f" Modification date: {attachment.modification_date}")
            print(f" Checksum: {attachment.checksum}")

            # Note: Modifying attachment metadata requires recreating the attachment
            # This is a limitation of the PDF format and pikepdf's current API
def create_portfolio_pdf(file_list, output_path):
    """Create a PDF with a text cover page and the given files attached.

    Files that do not exist on disk are skipped; they are neither attached
    nor listed on the cover page.

    Parameters:
    - file_list: Sequence of (file_path, description) pairs
    - output_path: Where the resulting PDF is saved
    """
    from pathlib import Path  # this snippet's import header does not provide Path

    # Only list and attach files that are actually present
    candidates = [(Path(file_path), description) for file_path, description in file_list]
    present = [(path, description) for path, description in candidates if path.exists()]

    # The context manager closes the new PDF even if building or saving fails
    with pikepdf.new() as pdf:
        # Add a cover page
        page = pdf.add_blank_page()

        # Register the font that the content stream selects as /F1;
        # without it the text operators reference an undefined resource.
        font = pdf.make_indirect(pikepdf.Dictionary(
            Type=pikepdf.Name.Font,
            Subtype=pikepdf.Name.Type1,
            BaseFont=pikepdf.Name.Helvetica,
            Encoding=pikepdf.Name.WinAnsiEncoding,
        ))
        page['/Resources'] = pikepdf.Dictionary(Font=pikepdf.Dictionary(F1=font))

        # Tm places each line at an absolute position. Td would be relative
        # to the previous line start, so repeating "100 y Td" drifts off-page.
        lines = [
            "BT",
            "/F1 24 Tf",
            "1 0 0 1 100 700 Tm",
            "(PDF Portfolio) Tj",
            "/F1 12 Tf",
            "1 0 0 1 100 650 Tm",
            f"(This PDF contains {len(present)} attached files:) Tj",
        ]

        y_pos = 620
        for index, (path, _description) in enumerate(present, start=1):
            lines.append(f"1 0 0 1 100 {y_pos} Tm")
            lines.append(f"({index}. {_escape_pdf_text(path.name)}) Tj")
            y_pos -= 20

        lines.append("ET")

        # Encode to match the font's WinAnsiEncoding; unmappable characters become '?'
        content = "\n".join(lines).encode('cp1252', errors='replace')
        page['/Contents'] = pikepdf.Stream(pdf, content)

        # Add files as attachments
        attachments = pdf.attachments
        for path, description in present:
            attachment = pikepdf.AttachedFileSpec.from_filepath(
                pdf,
                str(path),
                description=description,
                relationship=_relationship_for(path),
            )
            attachments[path.name] = attachment
            print(f"Added to portfolio: {path.name}")

        # Save portfolio
        pdf.save(output_path)

    print(f"Created portfolio PDF: {output_path}")


def _escape_pdf_text(text):
    """Escape backslashes and parentheses for use inside a PDF literal string."""
    return text.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')


def _relationship_for(path):
    """Choose the attachment relationship name from the file's suffix."""
    suffix = path.suffix.lower()
    if suffix in ('.docx', '.doc', '.odt'):
        return '/Source'
    if suffix in ('.csv', '.xlsx', '.json'):
        return '/Data'
    if suffix in ('.png', '.jpg', '.jpeg', '.gif'):
        return '/Supplement'
    return '/Unspecified'
# Create a portfolio with multiple files: (file_path, description) pairs
portfolio_files = [
    ('project_report.pdf', 'Main project report'),
    ('data_analysis.csv', 'Raw data and analysis'),
    ('chart.png', 'Key findings visualization'),
    ('source_code.py', 'Analysis script'),
    ('readme.txt', 'Project documentation'),
]

# create_portfolio_pdf(portfolio_files, 'project_portfolio.pdf')
```

### Attachment Analysis and Reporting

```python
import pikepdf
from pathlib import Path
import hashlib

def analyze_pdf_attachments(pdf_path):
    """Comprehensive analysis of PDF attachments.

    Reads every attachment, computes MD5/SHA-256 digests of its content and
    collects per-file details plus aggregate counts. An attachment that
    cannot be analysed is reported and left out of both the per-file list
    and the aggregates.

    Parameters:
    - pdf_path: Path of the PDF to analyse

    Returns:
        dict: Keys 'total_attachments', 'total_size' (bytes actually read),
        'file_types' and 'relationships' (value -> count), and 'files'
        (one dict of details per analysed attachment)
    """
    analysis = {
        'total_attachments': 0,
        'total_size': 0,
        'file_types': {},
        'relationships': {},
        'files': []
    }

    # The context manager closes the PDF on every exit path, errors included
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments
        analysis['total_attachments'] = len(attachments)

        if analysis['total_attachments'] == 0:
            print(f"No attachments found in {pdf_path}")
            return analysis

        for name, attachment in attachments.items():
            try:
                # Basic file info
                filename = attachment.filename or name
                reported_size = attachment.size
                size_known = reported_size >= 0
                size = reported_size if size_known else 0

                # Extract file for analysis
                file_data = attachment.get_file()
                actual_size = len(file_data)

                file_extension = Path(filename).suffix.lower()
                relationship = attachment.relationship

                # Calculate checksums
                md5_hash = hashlib.md5(file_data).hexdigest().upper()
                sha256_hash = hashlib.sha256(file_data).hexdigest().upper()
                reported_checksum = attachment.checksum

                # File details
                file_info = {
                    'name': filename,
                    'attachment_key': name,
                    'description': attachment.description,
                    'size_reported': size,
                    'size_actual': actual_size,
                    # An unknown reported size is not a mismatch
                    'size_match': (not size_known) or size == actual_size,
                    'relationship': relationship,
                    'creation_date': attachment.creation_date,
                    'modification_date': attachment.modification_date,
                    'checksum_reported': reported_checksum,
                    'checksum_md5': md5_hash,
                    'checksum_sha256': sha256_hash,
                    'checksum_verified': reported_checksum.upper() == md5_hash if reported_checksum else None,
                    'file_extension': file_extension,
                    'filenames_variants': attachment.get_all_filenames()
                }

            except Exception as e:
                print(f"Error analyzing attachment '{name}': {e}")
                continue

            # Aggregates are updated only once the attachment was fully read,
            # so the counts always agree with the 'files' list.
            type_key = file_extension or '(no extension)'
            analysis['file_types'][type_key] = analysis['file_types'].get(type_key, 0) + 1
            analysis['relationships'][relationship] = analysis['relationships'].get(relationship, 0) + 1
            analysis['files'].append(file_info)
            analysis['total_size'] += actual_size

    return analysis
def print_attachment_report(analysis):
    """Print formatted attachment analysis report.

    Parameters:
    - analysis (dict): Mapping with the keys 'total_attachments',
      'total_size', 'file_types', 'relationships' and 'files'; each entry
      of 'files' is a dict with the per-file keys read below
    """
    print("PDF Attachment Analysis Report")
    print("=" * 50)

    total_size = analysis['total_size']
    print(f"Total Attachments: {analysis['total_attachments']}")
    print(f"Total Size: {total_size:,} bytes ({total_size / 1024 / 1024:.2f} MB)")

    if analysis['file_types']:
        print("\nFile Types:")
        for ext, count in sorted(analysis['file_types'].items()):
            print(f" {ext}: {count} files")

    if analysis['relationships']:
        print("\nFile Relationships:")
        for rel, count in sorted(analysis['relationships'].items()):
            print(f" {rel}: {count} files")

    print("\nDetailed File Information:")
    print("-" * 50)

    for file_info in analysis['files']:
        print(f"\n📎 {file_info['name']}")
        print(f" Key: {file_info['attachment_key']}")
        print(f" Description: {file_info['description']}")

        # Show the reported size only when it disagrees with the real one
        size_line = f" Size: {file_info['size_actual']:,} bytes"
        if not file_info['size_match']:
            size_line += f" (reported: {file_info['size_reported']:,})"
        print(size_line)

        # Use the same label as the file-type summary for extension-less files
        print(f" Type: {file_info['file_extension'] or '(no extension)'}")
        print(f" Relationship: {file_info['relationship']}")
        print(f" Created: {file_info['creation_date']}")
        print(f" Modified: {file_info['modification_date']}")

        # Checksum verification
        if file_info['checksum_reported']:
            status = "✓ Verified" if file_info['checksum_verified'] else "✗ Failed"
            print(f" Checksum: {status} ({file_info['checksum_reported']})")
        else:
            print(f" MD5: {file_info['checksum_md5']}")

        # Filename variants (only worth showing when there is more than one)
        variants = file_info['filenames_variants']
        if len(variants) > 1:
            print(f" Filename variants: {variants}")
# Analyze attachments (only when the example PDF is actually present)
pdf_path = 'document_with_attachments.pdf'
if Path(pdf_path).exists():
    analysis = analyze_pdf_attachments(pdf_path)
    print_attachment_report(analysis)
```

### Bulk Attachment Operations

```python
import pikepdf
from pathlib import Path

def add_attachments_to_directory(directory_path, attachment_dir):
    """Add the same set of attachments to all PDFs in a directory.

    PDFs that already contain attachments are skipped. Each modified PDF is
    saved in place, overwriting the original file.

    Parameters:
    - directory_path: Directory whose '*.pdf' files are processed
    - attachment_dir: Directory whose regular files are attached to each PDF

    Returns:
        dict: 'success' is a list of (pdf_name, attachments_added) pairs and
        'failed' is a list of (pdf_name, error_message) pairs
    """
    directory = Path(directory_path)
    attachment_path = Path(attachment_dir)

    results = {'success': [], 'failed': []}

    # Get list of files to attach (sorted for a deterministic order)
    attachment_files = sorted(f for f in attachment_path.glob('*') if f.is_file())

    if not attachment_files:
        print(f"No files found in {attachment_dir}")
        return results

    for pdf_file in sorted(directory.glob('*.pdf')):
        try:
            # allow_overwriting_input is required for the in-place pdf.save()
            # below; the context manager closes the PDF on every exit path.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments

                # Skip if already has attachments
                if len(attachments) > 0:
                    print(f"Skipping {pdf_file.name} - already has attachments")
                    continue

                # Add each attachment file; one failure does not stop the rest
                attachments_added = 0
                for attach_file in attachment_files:
                    try:
                        attachment = pikepdf.AttachedFileSpec.from_filepath(
                            pdf,
                            str(attach_file),
                            description=f"Standard attachment: {attach_file.name}",
                            relationship='/Supplement',
                        )
                        attachments[attach_file.name] = attachment
                        attachments_added += 1

                    except Exception as e:
                        print(f"Failed to attach {attach_file.name} to {pdf_file.name}: {e}")

                # Save if any attachments were added
                if attachments_added > 0:
                    pdf.save()
                    results['success'].append((pdf_file.name, attachments_added))
                    print(f"Added {attachments_added} attachments to {pdf_file.name}")

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nBulk attachment complete:")
    print(f" Success: {len(results['success'])} PDFs")
    print(f" Failed: {len(results['failed'])} PDFs")
    return results
def remove_all_attachments(directory_path):
    """Remove all attachments from PDFs in a directory.

    Each PDF that has attachments is cleared and saved in place, overwriting
    the original file. PDFs without attachments are left untouched but still
    count as processed.

    Parameters:
    - directory_path: Directory whose '*.pdf' files are processed

    Returns:
        dict: 'processed' (PDFs handled without error), 'attachments_removed'
        (total attachments cleared) and 'failed' (list of
        (pdf_name, error_message) pairs)
    """
    directory = Path(directory_path)

    results = {'processed': 0, 'attachments_removed': 0, 'failed': []}

    for pdf_file in sorted(directory.glob('*.pdf')):
        try:
            # allow_overwriting_input is required for the in-place pdf.save()
            # below; the context manager closes the PDF on every exit path.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments
                attachment_count = len(attachments)

                if attachment_count > 0:
                    # Clear all attachments
                    attachments.clear()
                    pdf.save()

                    results['attachments_removed'] += attachment_count
                    print(f"Removed {attachment_count} attachments from {pdf_file.name}")

                results['processed'] += 1

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nAttachment removal complete:")
    print(f" PDFs processed: {results['processed']}")
    print(f" Attachments removed: {results['attachments_removed']}")
    print(f" Failed: {len(results['failed'])} PDFs")
    return results
# Example usage (commented out to avoid file operations)
# add_attachments_to_directory('./pdfs', './standard_attachments')
# remove_all_attachments('./pdfs')
```