0
# File Attachments
1
2
This section covers the management of file attachments embedded in PDF documents. The `PdfAttachment` class gives access to an attachment's metadata and lets you extract or replace its file data.
3
4
## Capabilities
5
6
### Attachment Access
7
8
Access and enumerate file attachments within PDF documents.
9
10
```python { .api }
11
# Document-level attachment methods
12
def count_attachments(self) -> int:
    """Return how many file attachments are embedded in the document."""
14
15
def get_attachment(self, index: int) -> PdfAttachment:
    """
    Look up a single attachment by its position in the document.

    Parameters:
    - index: int, zero-based attachment index

    Returns:
    PdfAttachment: The attachment at that index
    """
25
26
def new_attachment(self, name: str) -> PdfAttachment:
    """
    Add a new, empty file attachment to the document.

    The attachment is registered in the document by this call; fill it
    afterwards with PdfAttachment.set_data(). Do not assume it ends up at
    the last index -- look it up by name if its position matters.

    Parameters:
    - name: str, attachment filename

    Returns:
    PdfAttachment: Handle to the newly added (still empty) attachment
    """
36
37
def del_attachment(self, index: int) -> None:
    """
    Delete attachment by index.

    Attachments that follow the deleted one move down by one index, and any
    PdfAttachment handle obtained earlier for the deleted attachment must
    not be used afterwards.

    Parameters:
    - index: int, attachment index to delete (0-based)
    """
44
```
45
46
Basic attachment operations:
47
48
```python
49
import pypdfium2 as pdfium
50
51
# Open the document whose attachments should be listed
pdf = pdfium.PdfDocument("document.pdf")

# Report how many attachments there are
attachment_count = pdf.count_attachments()
print(f"Document has {attachment_count} attachments")

# Walk over every attachment; with zero attachments the range is empty,
# so the loop body simply never runs
for i in range(attachment_count):
    attachment = pdf.get_attachment(i)
    name = attachment.get_name()
    print(f"Attachment {i}: {name}")
63
```
64
65
### Attachment Properties
66
67
Access attachment metadata and parent document reference.
68
69
```python { .api }
70
class PdfAttachment:
    @property
    def raw(self) -> FPDF_ATTACHMENT:
        """Underlying PDFium attachment handle, for low-level operations."""

    @property
    def pdf(self) -> PdfDocument:
        """The document this attachment belongs to."""
78
```
79
80
### File Data Management
81
82
Extract and modify attachment file data.
83
84
```python { .api }
85
def get_name(self) -> str:
    """
    Return the filename stored for this attachment.

    Returns:
    str: Original filename of the attached file
    """
92
93
def get_data(self) -> ctypes.Array:
    """
    Return the content of the attached file.

    Returns:
    ctypes.Array: Raw file data as ctypes array
    """
100
101
def set_data(self, data):
    """
    Replace the content of the attached file.

    Parameters:
    - data: bytes or ctypes array containing new file data
    """
108
```
109
110
File data operations:
111
112
```python
113
import os

pdf = pdfium.PdfDocument("document.pdf")

for i in range(pdf.count_attachments()):
    attachment = pdf.get_attachment(i)

    # Get attachment information
    filename = attachment.get_name()
    file_data = attachment.get_data()

    print(f"Attachment: {filename}")
    print(f"Size: {len(file_data)} bytes")

    # The name is read from the PDF and therefore untrusted: keep only its
    # last path component so the file cannot be written to another directory.
    safe_name = os.path.basename(filename.replace("\\", "/")) or f"attachment_{i}"

    # Extract attachment to file
    output_path = f"extracted_{safe_name}"
    with open(output_path, "wb") as f:
        f.write(bytes(file_data))

    print(f"Extracted to: {output_path}")
131
```
132
133
### Metadata Management
134
135
Read and modify the key/value metadata stored with an attachment, including custom keys.
136
137
```python { .api }
138
def has_key(self, key: str) -> bool:
    """
    Tell whether the attachment carries a given metadata key.

    Parameters:
    - key: str, metadata key name

    Returns:
    bool: True if key exists, False otherwise
    """
148
149
def get_value_type(self, key: str) -> int:
    """
    Return the type of the value stored under a metadata key.

    Parameters:
    - key: str, metadata key name

    Returns:
    int: PDFium value type constant
    """
159
160
def get_str_value(self, key: str) -> str:
    """
    Read a metadata value as a string.

    Parameters:
    - key: str, metadata key name

    Returns:
    str: Metadata value as string, empty if key doesn't exist
    """
170
171
def set_str_value(self, key: str, value: str):
    """
    Store a string under a metadata key.

    Parameters:
    - key: str, metadata key name
    - value: str, metadata value to set
    """
179
```
180
181
Metadata operations:
182
183
```python
184
pdf = pdfium.PdfDocument("document.pdf")
attachment = pdf.get_attachment(0)

# Keys to probe for.
# NOTE(review): these resemble document-information keys; a given attachment
# may define only some of them (or none), which is why each one is checked
# with has_key() below -- confirm which keys your PDFs actually carry.
metadata_keys = [
    "Title",         # File title/description
    "Author",        # File author
    "Subject",       # File subject
    "Keywords",      # File keywords
    "Creator",       # Creating application
    "Producer",      # PDF producer
    "CreationDate",  # Creation date
    "ModDate"        # Modification date
]

print(f"Attachment: {attachment.get_name()}")
print("Metadata:")

for key in metadata_keys:
    # Skip keys this attachment does not define
    if not attachment.has_key(key):
        continue
    value = attachment.get_str_value(key)
    value_type = attachment.get_value_type(key)
    print(f" {key}: {value} (type: {value_type})")

# Set custom metadata
attachment.set_str_value("CustomField", "Custom Value")
attachment.set_str_value("ExtractedBy", "pypdfium2")

# Read one of the new keys back to confirm it was stored
if attachment.has_key("CustomField"):
    custom_value = attachment.get_str_value("CustomField")
    print(f"Custom field: {custom_value}")
216
```
217
218
### Creating New Attachments
219
220
Add new file attachments to PDF documents.
221
222
```python
223
def add_file_attachment(pdf, file_path, attachment_name=None, date=None):
    """
    Add a file from disk to a PDF document as an attachment.

    The file is read completely before the document is touched, so an
    unreadable file never leaves an empty attachment behind.

    Parameters:
    - pdf: document object providing new_attachment(name)
    - file_path: path of the file to embed
    - attachment_name: name stored in the PDF; defaults to the base name
      of file_path
    - date: PDF date string (e.g. "D:20240101120000Z") written to both
      CreationDate and ModDate; defaults to the file's modification time
      expressed in UTC

    Returns:
    The attachment object returned by pdf.new_attachment()

    Raises:
    OSError: if file_path cannot be read
    """
    import os
    import time

    # Use filename if no attachment name provided
    if attachment_name is None:
        attachment_name = os.path.basename(file_path)

    # Read file data (and its timestamp) before creating the attachment
    with open(file_path, "rb") as f:
        file_data = f.read()
    if date is None:
        modified = time.gmtime(os.path.getmtime(file_path))
        # "Z" marks the timestamp as UTC in the PDF date format
        date = time.strftime("D:%Y%m%d%H%M%S", modified) + "Z"

    # Create the attachment and give it its content
    attachment = pdf.new_attachment(attachment_name)
    attachment.set_data(file_data)

    # Set metadata
    attachment.set_str_value("Title", attachment_name)
    attachment.set_str_value("CreationDate", date)
    attachment.set_str_value("ModDate", date)

    print(f"Added attachment: {attachment_name} ({len(file_data)} bytes)")

    return attachment
249
250
# Usage
251
pdf = pdfium.PdfDocument("document.pdf")
252
253
# Add a text file as attachment
254
add_file_attachment(pdf, "readme.txt", "README")
255
256
# Add an image as attachment
257
add_file_attachment(pdf, "chart.png", "Chart Image")
258
259
# Save document with new attachments
260
pdf.save("document_with_attachments.pdf")
261
```
262
263
### Attachment Analysis
264
265
Analyze and report on document attachments.
266
267
```python
268
def analyze_attachments(pdf):
    """
    Print a report about every attachment in a document.

    For each attachment the name, size, selected metadata values and a
    content type guessed from the leading bytes are printed, followed by a
    summary with the total size and a per-extension count.

    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)

    Returns:
    None; the report is written to standard output
    """
    import os

    # Leading "magic" bytes and the label printed when they match
    signatures = (
        (b'\xFF\xD8\xFF', "JPEG image"),
        (b'\x89PNG', "PNG image"),
        (b'%PDF', "PDF document"),
        (b'PK', "ZIP archive or Office document"),
    )
    metadata_keys = ("Title", "Author", "Subject", "CreationDate", "ModDate")

    count = pdf.count_attachments()

    if count == 0:
        print("Document contains no attachments")
        return

    print(f"Document contains {count} attachment(s)")

    total_size = 0
    file_types = {}

    for i in range(count):
        attachment = pdf.get_attachment(i)

        # Basic information
        name = attachment.get_name()
        data = attachment.get_data()
        size = len(data)
        total_size += size

        # splitext() treats dotfiles (".gitignore") as having no extension,
        # and a trailing dot ("name.") falls back to 'no_ext' instead of
        # producing an empty bucket
        ext = os.path.splitext(name)[1][1:].lower() or 'no_ext'
        file_types[ext] = file_types.get(ext, 0) + 1

        print(f"\nAttachment {i+1}: {name}")
        print(f" Size: {size:,} bytes ({size/1024:.1f} KB)")

        # Analyze metadata: only keys that exist and hold a non-empty value
        metadata_found = False
        for key in metadata_keys:
            if not attachment.has_key(key):
                continue
            value = attachment.get_str_value(key)
            if not value:
                continue
            if not metadata_found:
                print(" Metadata:")
                metadata_found = True
            print(f" {key}: {value}")

        if not metadata_found:
            print(" No metadata found")

        # File type detection (basic): first matching signature wins
        file_signature = bytes(data[:16])
        for magic, label in signatures:
            if file_signature.startswith(magic):
                print(f" Detected: {label}")
                break

    # Summary
    print("\nSummary:")
    print(f" Total attachments: {count}")
    print(f" Total size: {total_size:,} bytes ({total_size/1024:.1f} KB)")
    print(f" File types: {file_types}")
330
331
# Usage
332
pdf = pdfium.PdfDocument("document.pdf")
333
analyze_attachments(pdf)
334
```
335
336
### Batch Attachment Processing
337
338
Process multiple attachments efficiently.
339
340
```python
341
def _sanitize_attachment_name(name, index):
    """Reduce an attachment name to characters that are safe in a file name."""
    safe_name = "".join(c for c in name if c.isalnum() or c in "._- ")
    # Names consisting only of dots/spaces ("", ".", "..") are not usable
    # file names, so fall back to a generated one
    if not safe_name.strip(". "):
        safe_name = f"attachment_{index}"
    return safe_name


def _unique_output_path(path):
    """Return path, or a "_N"-suffixed variant of it that does not exist yet."""
    import os

    # splitext() only inspects the last path component, so a dot in a
    # directory name is never mistaken for the extension separator
    stem, ext = os.path.splitext(path)
    candidate = path
    counter = 1
    while os.path.exists(candidate):
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    return candidate


def extract_all_attachments(pdf, output_dir):
    """
    Extract all attachments to specified directory.

    Attachment names are sanitized before use, and existing files are never
    overwritten: a numeric suffix is inserted before the extension instead.
    A failure on one attachment is reported and counted, and extraction
    continues with the next one.

    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)
    - output_dir: directory to write to; created if it does not exist

    Returns:
    tuple: (extracted, failed) counts, or None if the document has no
    attachments
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    count = pdf.count_attachments()
    if count == 0:
        print("No attachments to extract")
        return

    extracted = 0
    failed = 0

    for i in range(count):
        # Best effort: one broken attachment must not abort the whole batch
        try:
            attachment = pdf.get_attachment(i)
            name = attachment.get_name()
            data = attachment.get_data()

            safe_name = _sanitize_attachment_name(name, i)
            output_path = _unique_output_path(os.path.join(output_dir, safe_name))

            # Write file
            with open(output_path, "wb") as f:
                f.write(bytes(data))

            print(f"Extracted: {name} -> {output_path}")
            extracted += 1

        except Exception as e:
            print(f"Failed to extract attachment {i}: {e}")
            failed += 1

    print(f"\nExtraction complete: {extracted} successful, {failed} failed")
    return extracted, failed
392
393
# Usage
394
pdf = pdfium.PdfDocument("document.pdf")
395
extract_all_attachments(pdf, "extracted_attachments")
396
```
397
398
### Attachment Security
399
400
Handle attachment security and validation.
401
402
```python
403
def validate_attachments(pdf, max_size_mb=10, allowed_extensions=None):
    """
    Validate attachments for security and size constraints.

    Each attachment is checked for size, for an allowed file extension and,
    for JPEG/PNG/PDF extensions, for a matching file header. All problems
    found are printed.

    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)
    - max_size_mb: maximum allowed attachment size in MiB
    - allowed_extensions: iterable of permitted extensions; case and the
      leading dot are optional ("pdf", ".pdf" and ".PDF" are equivalent).
      Defaults to a small set of document and image types.

    Returns:
    bool: True if no issue was found, False otherwise
    """
    import os

    if allowed_extensions is None:
        allowed_extensions = {'.txt', '.pdf', '.jpg', '.png', '.gif', '.doc', '.docx'}

    # Normalise caller input so it compares equal to the lower-cased,
    # dot-prefixed extension computed per attachment below
    allowed = set()
    for entry in allowed_extensions:
        entry = entry.lower()
        if entry and not entry.startswith('.'):
            entry = '.' + entry
        allowed.add(entry)

    # Extension -> (expected leading bytes, format name used in the message)
    signatures = {
        '.jpg': (b'\xFF\xD8\xFF', "JPEG"),
        '.jpeg': (b'\xFF\xD8\xFF', "JPEG"),
        '.png': (b'\x89PNG', "PNG"),
        '.pdf': (b'%PDF', "PDF"),
    }

    count = pdf.count_attachments()
    issues = []

    for i in range(count):
        attachment = pdf.get_attachment(i)
        name = attachment.get_name()
        data = attachment.get_data()
        size_mb = len(data) / (1024 * 1024)

        # Size check
        if size_mb > max_size_mb:
            issues.append(f"Attachment {i} '{name}': Size {size_mb:.1f}MB exceeds limit {max_size_mb}MB")

        # Extension check
        ext = os.path.splitext(name)[1].lower()
        if ext not in allowed:
            issues.append(f"Attachment {i} '{name}': Extension '{ext}' not allowed")

        # Basic content validation: the header must match the claimed type
        expected = signatures.get(ext)
        if expected is not None:
            magic, label = expected
            if not bytes(data[:16]).startswith(magic):
                issues.append(f"Attachment {i} '{name}': {label} header mismatch")

    if issues:
        print("Attachment validation issues:")
        for issue in issues:
            print(f" - {issue}")
        return False

    print(f"All {count} attachments passed validation")
    return True
444
445
# Usage
446
pdf = pdfium.PdfDocument("document.pdf")
447
is_valid = validate_attachments(pdf, max_size_mb=5)
448
```
449
450
## Common Attachment Operations
451
452
### Attachment Backup
453
454
```python
455
def backup_attachments(pdf, backup_path):
    """
    Create backup of all attachments as ZIP file.

    Only the file data is stored; attachment metadata is not written to the
    archive. Entry names are reduced to their last path component, and
    repeated names get a numeric suffix so every attachment ends up as a
    distinct archive entry.

    Parameters:
    - pdf: document object providing count_attachments() and
      get_attachment(index)
    - backup_path: path of the ZIP file to create

    Returns:
    bool: True if a backup was written, False if the document has no
    attachments (no file is created in that case)
    """
    import os
    import zipfile

    count = pdf.count_attachments()
    if count == 0:
        return False

    used_names = set()

    with zipfile.ZipFile(backup_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for i in range(count):
            attachment = pdf.get_attachment(i)
            name = attachment.get_name()
            data = bytes(attachment.get_data())

            # The name comes from the PDF and is untrusted: drop directory
            # parts so extracting the backup cannot write outside its target
            base = os.path.basename(name.replace("\\", "/"))
            if base in ("", ".", ".."):
                base = f"attachment_{i}"

            # Keep entry names unique within the archive
            stem, ext = os.path.splitext(base)
            arcname = base
            counter = 1
            while arcname in used_names:
                arcname = f"{stem}_{counter}{ext}"
                counter += 1
            used_names.add(arcname)

            zf.writestr(arcname, data)

    print(f"Backed up {count} attachments to {backup_path}")
    return True
475
476
# Usage
477
pdf = pdfium.PdfDocument("document.pdf")
478
backup_attachments(pdf, "attachments_backup.zip")
479
```