# Metadata and Document Properties

Document metadata, XMP data, and PDF properties, including titles, authors, creation dates, and custom metadata fields. These capabilities enable comprehensive document information management and standards compliance.

## Capabilities

### PdfMetadata Class

Comprehensive XMP metadata management with PDF/A compliance and standards support.

```python { .api }
11
class PdfMetadata:
    """
    XMP metadata handler for PDF documents.

    Provides access to document metadata following the XMP (Extensible Metadata Platform)
    standard, with support for Dublin Core, PDF, and custom metadata schemas.
    """

    # NOTE(review): upstream pikepdf documents obtaining this object through
    # `Pdf.open_metadata()` and using it as a context manager keyed by XMP names
    # such as 'dc:title' - confirm the constructor and property surface listed
    # here against the installed pikepdf version.

    def __init__(self, pdf: Pdf, *, sync_docinfo: bool = True) -> None:
        """
        Create a metadata handler for a PDF document.

        Parameters:
        - pdf (Pdf): PDF document to manage metadata for
        - sync_docinfo (bool): Automatically synchronize with document info dictionary

        Raises:
        DependencyError: If required XMP libraries are not available
        """

    @property
    def pdfa_status(self) -> str:
        """
        PDF/A compliance status of the document.

        Returns:
        str: PDF/A status ('1A', '1B', '2A', '2B', '2U', '3A', '3B', '3U', or empty if not PDF/A)
        """

    def load_from_docinfo(self, docinfo: Dictionary, *, delete_missing: bool = False) -> None:
        """
        Load metadata from a document info dictionary.

        Parameters:
        - docinfo (Dictionary): Document info dictionary to load from
        - delete_missing (bool): Delete existing metadata not found in docinfo
        """

    def save_to_docinfo(self, docinfo: Dictionary) -> None:
        """
        Save metadata to a document info dictionary.

        Parameters:
        - docinfo (Dictionary): Document info dictionary to update
        """

    @property
    def title(self) -> str:
        """
        Document title.

        Returns:
        str: Title of the document
        """

    @title.setter
    def title(self, value: str) -> None:
        """Set document title."""

    @property
    def author(self) -> str:
        """
        Document author.

        Returns:
        str: Author name or names
        """

    @author.setter
    def author(self, value: str) -> None:
        """Set document author."""

    @property
    def subject(self) -> str:
        """
        Document subject or description.

        Returns:
        str: Subject description
        """

    @subject.setter
    def subject(self, value: str) -> None:
        """Set document subject."""

    @property
    def keywords(self) -> str:
        """
        Document keywords.

        Returns:
        str: Keywords (typically comma-separated)
        """

    @keywords.setter
    def keywords(self, value: str) -> None:
        """Set document keywords."""

    @property
    def creator(self) -> str:
        """
        Application that created the original document.

        Returns:
        str: Name of creating application
        """

    @creator.setter
    def creator(self, value: str) -> None:
        """Set document creator."""

    @property
    def producer(self) -> str:
        """
        Application that converted/produced the PDF.

        Returns:
        str: Name of PDF producing application
        """

    @producer.setter
    def producer(self, value: str) -> None:
        """Set document producer."""

    @property
    def creation_date(self) -> str:
        """
        Document creation date in ISO format.

        Returns:
        str: Creation date (ISO 8601 format)
        """

    @creation_date.setter
    def creation_date(self, value: str) -> None:
        """Set document creation date."""

    @property
    def modification_date(self) -> str:
        """
        Document modification date in ISO format.

        Returns:
        str: Last modification date (ISO 8601 format)
        """

    @modification_date.setter
    def modification_date(self, value: str) -> None:
        """Set document modification date."""
160
```

### Document Info Dictionary Access

Direct access to the PDF document information dictionary for legacy metadata.

```python { .api }
167
# Accessed via pdf.docinfo property
class DocumentInfo(Dictionary):
    """
    PDF document information dictionary.

    Legacy metadata storage using PDF's built-in document info dictionary.
    Modern documents should use XMP metadata, but this provides compatibility.
    """

    # Standard document info entries (accessed as dictionary keys):
    # '/Title': Document title
    # '/Author': Document author
    # '/Subject': Document subject
    # '/Keywords': Document keywords
    # '/Creator': Creating application
    # '/Producer': PDF producer application
    # '/CreationDate': Creation date (PDF date format)
    # '/ModDate': Modification date (PDF date format)
    # '/Trapped': Trapping status (/True, /False, /Unknown)
186
```

### Metadata Exceptions

Specialized exceptions for metadata operations.

```python { .api }
193
class DependencyError(Exception):
    """
    Raised when required metadata processing libraries are missing.

    Metadata operations may require additional Python packages
    for XMP processing and date handling.
    """
200
```

## Usage Examples

### Basic Metadata Operations

```python
207
import pikepdf
from datetime import datetime

# Open the PDF; the context manager closes it even if a step below raises
with pikepdf.open('document.pdf') as pdf:
    # Access document info dictionary (legacy metadata)
    docinfo = pdf.docinfo

    # Read existing metadata
    print("Current metadata:")
    print(f"Title: {docinfo.get('/Title', 'No title')}")
    print(f"Author: {docinfo.get('/Author', 'No author')}")
    print(f"Subject: {docinfo.get('/Subject', 'No subject')}")
    print(f"Keywords: {docinfo.get('/Keywords', 'No keywords')}")
    print(f"Creator: {docinfo.get('/Creator', 'No creator')}")
    print(f"Producer: {docinfo.get('/Producer', 'No producer')}")

    # Update metadata
    docinfo['/Title'] = pikepdf.String('Updated Document Title')
    docinfo['/Author'] = pikepdf.String('Jane Doe')
    docinfo['/Subject'] = pikepdf.String('Technical Documentation')
    docinfo['/Keywords'] = pikepdf.String('PDF, documentation, technical, guide')
    docinfo['/Creator'] = pikepdf.String('Python Script')

    # Set creation and modification dates.
    # datetime.now() is naive, so %z expands to an empty string and the
    # resulting PDF date carries no timezone offset.
    current_date = datetime.now().strftime("D:%Y%m%d%H%M%S%z")
    docinfo['/CreationDate'] = pikepdf.String(current_date)
    docinfo['/ModDate'] = pikepdf.String(current_date)

    pdf.save('updated_metadata.pdf')
239
```

### Working with XMP Metadata

```python
244
import pikepdf
from datetime import datetime

# Open PDF and access XMP metadata; the context manager guarantees the file
# is closed on both the XMP path and the fallback path
with pikepdf.open('document.pdf') as pdf:
    try:
        # Create XMP metadata handler
        metadata = pikepdf.PdfMetadata(pdf)

        print("XMP Metadata:")
        print(f"Title: {metadata.title}")
        print(f"Author: {metadata.author}")
        print(f"Subject: {metadata.subject}")
        print(f"Keywords: {metadata.keywords}")
        print(f"Creator: {metadata.creator}")
        print(f"Producer: {metadata.producer}")
        print(f"Creation Date: {metadata.creation_date}")
        print(f"Modification Date: {metadata.modification_date}")
        print(f"PDF/A Status: {metadata.pdfa_status}")

        # Update XMP metadata
        metadata.title = "Comprehensive PDF Guide"
        metadata.author = "Technical Writing Team"
        metadata.subject = "Complete guide to PDF operations using pikepdf"
        metadata.keywords = "PDF, Python, pikepdf, documentation, tutorial"
        metadata.creator = "Python Documentation Generator"

        # Set dates in ISO format
        now = datetime.now().isoformat()
        metadata.creation_date = now
        metadata.modification_date = now

        # Synchronize XMP with document info
        metadata.save_to_docinfo(pdf.docinfo)

        pdf.save('xmp_updated.pdf')
        print("XMP metadata updated successfully")

    except pikepdf.DependencyError:
        print("XMP processing libraries not available - using basic metadata only")

        # Fall back to basic document info
        docinfo = pdf.docinfo
        docinfo['/Title'] = pikepdf.String("Comprehensive PDF Guide")
        docinfo['/Author'] = pikepdf.String("Technical Writing Team")
        pdf.save('basic_metadata_updated.pdf')
293
```

### PDF/A Compliance and Metadata

```python
298
import pikepdf
299
from datetime import datetime
300
301
def create_pdfa_compliant_document():
    """Create a PDF/A compliant document with proper metadata.

    Builds a one-page PDF, fills in the descriptive XMP metadata, mirrors it
    into the document info dictionary and writes 'pdfa_compliant.pdf'.
    If the XMP handler raises DependencyError, a message is printed and
    nothing is saved.
    """
    # The context manager closes the PDF even if an unexpected error occurs
    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()

        # Add minimal content
        content = """
        BT
        /F1 12 Tf
        100 700 Td
        (PDF/A Compliant Document) Tj
        ET
        """
        content_stream = pikepdf.Stream(pdf, content.encode())
        page['/Contents'] = content_stream

        try:
            # Set up XMP metadata for PDF/A compliance
            metadata = pikepdf.PdfMetadata(pdf)

            # Required metadata for PDF/A
            metadata.title = "PDF/A Compliant Document"
            metadata.author = "Document Generator"
            metadata.subject = "Sample PDF/A document with complete metadata"
            metadata.keywords = "PDF/A, compliance, archival, standard"
            metadata.creator = "Python pikepdf library"
            metadata.producer = f"pikepdf {pikepdf.__version__}"

            # Set required dates
            now = datetime.now().isoformat()
            metadata.creation_date = now
            metadata.modification_date = now

            # Synchronize with document info
            metadata.save_to_docinfo(pdf.docinfo)

            # Additional PDF/A requirements would include:
            # - Embedded fonts
            # - Color profile
            # - Proper XMP packet
            # - No encryption
            # - No external dependencies

            pdf.save('pdfa_compliant.pdf')
            print("Created PDF/A compliant document with metadata")
            print(f"PDF/A Status: {metadata.pdfa_status}")

        except pikepdf.DependencyError:
            print("XMP libraries not available - cannot create full PDF/A compliance")


create_pdfa_compliant_document()
355
```

### Metadata Analysis and Reporting

```python
360
import pikepdf
361
from pathlib import Path
362
from datetime import datetime
363
364
def analyze_pdf_metadata(pdf_path):
    """Analyze metadata in a PDF file.

    Parameters:
    - pdf_path (str | Path): Location of the PDF to inspect

    Returns:
    dict: On success, the keys 'file', 'file_size', 'pages', 'pdf_version',
    'is_encrypted', 'docinfo' (dict of strings) and 'has_xmp'; plus 'xmp'
    when the XMP values could be read, or 'xmp_error' when they could not.
    On any failure, only 'file' and 'error'. This function never raises.
    """
    try:
        # Accept plain strings as well as Path objects (stat() needs a Path)
        path = Path(pdf_path)

        # The context manager closes the PDF on every exit path
        with pikepdf.open(path) as pdf:
            analysis = {
                'file': str(pdf_path),
                'file_size': path.stat().st_size,
                'pages': len(pdf.pages),
                'pdf_version': pdf.pdf_version,
                'is_encrypted': pdf.is_encrypted,
            }

            # Document info metadata: report key -> docinfo dictionary key
            docinfo = pdf.docinfo
            docinfo_keys = (
                ('title', '/Title'),
                ('author', '/Author'),
                ('subject', '/Subject'),
                ('keywords', '/Keywords'),
                ('creator', '/Creator'),
                ('producer', '/Producer'),
                ('creation_date', '/CreationDate'),
                ('modification_date', '/ModDate'),
                ('trapped', '/Trapped'),
            )
            analysis['docinfo'] = {
                name: str(docinfo.get(key, '')) for name, key in docinfo_keys
            }

            # Try XMP metadata; failures here are recorded, not propagated
            xmp_attributes = (
                'title', 'author', 'subject', 'keywords', 'creator',
                'producer', 'creation_date', 'modification_date', 'pdfa_status',
            )
            try:
                metadata = pikepdf.PdfMetadata(pdf)
                analysis['xmp'] = {
                    name: getattr(metadata, name) for name in xmp_attributes
                }
                analysis['has_xmp'] = True
            except pikepdf.DependencyError:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = "XMP libraries not available"
            except Exception as e:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = str(e)

        return analysis

    except Exception as e:
        return {'file': str(pdf_path), 'error': str(e)}
418
419
def metadata_report(directory_path):
    """Generate a comprehensive metadata report for PDFs in a directory.

    Prints a header, then one section per ``*.pdf`` file found directly in
    ``directory_path`` (in sorted order), using analyze_pdf_metadata() for
    the per-file data.

    Parameters:
    - directory_path (str | Path): Directory to scan (not recursive)

    Returns:
    None: The report is written to standard output
    """
    directory = Path(directory_path)

    print(f"PDF Metadata Report for: {directory}")
    print("=" * 80)

    # (printed label, analysis key) pairs, in display order
    docinfo_rows = (
        ('Title', 'title'),
        ('Author', 'author'),
        ('Creator', 'creator'),
        ('Producer', 'producer'),
        ('Created', 'creation_date'),
        ('Modified', 'modification_date'),
    )
    xmp_rows = (
        ('Title', 'title'),
        ('Author', 'author'),
        ('Subject', 'subject'),
        ('PDF/A', 'pdfa_status'),
    )

    # NOTE(review): the status glyphs below were mis-encoded in the source
    # text and have been reconstructed - confirm against the intended icons.
    for pdf_file in sorted(directory.glob('*.pdf')):
        analysis = analyze_pdf_metadata(pdf_file)

        if 'error' in analysis:
            print(f"\n❌ {pdf_file.name}: {analysis['error']}")
            continue

        print(f"\n📄 {pdf_file.name}")
        print(f" Size: {analysis['file_size']:,} bytes, "
              f"Pages: {analysis['pages']}, "
              f"Version: {analysis['pdf_version']}")

        if analysis['is_encrypted']:
            print(" 🔒 ENCRYPTED")

        # Document Info metadata
        docinfo = analysis['docinfo']
        if any(docinfo.values()):
            print(" Document Info:")
            for label, key in docinfo_rows:
                if docinfo[key]:
                    print(f" {label}: {docinfo[key]}")
        else:
            print(" 📝 No Document Info metadata")

        # XMP metadata
        if analysis['has_xmp']:
            xmp = analysis['xmp']
            if any([xmp['title'], xmp['author'], xmp['subject']]):
                print(" XMP Metadata:")
                for label, key in xmp_rows:
                    if xmp[key]:
                        print(f" {label}: {xmp[key]}")
            else:
                print(" 📝 XMP present but minimal")
        elif 'xmp_error' in analysis:
            print(f" ⚠️ XMP: {analysis['xmp_error']}")


# Generate metadata report
# metadata_report('.')
472
```

### Batch Metadata Operations

```python
477
import pikepdf
478
from pathlib import Path
479
from datetime import datetime
480
481
def standardize_metadata(directory_path, template_metadata):
    """Standardize metadata across multiple PDF files.

    Each ``*.pdf`` directly inside ``directory_path`` is updated in place:
    the template values are applied, the modification date is refreshed and
    a title derived from the file name is added when none is present.

    Parameters:
    - directory_path (str | Path): Directory whose PDF files are updated
    - template_metadata (dict): Optional 'author', 'creator' and 'producer'
      values; missing or empty entries leave the existing value untouched

    Returns:
    dict: 'updated' (list of paths) plus 'failed' and 'skipped'
    (lists of (path, reason) tuples)
    """
    directory = Path(directory_path)
    results = {'updated': [], 'failed': [], 'skipped': []}

    # Only template entries with a non-empty value are applied
    overrides = {
        name: template_metadata[name]
        for name in ('author', 'creator', 'producer')
        if template_metadata.get(name)
    }

    for pdf_file in sorted(directory.glob('*.pdf')):
        try:
            # allow_overwriting_input is required for the argument-less
            # save() below; the context manager closes the file on every path
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                # Skip encrypted files
                if pdf.is_encrypted:
                    results['skipped'].append((str(pdf_file), "Encrypted"))
                    continue

                # Update document info
                docinfo = pdf.docinfo
                now = datetime.now()

                # Apply template metadata ('author' -> '/Author', ...)
                for name, value in overrides.items():
                    docinfo[f'/{name.capitalize()}'] = pikepdf.String(value)

                # Update modification date
                docinfo['/ModDate'] = pikepdf.String(
                    now.strftime("D:%Y%m%d%H%M%S%z")
                )

                # Preserve existing title if present, otherwise use filename
                if not docinfo.get('/Title'):
                    title = pdf_file.stem.replace('_', ' ').replace('-', ' ').title()
                    docinfo['/Title'] = pikepdf.String(title)

                # Try XMP update if available
                try:
                    metadata = pikepdf.PdfMetadata(pdf)
                    for name, value in overrides.items():
                        setattr(metadata, name, value)

                    metadata.modification_date = now.isoformat()
                    metadata.save_to_docinfo(docinfo)
                except pikepdf.DependencyError:
                    pass  # XMP not available, document info is sufficient

                # Save changes back to the original file
                pdf.save()

            results['updated'].append(str(pdf_file))

        except Exception as e:
            # Best-effort batch: record the failure and move on to the next file
            results['failed'].append((str(pdf_file), str(e)))

    print("Metadata standardization complete:")
    print(f" Updated: {len(results['updated'])} files")
    print(f" Failed: {len(results['failed'])} files")
    print(f" Skipped: {len(results['skipped'])} files")

    return results
550
551
# Standardize metadata with template
template = {
    'author': 'Corporate Documentation Team',
    'creator': 'Document Management System',
    'producer': f'pikepdf {pikepdf.__version__}',
}

# results = standardize_metadata('.', template)
559
```

### Custom Metadata Fields

```python
564
import pikepdf
565
566
def add_custom_metadata(pdf_path, custom_fields):
    """Add custom metadata fields to a PDF.

    The file at ``pdf_path`` is modified in place.

    Parameters:
    - pdf_path (str | Path): PDF file to update
    - custom_fields (dict): Field name -> value; values are stored as
      strings, and a leading '/' on the name is optional
    """
    # allow_overwriting_input is required for the argument-less save() below;
    # the context manager closes the file even if an update fails
    with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
        docinfo = pdf.docinfo

        # Add custom fields to document info
        for field_name, field_value in custom_fields.items():
            # Custom fields should use proper PDF name format; avoid
            # producing '//Name' when the caller already supplied the slash
            name = str(field_name)
            pdf_field_name = name if name.startswith('/') else f'/{name}'
            docinfo[pdf_field_name] = pikepdf.String(str(field_value))

        # Also try to add to XMP if available
        try:
            metadata = pikepdf.PdfMetadata(pdf)

            # Custom XMP properties would require namespace registration
            # For basic use, document info is sufficient
            metadata.save_to_docinfo(docinfo)

        except pikepdf.DependencyError:
            pass  # XMP not available, document info is sufficient

        pdf.save()

    print(f"Added custom metadata to {pdf_path}")
592
593
# Add custom metadata
custom_metadata = {
    'Department': 'Engineering',
    'Project': 'API Documentation',
    'Version': '2.1.0',
    'Status': 'Final',
    'ReviewedBy': 'Technical Lead',
    'ApprovalDate': '2024-09-10',
    'DocumentID': 'DOC-2024-001',
    'SecurityClass': 'Internal',
}

# add_custom_metadata('document.pdf', custom_metadata)
606
607
def extract_custom_metadata(pdf_path):
    """Extract and display all metadata including custom fields.

    Prints the standard document info entries that are present, followed by
    every other entry in the document info dictionary as a custom field.

    Parameters:
    - pdf_path (str | Path): PDF file to read
    """
    # Standard fields
    standard_fields = (
        '/Title', '/Author', '/Subject', '/Keywords',
        '/Creator', '/Producer', '/CreationDate', '/ModDate', '/Trapped',
    )

    # The context manager closes the PDF even if reading an entry fails
    with pikepdf.open(pdf_path) as pdf:
        docinfo = pdf.docinfo

        print(f"All metadata for: {pdf_path}")
        print("=" * 50)

        print("Standard Fields:")
        for field in standard_fields:
            if field in docinfo:
                # field[1:] drops the leading '/' of the PDF name
                print(f" {field[1:]}: {docinfo[field]}")

        # Custom fields (anything not in standard list)
        custom_fields = [key for key in docinfo.keys() if key not in standard_fields]

        if custom_fields:
            print("\nCustom Fields:")
            for field in custom_fields:
                print(f" {field[1:]}: {docinfo[field]}")
        else:
            print("\nNo custom fields found")


# Extract all metadata including custom fields
# extract_custom_metadata('document.pdf')
639
```