0
# Metadata
1
2
Access and manipulation of PDF metadata, document properties, XMP information, and custom document attributes. pypdf provides comprehensive metadata handling for both reading existing information and setting new properties.
3
4
## Capabilities
5
6
### Document Information
7
8
The DocumentInformation class provides access to standard PDF metadata fields with both processed and raw value access.
9
10
```python { .api }
11
class DocumentInformation:
12
@property
13
def title(self) -> str | None:
14
"""Get the document title (processed)."""
15
16
@property
17
def title_raw(self) -> str | None:
18
"""Get the raw document title."""
19
20
@property
21
def author(self) -> str | None:
22
"""Get the document author (processed)."""
23
24
@property
25
def author_raw(self) -> str | None:
26
"""Get the raw document author."""
27
28
@property
29
def subject(self) -> str | None:
30
"""Get the document subject (processed)."""
31
32
@property
33
def subject_raw(self) -> str | None:
34
"""Get the raw document subject."""
35
36
@property
37
def creator(self) -> str | None:
38
"""Get the creating application (processed)."""
39
40
@property
41
def creator_raw(self) -> str | None:
42
"""Get the raw creating application."""
43
44
@property
45
def producer(self) -> str | None:
46
"""Get the PDF producer (processed)."""
47
48
@property
49
def producer_raw(self) -> str | None:
50
"""Get the raw PDF producer."""
51
52
@property
53
def creation_date(self) -> datetime | None:
54
"""Get the creation date as datetime object."""
55
56
@property
57
def creation_date_raw(self) -> str | None:
58
"""Get the raw creation date string."""
59
60
@property
61
def modification_date(self) -> datetime | None:
62
"""Get the modification date as datetime object."""
63
64
@property
65
def modification_date_raw(self) -> str | None:
66
"""Get the raw modification date string."""
67
68
@property
69
def keywords(self) -> str | None:
70
"""Get the document keywords (processed)."""
71
72
@property
73
def keywords_raw(self) -> str | None:
74
"""Get the raw document keywords."""
75
```
76
77
### XMP Metadata
78
79
Extended metadata support through XMP (Extensible Metadata Platform) for advanced metadata handling.
80
81
```python { .api }
82
from typing import Any, Iterator


class XmpInformation:
    """XMP metadata information class for advanced metadata handling."""

    def get_element(self, about_uri: str, namespace: str, name: str) -> Iterator[Any]:
        """
        Iterate over the XMP metadata nodes matching a name.

        Args:
            about_uri: URI identifying the resource
            namespace: XML namespace
            name: Element name

        Returns:
            Iterator over the matching XML nodes. It yields nothing when
            no node matches, so wrap it in ``list()`` before testing for
            emptiness or printing.
        """

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> Iterator[Any]:
        """
        Iterate over all nodes in a specific namespace.

        Args:
            about_uri: URI identifying the resource
            namespace: XML namespace

        Returns:
            Iterator over the XML nodes in the namespace; wrap it in
            ``list()`` when a list is needed.
        """
109
```
110
111
## Usage Examples
112
113
### Reading Basic Metadata
114
115
```python
116
from pypdf import PdfReader

reader = PdfReader("document.pdf")
info = reader.metadata

# Handle the "nothing to show" case first, then print each standard field.
if not info:
    print("No metadata available")
else:
    print(f"Title: {info.title}")
    print(f"Author: {info.author}")
    print(f"Subject: {info.subject}")
    print(f"Creator: {info.creator}")
    print(f"Producer: {info.producer}")
    print(f"Creation Date: {info.creation_date}")
    print(f"Modification Date: {info.modification_date}")
    print(f"Keywords: {info.keywords}")
132
```
133
134
### Reading Raw Metadata
135
136
```python
137
from pypdf import PdfReader

info = PdfReader("document.pdf").metadata

if info:
    # Show the processed values first ...
    print("Processed values:")
    print(f"  Title: {info.title}")
    print(f"  Author: {info.author}")

    # ... then the raw counterparts of the same fields for comparison.
    print("\nRaw values:")
    print(f"  Title: {info.title_raw}")
    print(f"  Author: {info.author_raw}")
151
```
152
153
### Writing Metadata
154
155
```python
156
from datetime import datetime

from pypdf import PdfReader, PdfWriter

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy all pages
for page in reader.pages:
    writer.add_page(page)

# PDF date fields are text in the form D:YYYYMMDDHHmmSS. Passing a datetime
# object would store its str() form ("2024-01-31 12:00:00.123456"), which
# PDF consumers cannot parse as a date, so format it explicitly. The
# timestamp is taken once so both date fields agree.
timestamp = datetime.now().strftime("D:%Y%m%d%H%M%S")

# Set metadata
writer.add_metadata({
    "/Title": "Updated Document Title",
    "/Author": "John Doe",
    "/Subject": "Updated document subject",
    "/Creator": "My Application",
    "/Producer": "pypdf",
    "/Keywords": "PDF, metadata, pypdf",
    "/CreationDate": timestamp,
    "/ModDate": timestamp,
})

with open("output_with_metadata.pdf", "wb") as output:
    writer.write(output)
180
```
181
182
### Copying and Modifying Metadata
183
184
```python
185
from datetime import datetime

from pypdf import PdfReader, PdfWriter

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy pages
for page in reader.pages:
    writer.add_page(page)

# Get existing metadata
existing_metadata = reader.metadata

# Create updated metadata dictionary
new_metadata = {}
if existing_metadata:
    # Copy existing metadata, skipping fields that are missing or empty
    carried_over = (
        ("/Title", existing_metadata.title),
        ("/Author", existing_metadata.author),
        ("/Subject", existing_metadata.subject),
        ("/Creator", existing_metadata.creator),
        ("/Keywords", existing_metadata.keywords),
    )
    for key, value in carried_over:
        if value:
            new_metadata[key] = value

# Update specific fields
new_metadata["/Producer"] = "pypdf 6.0.0"
# PDF dates are text in the form D:YYYYMMDDHHmmSS; a datetime object would
# be stored as its str() form, which is not a valid PDF date.
new_metadata["/ModDate"] = datetime.now().strftime("D:%Y%m%d%H%M%S")

# Add custom metadata
new_metadata["/Custom"] = "Custom metadata value"

writer.add_metadata(new_metadata)

with open("updated_metadata.pdf", "wb") as output:
    writer.write(output)
224
```
225
226
### Working with XMP Metadata
227
228
```python
229
from pypdf import PdfReader

reader = PdfReader("document_with_xmp.pdf")

# Look the XMP object up once and reuse it below
xmp = reader.xmp_metadata

# Check if XMP metadata exists
if xmp is not None:
    print("XMP metadata found")

    try:
        # Get Dublin Core elements. get_element() yields raw XML nodes, so
        # printing its result would only show a generator object; the dc_*
        # properties return the parsed values instead (None when absent).
        title = xmp.dc_title
        creator = xmp.dc_creator
        description = xmp.dc_description

        print(f"DC Title: {title}")
        print(f"DC Creator: {creator}")
        print(f"DC Description: {description}")

    except Exception as e:
        # Best-effort example: report malformed XMP instead of crashing
        print(f"Error reading XMP metadata: {e}")

else:
    print("No XMP metadata found")
255
```
256
257
### Metadata Extraction Report
258
259
```python
260
from pypdf import PdfReader
261
from datetime import datetime
262
import json
263
264
def _jsonable(value):
    """Return a raw metadata value as text (or None) so json.dumps accepts it."""
    if value is None:
        return None
    if isinstance(value, bytes):
        # Raw fields may be byte strings; decode leniently rather than fail
        return value.decode("utf-8", errors="replace")
    return str(value)


def extract_metadata_report(pdf_path: str) -> dict:
    """
    Extract comprehensive metadata report from a PDF.

    The function never raises for a problem with the PDF itself: any
    exception raised while reading is recorded under the ``"error"`` key
    and the sections filled in so far are returned.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Dictionary with the keys ``file_path``, ``extraction_time``,
        ``basic_metadata``, ``raw_metadata``, ``xmp_metadata`` and
        ``document_info``, plus ``error`` when reading failed. All values
        are JSON-serializable.
    """
    report = {
        "file_path": pdf_path,
        "extraction_time": datetime.now().isoformat(),
        "basic_metadata": {},
        "raw_metadata": {},
        "xmp_metadata": {},
        "document_info": {},
    }

    try:
        reader = PdfReader(pdf_path)

        # Basic document information
        report["document_info"] = {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "pdf_header": reader.pdf_header,
        }

        # Standard metadata
        metadata = reader.metadata
        if metadata:
            created = metadata.creation_date
            modified = metadata.modification_date

            # Processed metadata (dates converted to ISO 8601 text)
            report["basic_metadata"] = {
                "title": metadata.title,
                "author": metadata.author,
                "subject": metadata.subject,
                "creator": metadata.creator,
                "producer": metadata.producer,
                "creation_date": created.isoformat() if created else None,
                "modification_date": modified.isoformat() if modified else None,
                "keywords": metadata.keywords,
            }

            # Raw metadata, coerced to text so the report can be dumped as JSON
            raw_fields = (
                "title_raw",
                "author_raw",
                "subject_raw",
                "creator_raw",
                "producer_raw",
                "creation_date_raw",
                "modification_date_raw",
                "keywords_raw",
            )
            report["raw_metadata"] = {
                name: _jsonable(getattr(metadata, name)) for name in raw_fields
            }

        # XMP metadata
        # Note: XMP parsing would require more specific implementation
        # based on the actual XMP structure in the document
        report["xmp_metadata"]["present"] = bool(reader.xmp_metadata)

    except Exception as e:
        # Report boundary: surface the failure in the result instead of raising
        report["error"] = str(e)

    return report


if __name__ == "__main__":
    # Generate metadata report
    report = extract_metadata_report("document.pdf")
    print(json.dumps(report, indent=2))
337
```
338
339
### Batch Metadata Processing
340
341
```python
342
from pypdf import PdfReader, PdfWriter
343
from pathlib import Path
344
import csv
345
from datetime import datetime
346
347
def extract_metadata_to_csv(pdf_directory: str, csv_output: str):
    """
    Extract metadata from all PDFs in a directory to CSV.

    Files matching ``*.pdf`` directly inside the directory are processed in
    sorted order. A file that cannot be read is reported on stdout and
    skipped. The CSV (header row included) is always written, even when no
    PDF could be processed, so the summary message is never misleading.

    Args:
        pdf_directory: Directory containing PDF files
        csv_output: Output CSV file path
    """
    fieldnames = [
        "filename",
        "title",
        "author",
        "subject",
        "creator",
        "producer",
        "creation_date",
        "modification_date",
        "keywords",
        "page_count",
        "is_encrypted",
        "pdf_version",
    ]
    # Columns read straight from the document information object
    info_fields = fieldnames[1:9]

    metadata_records = []

    # sorted() makes the row order stable across runs and platforms
    for pdf_path in sorted(Path(pdf_directory).glob("*.pdf")):
        try:
            reader = PdfReader(str(pdf_path))
            metadata = reader.metadata

            record = {"filename": pdf_path.name}
            for field in info_fields:
                # Documents without an info dictionary get empty cells
                record[field] = getattr(metadata, field) if metadata else ""
            record["page_count"] = len(reader.pages)
            record["is_encrypted"] = reader.is_encrypted
            record["pdf_version"] = reader.pdf_header
        except Exception as e:
            # Best-effort batch job: report the bad file and carry on
            print(f"Error processing {pdf_path.name}: {e}")
        else:
            metadata_records.append(record)

    # Write to CSV
    with open(csv_output, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(metadata_records)

    print(f"Metadata extracted to {csv_output}")
    print(f"Processed {len(metadata_records)} PDF files")


if __name__ == "__main__":
    # Extract metadata from all PDFs to CSV
    extract_metadata_to_csv("pdf_collection/", "pdf_metadata.csv")
398
```
399
400
### Setting Custom Metadata Fields
401
402
```python
403
from datetime import datetime

from pypdf import PdfReader, PdfWriter

# PDF date fields are text in the form D:YYYYMMDDHHmmSS. A datetime object
# passed as a metadata value would be stored as its str() form, which is
# not a valid PDF date, so every date below is formatted explicitly.
PDF_DATE_FORMAT = "D:%Y%m%d%H%M%S"

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy pages
for page in reader.pages:
    writer.add_page(page)

# Take the timestamp once so creation and modification dates agree
now = datetime.now().strftime(PDF_DATE_FORMAT)

# Set comprehensive metadata with custom fields
metadata = {
    # Standard fields
    "/Title": "My Document",
    "/Author": "Jane Smith",
    "/Subject": "Important Document",
    "/Creator": "My Application v2.0",
    "/Producer": "pypdf 6.0.0",
    "/Keywords": "important, document, processing",
    "/CreationDate": now,
    "/ModDate": now,

    # Custom fields
    "/Department": "Engineering",
    "/ProjectCode": "PROJ-2024-001",
    "/Classification": "Internal",
    "/ReviewDate": datetime(2024, 12, 31).strftime(PDF_DATE_FORMAT),
    "/Version": "1.0",
    "/ApprovedBy": "Manager Name",
}

writer.add_metadata(metadata)

with open("document_with_custom_metadata.pdf", "wb") as output:
    writer.write(output)
438
```