Tessl Tile for pypi/pypdf@6.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

annotations.md form-fields.md index.md metadata.md page-operations.md reading-writing.md text-extraction.md utilities.md

docs/metadata.md

0
# Metadata
1

2
Access and manipulation of PDF metadata, document properties, XMP information, and custom document attributes. pypdf provides comprehensive metadata handling for both reading existing information and setting new properties.
3

4
## Capabilities
5

6
### Document Information
7

8
The `DocumentInformation` class exposes the standard PDF metadata fields. Each field is available both as a processed value and, through the matching `*_raw` property, as the raw value.
9

10
```python { .api }
11
class DocumentInformation:
12
    @property
13
    def title(self) -> str | None:
14
        """Get the document title (processed)."""
15
        
16
    @property
17
    def title_raw(self) -> str | None:
18
        """Get the raw document title."""
19

20
    @property
21
    def author(self) -> str | None:
22
        """Get the document author (processed)."""
23
        
24
    @property
25
    def author_raw(self) -> str | None:
26
        """Get the raw document author."""
27

28
    @property
29
    def subject(self) -> str | None:
30
        """Get the document subject (processed)."""
31
        
32
    @property
33
    def subject_raw(self) -> str | None:
34
        """Get the raw document subject."""
35

36
    @property
37
    def creator(self) -> str | None:
38
        """Get the creating application (processed)."""
39
        
40
    @property
41
    def creator_raw(self) -> str | None:
42
        """Get the raw creating application."""
43

44
    @property
45
    def producer(self) -> str | None:
46
        """Get the PDF producer (processed)."""
47
        
48
    @property
49
    def producer_raw(self) -> str | None:
50
        """Get the raw PDF producer."""
51

52
    @property
53
    def creation_date(self) -> datetime | None:
54
        """Get the creation date as datetime object."""
55
        
56
    @property
57
    def creation_date_raw(self) -> str | None:
58
        """Get the raw creation date string."""
59

60
    @property
61
    def modification_date(self) -> datetime | None:
62
        """Get the modification date as datetime object."""
63
        
64
    @property
65
    def modification_date_raw(self) -> str | None:
66
        """Get the raw modification date string."""
67

68
    @property
69
    def keywords(self) -> str | None:
70
        """Get the document keywords (processed)."""
71
        
72
    @property
73
    def keywords_raw(self) -> str | None:
74
        """Get the raw document keywords."""
75
```
76

77
### XMP Metadata
78

79
pypdf also supports extended metadata through XMP (Extensible Metadata Platform). XMP data is exposed by the `XmpInformation` class, available as `reader.xmp_metadata`.
80

81
```python { .api }
82
class XmpInformation:
83
    """XMP metadata information class for advanced metadata handling."""
84
    
85
    def get_element(self, about_uri: str, namespace: str, name: str):
86
        """
87
        Get an XMP metadata element.
88
        
89
        Args:
90
            about_uri: URI identifying the resource
91
            namespace: XML namespace
92
            name: Element name
93
            
94
        Returns:
95
            Element value or None
96
        """
97

98
    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> list:
99
        """
100
        Get all nodes in a specific namespace.
101
        
102
        Args:
103
            about_uri: URI identifying the resource
104
            namespace: XML namespace
105
            
106
        Returns:
107
            List of nodes in the namespace
108
        """
109
```
110

111
## Usage Examples
112

113
### Reading Basic Metadata
114

115
```python
116
from pypdf import PdfReader
117

118
reader = PdfReader("document.pdf")
119
metadata = reader.metadata
120

121
if metadata:
122
    print(f"Title: {metadata.title}")
123
    print(f"Author: {metadata.author}")
124
    print(f"Subject: {metadata.subject}")
125
    print(f"Creator: {metadata.creator}")
126
    print(f"Producer: {metadata.producer}")
127
    print(f"Creation Date: {metadata.creation_date}")
128
    print(f"Modification Date: {metadata.modification_date}")
129
    print(f"Keywords: {metadata.keywords}")
130
else:
131
    print("No metadata available")
132
```
133

134
### Reading Raw Metadata
135

136
```python
137
from pypdf import PdfReader
138

139
reader = PdfReader("document.pdf")
140
metadata = reader.metadata
141

142
if metadata:
143
    # Compare processed vs raw values
144
    print("Processed values:")
145
    print(f"  Title: {metadata.title}")
146
    print(f"  Author: {metadata.author}")
147
    
148
    print("\nRaw values:")
149
    print(f"  Title: {metadata.title_raw}")
150
    print(f"  Author: {metadata.author_raw}")
151
```
152

153
### Writing Metadata
154

155
```python
156
from pypdf import PdfReader, PdfWriter
157
from datetime import datetime
158

159
reader = PdfReader("input.pdf")
160
writer = PdfWriter()
161

162
# Copy all pages
163
for page in reader.pages:
164
    writer.add_page(page)
165

166
# Set metadata
167
writer.add_metadata({
168
    "/Title": "Updated Document Title",
169
    "/Author": "John Doe",
170
    "/Subject": "Updated document subject",
171
    "/Creator": "My Application",
172
    "/Producer": "pypdf",
173
    "/Keywords": "PDF, metadata, pypdf",
174
    "/CreationDate": datetime.now(),
175
    "/ModDate": datetime.now()
176
})
177

178
with open("output_with_metadata.pdf", "wb") as output:
179
    writer.write(output)
180
```
181

182
### Copying and Modifying Metadata
183

184
```python
185
from pypdf import PdfReader, PdfWriter
186
from datetime import datetime
187

188
reader = PdfReader("input.pdf")
189
writer = PdfWriter()
190

191
# Copy pages
192
for page in reader.pages:
193
    writer.add_page(page)
194

195
# Get existing metadata
196
existing_metadata = reader.metadata
197

198
# Create updated metadata dictionary
199
new_metadata = {}
200
if existing_metadata:
201
    # Copy existing metadata
202
    if existing_metadata.title:
203
        new_metadata["/Title"] = existing_metadata.title
204
    if existing_metadata.author:
205
        new_metadata["/Author"] = existing_metadata.author
206
    if existing_metadata.subject:
207
        new_metadata["/Subject"] = existing_metadata.subject
208
    if existing_metadata.creator:
209
        new_metadata["/Creator"] = existing_metadata.creator
210
    if existing_metadata.keywords:
211
        new_metadata["/Keywords"] = existing_metadata.keywords
212

213
# Update specific fields
214
new_metadata["/Producer"] = "pypdf 6.0.0"
215
new_metadata["/ModDate"] = datetime.now()
216

217
# Add custom metadata
218
new_metadata["/Custom"] = "Custom metadata value"
219

220
writer.add_metadata(new_metadata)
221

222
with open("updated_metadata.pdf", "wb") as output:
223
    writer.write(output)
224
```
225

226
### Working with XMP Metadata
227

228
```python
229
from pypdf import PdfReader
230

231
reader = PdfReader("document_with_xmp.pdf")
232

233
# Check if XMP metadata exists
234
if reader.xmp_metadata:
235
    print("XMP metadata found")
236
    
237
    # Get Dublin Core elements
238
    dc_namespace = "http://purl.org/dc/elements/1.1/"
239
    about_uri = ""
240
    
241
    try:
242
        title = reader.xmp_metadata.get_element(about_uri, dc_namespace, "title")
243
        creator = reader.xmp_metadata.get_element(about_uri, dc_namespace, "creator")
244
        description = reader.xmp_metadata.get_element(about_uri, dc_namespace, "description")
245
        
246
        print(f"DC Title: {title}")
247
        print(f"DC Creator: {creator}")
248
        print(f"DC Description: {description}")
249
        
250
    except Exception as e:
251
        print(f"Error reading XMP metadata: {e}")
252
        
253
else:
254
    print("No XMP metadata found")
255
```
256

257
### Metadata Extraction Report
258

259
```python
260
from pypdf import PdfReader
261
from datetime import datetime
262
import json
263

264
def extract_metadata_report(pdf_path: str) -> dict:
265
    """
266
    Extract comprehensive metadata report from a PDF.
267
    
268
    Args:
269
        pdf_path: Path to PDF file
270
        
271
    Returns:
272
        Dictionary containing all metadata information
273
    """
274
    report = {
275
        "file_path": pdf_path,
276
        "extraction_time": datetime.now().isoformat(),
277
        "basic_metadata": {},
278
        "raw_metadata": {},
279
        "xmp_metadata": {},
280
        "document_info": {}
281
    }
282
    
283
    try:
284
        reader = PdfReader(pdf_path)
285
        
286
        # Basic document information
287
        report["document_info"] = {
288
            "page_count": len(reader.pages),
289
            "is_encrypted": reader.is_encrypted,
290
            "pdf_header": reader.pdf_header
291
        }
292
        
293
        # Standard metadata
294
        if reader.metadata:
295
            metadata = reader.metadata
296
            
297
            # Processed metadata
298
            report["basic_metadata"] = {
299
                "title": metadata.title,
300
                "author": metadata.author,
301
                "subject": metadata.subject,
302
                "creator": metadata.creator,
303
                "producer": metadata.producer,
304
                "creation_date": metadata.creation_date.isoformat() if metadata.creation_date else None,
305
                "modification_date": metadata.modification_date.isoformat() if metadata.modification_date else None,
306
                "keywords": metadata.keywords
307
            }
308
            
309
            # Raw metadata
310
            report["raw_metadata"] = {
311
                "title_raw": metadata.title_raw,
312
                "author_raw": metadata.author_raw,
313
                "subject_raw": metadata.subject_raw,
314
                "creator_raw": metadata.creator_raw,
315
                "producer_raw": metadata.producer_raw,
316
                "creation_date_raw": metadata.creation_date_raw,
317
                "modification_date_raw": metadata.modification_date_raw,
318
                "keywords_raw": metadata.keywords_raw
319
            }
320
        
321
        # XMP metadata
322
        if reader.xmp_metadata:
323
            report["xmp_metadata"]["present"] = True
324
            # Note: XMP parsing would require more specific implementation
325
            # based on the actual XMP structure in the document
326
        else:
327
            report["xmp_metadata"]["present"] = False
328
            
329
    except Exception as e:
330
        report["error"] = str(e)
331
    
332
    return report
333

334
# Generate metadata report
335
report = extract_metadata_report("document.pdf")
336
print(json.dumps(report, indent=2))
337
```
338

339
### Batch Metadata Processing
340

341
```python
342
from pypdf import PdfReader, PdfWriter
343
from pathlib import Path
344
import csv
345
from datetime import datetime
346

347
def extract_metadata_to_csv(pdf_directory: str, csv_output: str):
348
    """
349
    Extract metadata from all PDFs in a directory to CSV.
350
    
351
    Args:
352
        pdf_directory: Directory containing PDF files
353
        csv_output: Output CSV file path
354
    """
355
    
356
    metadata_records = []
357
    
358
    for pdf_path in Path(pdf_directory).glob("*.pdf"):
359
        try:
360
            reader = PdfReader(str(pdf_path))
361
            metadata = reader.metadata
362
            
363
            record = {
364
                "filename": pdf_path.name,
365
                "title": metadata.title if metadata else "",
366
                "author": metadata.author if metadata else "",
367
                "subject": metadata.subject if metadata else "",
368
                "creator": metadata.creator if metadata else "",
369
                "producer": metadata.producer if metadata else "",
370
                "creation_date": metadata.creation_date if metadata else "",
371
                "modification_date": metadata.modification_date if metadata else "",
372
                "keywords": metadata.keywords if metadata else "",
373
                "page_count": len(reader.pages),
374
                "is_encrypted": reader.is_encrypted,
375
                "pdf_version": reader.pdf_header
376
            }
377
            
378
            metadata_records.append(record)
379
            
380
        except Exception as e:
381
            print(f"Error processing {pdf_path.name}: {e}")
382
    
383
    # Write to CSV
384
    if metadata_records:
385
        with open(csv_output, 'w', newline='', encoding='utf-8') as csvfile:
386
            fieldnames = metadata_records[0].keys()
387
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
388
            
389
            writer.writeheader()
390
            for record in metadata_records:
391
                writer.writerow(record)
392
        
393
        print(f"Metadata extracted to {csv_output}")
394
        print(f"Processed {len(metadata_records)} PDF files")
395

396
# Extract metadata from all PDFs to CSV
397
extract_metadata_to_csv("pdf_collection/", "pdf_metadata.csv")
398
```
399

400
### Setting Custom Metadata Fields
401

402
```python
403
from pypdf import PdfReader, PdfWriter
404
from datetime import datetime
405

406
reader = PdfReader("input.pdf")
407
writer = PdfWriter()
408

409
# Copy pages
410
for page in reader.pages:
411
    writer.add_page(page)
412

413
# Set comprehensive metadata with custom fields
414
metadata = {
415
    # Standard fields
416
    "/Title": "My Document",
417
    "/Author": "Jane Smith",
418
    "/Subject": "Important Document",
419
    "/Creator": "My Application v2.0",
420
    "/Producer": "pypdf 6.0.0",
421
    "/Keywords": "important, document, processing",
422
    "/CreationDate": datetime.now(),
423
    "/ModDate": datetime.now(),
424
    
425
    # Custom fields
426
    "/Department": "Engineering",
427
    "/ProjectCode": "PROJ-2024-001",
428
    "/Classification": "Internal",
429
    "/ReviewDate": datetime(2024, 12, 31),
430
    "/Version": "1.0",
431
    "/ApprovedBy": "Manager Name"
432
}
433

434
writer.add_metadata(metadata)
435

436
with open("document_with_custom_metadata.pdf", "wb") as output:
437
    writer.write(output)
438
```

Version

Tile

Files

docs/metadata.md

docs/metadata.md