Tessl Tile for pypi/pikepdf@9.10.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

advanced.md attachments.md content-streams.md core-operations.md encryption.md forms.md images.md index.md metadata.md objects.md outlines.md pages.md

docs/metadata.md

0
# Metadata and Document Properties
1

2
Document metadata, XMP data, and PDF properties including titles, authors, creation dates, and custom metadata fields. These capabilities enable comprehensive document information management and standards compliance.
3

4
## Capabilities
5

6
### PdfMetadata Class
7

8
Comprehensive XMP metadata management with PDF/A compliance and standards support.
9

10
```python { .api }
11
class PdfMetadata:
12
    """
13
    XMP metadata handler for PDF documents.
14
    
15
    Provides access to document metadata following the XMP (Extensible Metadata Platform)
16
    standard, with support for Dublin Core, PDF, and custom metadata schemas.
17
    """
18
    
19
    def __init__(self, pdf: Pdf, *, sync_docinfo: bool = True) -> None:
20
        """
21
        Create a metadata handler for a PDF document.
22
        
23
        Parameters:
24
        - pdf (Pdf): PDF document to manage metadata for
25
        - sync_docinfo (bool): Automatically synchronize with document info dictionary
26
        
27
        Raises:
28
        DependencyError: If required XMP libraries are not available
29
        """
30
    
31
    @property
32
    def pdfa_status(self) -> str:
33
        """
34
        PDF/A compliance status of the document.
35
        
36
        Returns:
37
        str: PDF/A status ('1A', '1B', '2A', '2B', '2U', '3A', '3B', '3U', or empty if not PDF/A)
38
        """
39
    
40
    def load_from_docinfo(self, docinfo: Dictionary, *, delete_missing: bool = False) -> None:
41
        """
42
        Load metadata from a document info dictionary.
43
        
44
        Parameters:
45
        - docinfo (Dictionary): Document info dictionary to load from
46
        - delete_missing (bool): Delete existing metadata not found in docinfo
47
        """
48
    
49
    def save_to_docinfo(self, docinfo: Dictionary) -> None:
50
        """
51
        Save metadata to a document info dictionary.
52
        
53
        Parameters:
54
        - docinfo (Dictionary): Document info dictionary to update
55
        """
56
    
57
    @property
58
    def title(self) -> str:
59
        """
60
        Document title.
61
        
62
        Returns:
63
        str: Title of the document
64
        """
65
    
66
    @title.setter
67
    def title(self, value: str) -> None:
68
        """Set document title."""
69
    
70
    @property
71
    def author(self) -> str:
72
        """
73
        Document author.
74
        
75
        Returns:
76
        str: Author name or names
77
        """
78
    
79
    @author.setter
80
    def author(self, value: str) -> None:
81
        """Set document author."""
82
    
83
    @property
84
    def subject(self) -> str:
85
        """
86
        Document subject or description.
87
        
88
        Returns:
89
        str: Subject description
90
        """
91
    
92
    @subject.setter
93
    def subject(self, value: str) -> None:
94
        """Set document subject."""
95
    
96
    @property
97
    def keywords(self) -> str:
98
        """
99
        Document keywords.
100
        
101
        Returns:
102
        str: Keywords (typically comma-separated)
103
        """
104
    
105
    @keywords.setter
106
    def keywords(self, value: str) -> None:
107
        """Set document keywords."""
108
    
109
    @property
110
    def creator(self) -> str:
111
        """
112
        Application that created the original document.
113
        
114
        Returns:
115
        str: Name of creating application
116
        """
117
    
118
    @creator.setter
119
    def creator(self, value: str) -> None:
120
        """Set document creator."""
121
    
122
    @property
123
    def producer(self) -> str:
124
        """
125
        Application that converted/produced the PDF.
126
        
127
        Returns:
128
        str: Name of PDF producing application
129
        """
130
    
131
    @producer.setter
132
    def producer(self, value: str) -> None:
133
        """Set document producer."""
134
    
135
    @property
136
    def creation_date(self) -> str:
137
        """
138
        Document creation date in ISO format.
139
        
140
        Returns:
141
        str: Creation date (ISO 8601 format)
142
        """
143
    
144
    @creation_date.setter
145
    def creation_date(self, value: str) -> None:
146
        """Set document creation date."""
147
    
148
    @property
149
    def modification_date(self) -> str:
150
        """
151
        Document modification date in ISO format.
152
        
153
        Returns:
154
        str: Last modification date (ISO 8601 format)
155
        """
156
    
157
    @modification_date.setter
158
    def modification_date(self, value: str) -> None:
159
        """Set document modification date."""
160
```
161

162
### Document Info Dictionary Access
163

164
Direct access to the PDF document information dictionary for legacy metadata.
165

166
```python { .api }
167
# Accessed via pdf.docinfo property
168
class DocumentInfo(Dictionary):
169
    """
170
    PDF document information dictionary.
171
    
172
    Legacy metadata storage using PDF's built-in document info dictionary.
173
    Modern documents should use XMP metadata, but this provides compatibility.
174
    """
175
    
176
    # Standard document info entries (accessed as dictionary keys):
177
    # '/Title': Document title  
178
    # '/Author': Document author
179
    # '/Subject': Document subject
180
    # '/Keywords': Document keywords
181
    # '/Creator': Creating application
182
    # '/Producer': PDF producer application
183
    # '/CreationDate': Creation date (PDF date format)
184
    # '/ModDate': Modification date (PDF date format)
185
    # '/Trapped': Trapping status (/True, /False, /Unknown)
186
```
187

188
### Metadata Exceptions
189

190
Specialized exceptions for metadata operations.
191

192
```python { .api }
193
class DependencyError(Exception):
194
    """
195
    Raised when required metadata processing libraries are missing.
196
    
197
    Metadata operations may require additional Python packages
198
    for XMP processing and date handling.
199
    """
200
```
201

202
## Usage Examples
203

204
### Basic Metadata Operations
205

206
```python
207
import pikepdf
208
from datetime import datetime
209

210
# Open the PDF; the context manager closes it even if an update or the save fails
with pikepdf.open('document.pdf') as pdf:
    # Access document info dictionary (legacy metadata)
    docinfo = pdf.docinfo

    # Read existing metadata, falling back to a "No <field>" placeholder
    print("Current metadata:")
    for label in ('Title', 'Author', 'Subject', 'Keywords', 'Creator', 'Producer'):
        print(f"{label}: {docinfo.get('/' + label, 'No ' + label.lower())}")

    # Update metadata
    updates = {
        '/Title': 'Updated Document Title',
        '/Author': 'Jane Doe',
        '/Subject': 'Technical Documentation',
        '/Keywords': 'PDF, documentation, technical, guide',
        '/Creator': 'Python Script',
    }
    for key, text in updates.items():
        docinfo[key] = pikepdf.String(text)

    # Set creation and modification dates in PDF date format.
    # datetime.now() is naive, so %z expands to an empty string here.
    current_date = datetime.now().strftime("D:%Y%m%d%H%M%S%z")
    docinfo['/CreationDate'] = pikepdf.String(current_date)
    docinfo['/ModDate'] = pikepdf.String(current_date)

    pdf.save('updated_metadata.pdf')
239
```
240

241
### Working with XMP Metadata
242

243
```python
244
import pikepdf
245
from datetime import datetime
246

247
# Open PDF and access XMP metadata; the context manager closes the file on
# every path, including exceptions other than DependencyError
with pikepdf.open('document.pdf') as pdf:
    try:
        # Create XMP metadata handler
        metadata = pikepdf.PdfMetadata(pdf)

        print("XMP Metadata:")
        print(f"Title: {metadata.title}")
        print(f"Author: {metadata.author}")
        print(f"Subject: {metadata.subject}")
        print(f"Keywords: {metadata.keywords}")
        print(f"Creator: {metadata.creator}")
        print(f"Producer: {metadata.producer}")
        print(f"Creation Date: {metadata.creation_date}")
        print(f"Modification Date: {metadata.modification_date}")
        print(f"PDF/A Status: {metadata.pdfa_status}")

        # Update XMP metadata
        metadata.title = "Comprehensive PDF Guide"
        metadata.author = "Technical Writing Team"
        metadata.subject = "Complete guide to PDF operations using pikepdf"
        metadata.keywords = "PDF, Python, pikepdf, documentation, tutorial"
        metadata.creator = "Python Documentation Generator"

        # Set both dates to the same ISO-format timestamp
        now = datetime.now().isoformat()
        metadata.creation_date = now
        metadata.modification_date = now

        # Synchronize XMP with document info
        metadata.save_to_docinfo(pdf.docinfo)

        pdf.save('xmp_updated.pdf')
        print("XMP metadata updated successfully")

    except pikepdf.DependencyError:
        print("XMP processing libraries not available - using basic metadata only")

        # Fall back to basic document info
        docinfo = pdf.docinfo
        docinfo['/Title'] = pikepdf.String("Comprehensive PDF Guide")
        docinfo['/Author'] = pikepdf.String("Technical Writing Team")
        pdf.save('basic_metadata_updated.pdf')
293
```
294

295
### PDF/A Compliance and Metadata
296

297
```python
298
import pikepdf
299
from datetime import datetime
300

301
def create_pdfa_compliant_document():
    """Create a one-page PDF and fill in the metadata fields used for PDF/A.

    Writes 'pdfa_compliant.pdf' to the current working directory and prints
    the resulting PDF/A status. Setting metadata alone does not make a file
    PDF/A conformant; the remaining requirements are listed in the body.
    If the XMP handler raises DependencyError, nothing is saved and a
    message is printed instead.
    """

    # Minimal content stream
    # NOTE(review): /F1 is referenced below but no font resource is added to
    # the page in this example - confirm before relying on the rendered text.
    content = """
    BT
    /F1 12 Tf
    100 700 Td
    (PDF/A Compliant Document) Tj
    ET
    """

    # The context manager closes the document on every path, not only on
    # success or DependencyError
    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()
        page['/Contents'] = pikepdf.Stream(pdf, content.encode())

        try:
            # Set up XMP metadata for PDF/A compliance
            metadata = pikepdf.PdfMetadata(pdf)

            # Required metadata for PDF/A
            metadata.title = "PDF/A Compliant Document"
            metadata.author = "Document Generator"
            metadata.subject = "Sample PDF/A document with complete metadata"
            metadata.keywords = "PDF/A, compliance, archival, standard"
            metadata.creator = "Python pikepdf library"
            metadata.producer = f"pikepdf {pikepdf.__version__}"

            # Set required dates (same timestamp for both)
            now = datetime.now().isoformat()
            metadata.creation_date = now
            metadata.modification_date = now

            # Synchronize with document info
            metadata.save_to_docinfo(pdf.docinfo)

            # Additional PDF/A requirements would include:
            # - Embedded fonts
            # - Color profile
            # - Proper XMP packet
            # - No encryption
            # - No external dependencies

            pdf.save('pdfa_compliant.pdf')
            print("Created PDF/A compliant document with metadata")
            print(f"PDF/A Status: {metadata.pdfa_status}")

        except pikepdf.DependencyError:
            print("XMP libraries not available - cannot create full PDF/A compliance")
353

354
create_pdfa_compliant_document()
355
```
356

357
### Metadata Analysis and Reporting
358

359
```python
360
import pikepdf
361
from pathlib import Path
362
from datetime import datetime
363

364
def analyze_pdf_metadata(pdf_path):
    """Collect file facts, document-info entries and XMP metadata for one PDF.

    Parameters:
    - pdf_path (str or Path): Location of the PDF file to inspect

    Returns:
    dict: On success, the keys 'file', 'file_size', 'pages', 'pdf_version',
    'is_encrypted', 'docinfo' and 'has_xmp', plus 'xmp' when XMP could be
    read or 'xmp_error' when it could not. On failure, only 'file' and
    'error' (the exception message).
    """

    # Report key -> document info dictionary key
    docinfo_keys = {
        'title': '/Title',
        'author': '/Author',
        'subject': '/Subject',
        'keywords': '/Keywords',
        'creator': '/Creator',
        'producer': '/Producer',
        'creation_date': '/CreationDate',
        'modification_date': '/ModDate',
        'trapped': '/Trapped',
    }
    xmp_fields = (
        'title', 'author', 'subject', 'keywords', 'creator', 'producer',
        'creation_date', 'modification_date', 'pdfa_status',
    )

    try:
        # Accept plain strings as well as Path objects
        path = Path(pdf_path)

        # The context manager closes the file even if analysis fails part-way
        with pikepdf.open(path) as pdf:
            analysis = {
                'file': str(pdf_path),
                'file_size': path.stat().st_size,
                'pages': len(pdf.pages),
                'pdf_version': pdf.pdf_version,
                'is_encrypted': pdf.is_encrypted,
            }

            # Document info metadata, stringified so the report holds no PDF objects
            docinfo = pdf.docinfo
            analysis['docinfo'] = {
                name: str(docinfo.get(key, ''))
                for name, key in docinfo_keys.items()
            }

            # Try XMP metadata; failure here is recorded, not fatal
            try:
                metadata = pikepdf.PdfMetadata(pdf)
                analysis['xmp'] = {
                    field: getattr(metadata, field) for field in xmp_fields
                }
                analysis['has_xmp'] = True
            except pikepdf.DependencyError:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = "XMP libraries not available"
            except Exception as e:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = str(e)

            return analysis

    except Exception as e:
        # Boundary for batch reporting: one unreadable file must not stop the run
        return {'file': str(pdf_path), 'error': str(e)}
418

419
def metadata_report(directory_path):
    """Print a metadata summary for every '*.pdf' file directly inside a directory.

    Parameters:
    - directory_path (str or Path): Directory to scan (not recursive)

    Each file is analyzed with analyze_pdf_metadata(); files that could not
    be analyzed are listed with their error message.
    """

    # (printed label, analysis key) pairs, in output order
    docinfo_rows = (
        ('Title', 'title'),
        ('Author', 'author'),
        ('Creator', 'creator'),
        ('Producer', 'producer'),
        ('Created', 'creation_date'),
        ('Modified', 'modification_date'),
    )
    xmp_rows = (
        ('Title', 'title'),
        ('Author', 'author'),
        ('Subject', 'subject'),
        ('PDF/A', 'pdfa_status'),
    )

    directory = Path(directory_path)
    pdf_files = list(directory.glob('*.pdf'))

    print(f"PDF Metadata Report for: {directory}")
    print("=" * 80)

    for pdf_file in pdf_files:
        analysis = analyze_pdf_metadata(pdf_file)

        if 'error' in analysis:
            print(f"\n❌ {pdf_file.name}: {analysis['error']}")
            continue

        print(f"\n📄 {pdf_file.name}")
        print(f"   Size: {analysis['file_size']:,} bytes, "
              f"Pages: {analysis['pages']}, "
              f"Version: {analysis['pdf_version']}")

        if analysis['is_encrypted']:
            print("   🔒 ENCRYPTED")

        # Document Info metadata
        docinfo = analysis['docinfo']
        if not any(docinfo.values()):
            print("   📋 No Document Info metadata")
        else:
            print("   Document Info:")
            for label, key in docinfo_rows:
                if docinfo[key]:
                    print(f"     {label}: {docinfo[key]}")

        # XMP metadata
        if analysis['has_xmp']:
            xmp = analysis['xmp']
            if not (xmp['title'] or xmp['author'] or xmp['subject']):
                print("   📋 XMP present but minimal")
            else:
                print("   XMP Metadata:")
                for label, key in xmp_rows:
                    if xmp[key]:
                        print(f"     {label}: {xmp[key]}")
        elif 'xmp_error' in analysis:
            print(f"   ⚠️  XMP: {analysis['xmp_error']}")
469

470
# Generate metadata report
471
# metadata_report('.')
472
```
473

474
### Batch Metadata Operations
475

476
```python
477
import pikepdf
478
from pathlib import Path
479
from datetime import datetime
480

481
def standardize_metadata(directory_path, template_metadata):
    """Apply template metadata to every '*.pdf' file directly inside a directory.

    Files are modified in place. Encrypted files are skipped; any file that
    raises during processing is recorded as failed and the batch continues.

    Parameters:
    - directory_path (str or Path): Directory to scan (not recursive)
    - template_metadata (dict): Optional 'author', 'creator' and 'producer'
      values; missing or empty entries leave the existing value untouched

    Returns:
    dict: 'updated' (list of paths), 'failed' (list of (path, error) tuples)
    and 'skipped' (list of (path, reason) tuples)
    """

    # Template key -> document info dictionary key
    docinfo_keys = {
        'author': '/Author',
        'creator': '/Creator',
        'producer': '/Producer',
    }

    directory = Path(directory_path)
    # Materialize the listing first: files are rewritten while we iterate
    pdf_files = list(directory.glob('*.pdf'))
    results = {'updated': [], 'failed': [], 'skipped': []}

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input=True is what permits the argument-less
            # pdf.save() below to write back to the opened file; the context
            # manager closes the file on every path.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                # Skip encrypted files
                if pdf.is_encrypted:
                    results['skipped'].append((str(pdf_file), "Encrypted"))
                    continue

                docinfo = pdf.docinfo
                now = datetime.now()

                # Apply template metadata
                for template_key, docinfo_key in docinfo_keys.items():
                    value = template_metadata.get(template_key)
                    if value:
                        docinfo[docinfo_key] = pikepdf.String(value)

                # Update modification date (PDF date format)
                docinfo['/ModDate'] = pikepdf.String(
                    now.strftime("D:%Y%m%d%H%M%S%z")
                )

                # Preserve existing title if present, otherwise use filename
                if not docinfo.get('/Title'):
                    title = pdf_file.stem.replace('_', ' ').replace('-', ' ').title()
                    docinfo['/Title'] = pikepdf.String(title)

                # Try XMP update if available
                try:
                    metadata = pikepdf.PdfMetadata(pdf)
                    for template_key in docinfo_keys:
                        value = template_metadata.get(template_key)
                        if value:
                            setattr(metadata, template_key, value)

                    metadata.modification_date = now.isoformat()
                    metadata.save_to_docinfo(docinfo)
                except pikepdf.DependencyError:
                    pass  # XMP not available, document info is sufficient

                # Save changes back to the original file
                pdf.save()

            results['updated'].append(str(pdf_file))

        except Exception as e:
            # Batch boundary: record the failure and move on to the next file
            results['failed'].append((str(pdf_file), str(e)))

    print("Metadata standardization complete:")
    print(f"  Updated: {len(results['updated'])} files")
    print(f"  Failed: {len(results['failed'])} files")
    print(f"  Skipped: {len(results['skipped'])} files")

    return results
550

551
# Standardize metadata with template
552
template = {
553
    'author': 'Corporate Documentation Team',
554
    'creator': 'Document Management System',
555
    'producer': f'pikepdf {pikepdf.__version__}'
556
}
557

558
# results = standardize_metadata('.', template)
559
```
560

561
### Custom Metadata Fields
562

563
```python
564
import pikepdf
565

566
def add_custom_metadata(pdf_path, custom_fields):
    """Add custom entries to a PDF's document info dictionary, in place.

    Parameters:
    - pdf_path (str or Path): PDF file to modify; it is overwritten
    - custom_fields (dict): Field name -> value. Names are given without the
      leading '/', and values are converted with str()
    """

    # allow_overwriting_input=True is what permits the argument-less
    # pdf.save() below; the context manager closes the file on every path.
    with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
        docinfo = pdf.docinfo

        # Add custom fields to document info, using PDF name format ('/Name')
        for field_name, field_value in custom_fields.items():
            docinfo[f'/{field_name}'] = pikepdf.String(str(field_value))

        # Also try to sync with XMP if available
        try:
            metadata = pikepdf.PdfMetadata(pdf)

            # Custom XMP properties would require namespace registration
            # For basic use, document info is sufficient
            metadata.save_to_docinfo(docinfo)

        except pikepdf.DependencyError:
            pass  # best effort: document info alone is still written

        pdf.save()

    print(f"Added custom metadata to {pdf_path}")
592

593
# Add custom metadata
594
custom_metadata = {
595
    'Department': 'Engineering',
596
    'Project': 'API Documentation',
597
    'Version': '2.1.0',
598
    'Status': 'Final',
599
    'ReviewedBy': 'Technical Lead',
600
    'ApprovalDate': '2024-09-10',
601
    'DocumentID': 'DOC-2024-001',
602
    'SecurityClass': 'Internal'
603
}
604

605
# add_custom_metadata('document.pdf', custom_metadata)
606

607
def extract_custom_metadata(pdf_path):
    """Print every document info entry of a PDF, standard fields first.

    Parameters:
    - pdf_path (str or Path): PDF file to read

    Entries whose key is not one of the standard document info keys are
    listed under "Custom Fields".
    """

    # Standard fields
    standard_fields = ('/Title', '/Author', '/Subject', '/Keywords',
                       '/Creator', '/Producer', '/CreationDate', '/ModDate',
                       '/Trapped')

    # The context manager closes the file even if a lookup or print fails
    with pikepdf.open(pdf_path) as pdf:
        docinfo = pdf.docinfo

        print(f"All metadata for: {pdf_path}")
        print("=" * 50)

        print("Standard Fields:")
        for field in standard_fields:
            if field in docinfo:
                # field[1:] drops the leading '/' for display
                print(f"  {field[1:]}: {docinfo[field]}")

        # Custom fields (anything not in standard list)
        custom_fields = [key for key in docinfo.keys() if key not in standard_fields]

        if custom_fields:
            print("\nCustom Fields:")
            for field in custom_fields:
                print(f"  {field[1:]}: {docinfo[field]}")
        else:
            print("\nNo custom fields found")
636

637
# Extract all metadata including custom fields  
638
# extract_custom_metadata('document.pdf')
639
```

Version

Tile

Files

docs/metadata.md

docs/metadata.md