or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced.md attachments.md content-streams.md core-operations.md encryption.md forms.md images.md index.md metadata.md objects.md outlines.md pages.md

metadata.mddocs/

0

# Metadata and Document Properties

1

2

Document metadata, XMP data, and PDF properties including titles, authors, creation dates, and custom metadata fields. These capabilities enable comprehensive document information management and standards compliance.

3

4

## Capabilities

5

6

### PdfMetadata Class

7

8

Comprehensive XMP metadata management with PDF/A compliance and standards support.

9

10

```python { .api }

11

class PdfMetadata:
    """
    XMP metadata handler for PDF documents.

    Provides access to document metadata following the XMP (Extensible
    Metadata Platform) standard, with support for Dublin Core, PDF, and
    custom metadata schemas.
    """

    def __init__(self, pdf: Pdf, *, sync_docinfo: bool = True) -> None:
        """
        Create a metadata handler for a PDF document.

        Parameters:
        - pdf (Pdf): PDF document to manage metadata for
        - sync_docinfo (bool): Automatically synchronize with the document
          info dictionary

        Raises:
            DependencyError: If required XMP libraries are not available
        """

    @property
    def pdfa_status(self) -> str:
        """
        PDF/A compliance status of the document.

        Returns:
            str: PDF/A status ('1A', '1B', '2A', '2B', '2U', '3A', '3B',
            '3U', or empty if not PDF/A)
        """

    def load_from_docinfo(self, docinfo: Dictionary, *, delete_missing: bool = False) -> None:
        """
        Load metadata from a document info dictionary.

        Parameters:
        - docinfo (Dictionary): Document info dictionary to load from
        - delete_missing (bool): Delete existing metadata not found in docinfo
        """

    def save_to_docinfo(self, docinfo: Dictionary) -> None:
        """
        Save metadata to a document info dictionary.

        Parameters:
        - docinfo (Dictionary): Document info dictionary to update
        """

    @property
    def title(self) -> str:
        """
        Document title.

        Returns:
            str: Title of the document
        """

    @title.setter
    def title(self, value: str) -> None:
        """Set document title."""

    @property
    def author(self) -> str:
        """
        Document author.

        Returns:
            str: Author name or names
        """

    @author.setter
    def author(self, value: str) -> None:
        """Set document author."""

    @property
    def subject(self) -> str:
        """
        Document subject or description.

        Returns:
            str: Subject description
        """

    @subject.setter
    def subject(self, value: str) -> None:
        """Set document subject."""

    @property
    def keywords(self) -> str:
        """
        Document keywords.

        Returns:
            str: Keywords (typically comma-separated)
        """

    @keywords.setter
    def keywords(self, value: str) -> None:
        """Set document keywords."""

    @property
    def creator(self) -> str:
        """
        Application that created the original document.

        Returns:
            str: Name of creating application
        """

    @creator.setter
    def creator(self, value: str) -> None:
        """Set document creator."""

    @property
    def producer(self) -> str:
        """
        Application that converted/produced the PDF.

        Returns:
            str: Name of PDF producing application
        """

    @producer.setter
    def producer(self, value: str) -> None:
        """Set document producer."""

    @property
    def creation_date(self) -> str:
        """
        Document creation date in ISO format.

        Returns:
            str: Creation date (ISO 8601 format)
        """

    @creation_date.setter
    def creation_date(self, value: str) -> None:
        """Set document creation date."""

    @property
    def modification_date(self) -> str:
        """
        Document modification date in ISO format.

        Returns:
            str: Last modification date (ISO 8601 format)
        """

    @modification_date.setter
    def modification_date(self, value: str) -> None:
        """Set document modification date."""

160

```

161

162

### Document Info Dictionary Access

163

164

Direct access to PDF document information dictionary for legacy metadata.

165

166

```python { .api }

167

# Accessed via pdf.docinfo property

168

class DocumentInfo(Dictionary):
    """
    PDF document information dictionary.

    Legacy metadata storage using PDF's built-in document info dictionary.
    Modern documents should use XMP metadata, but this provides compatibility.
    """

    # Standard document info entries (accessed as dictionary keys):
    # '/Title': Document title
    # '/Author': Document author
    # '/Subject': Document subject
    # '/Keywords': Document keywords
    # '/Creator': Creating application
    # '/Producer': PDF producer application
    # '/CreationDate': Creation date (PDF date format)
    # '/ModDate': Modification date (PDF date format)
    # '/Trapped': Trapping status (/True, /False, /Unknown)

186

```

187

188

### Metadata Exceptions

189

190

Specialized exceptions for metadata operations.

191

192

```python { .api }

193

class DependencyError(Exception):
    """
    Raised when required metadata processing libraries are missing.

    Metadata operations may require additional Python packages
    for XMP processing and date handling.
    """

200

```

201

202

## Usage Examples

203

204

### Basic Metadata Operations

205

206

```python

207

import pikepdf

208

from datetime import datetime

209

210

# Open the PDF; the context manager closes it even if an update below fails
with pikepdf.open('document.pdf') as pdf:
    # Access document info dictionary (legacy metadata)
    docinfo = pdf.docinfo

    # Read existing metadata, falling back to a "No <field>" placeholder
    print("Current metadata:")
    for label in ('Title', 'Author', 'Subject', 'Keywords', 'Creator', 'Producer'):
        value = docinfo.get(f'/{label}', f'No {label.lower()}')
        print(f"{label}: {value}")

    # Update metadata
    docinfo['/Title'] = pikepdf.String('Updated Document Title')
    docinfo['/Author'] = pikepdf.String('Jane Doe')
    docinfo['/Subject'] = pikepdf.String('Technical Documentation')
    docinfo['/Keywords'] = pikepdf.String('PDF, documentation, technical, guide')
    docinfo['/Creator'] = pikepdf.String('Python Script')

    # Set creation and modification dates in PDF date format.
    # strftime("%z") expands to an empty string on a naive datetime, which
    # silently drops the UTC offset, so use an aware local time and render
    # the offset in the PDF form +HH'mm'.
    now = datetime.now().astimezone()
    offset = now.strftime("%z")  # e.g. "+0530"
    current_date = now.strftime("D:%Y%m%d%H%M%S") + f"{offset[:3]}'{offset[3:5]}'"
    docinfo['/CreationDate'] = pikepdf.String(current_date)
    docinfo['/ModDate'] = pikepdf.String(current_date)

    pdf.save('updated_metadata.pdf')

239

```

240

241

### Working with XMP Metadata

242

243

```python

244

import pikepdf

245

from datetime import datetime

246

247

# Open PDF and access XMP metadata; the context manager guarantees the file
# is closed whichever branch runs and even if an unexpected error is raised
with pikepdf.open('document.pdf') as pdf:
    try:
        # Create XMP metadata handler
        # NOTE(review): pikepdf's documented entry point for editing XMP is
        # `with pdf.open_metadata() as meta:` - confirm that this
        # constructor/attribute style matches the installed version.
        metadata = pikepdf.PdfMetadata(pdf)

        print("XMP Metadata:")
        print(f"Title: {metadata.title}")
        print(f"Author: {metadata.author}")
        print(f"Subject: {metadata.subject}")
        print(f"Keywords: {metadata.keywords}")
        print(f"Creator: {metadata.creator}")
        print(f"Producer: {metadata.producer}")
        print(f"Creation Date: {metadata.creation_date}")
        print(f"Modification Date: {metadata.modification_date}")
        print(f"PDF/A Status: {metadata.pdfa_status}")

        # Update XMP metadata
        metadata.title = "Comprehensive PDF Guide"
        metadata.author = "Technical Writing Team"
        metadata.subject = "Complete guide to PDF operations using pikepdf"
        metadata.keywords = "PDF, Python, pikepdf, documentation, tutorial"
        metadata.creator = "Python Documentation Generator"

        # Set dates in ISO format (one timestamp shared by both fields)
        now = datetime.now().isoformat()
        metadata.creation_date = now
        metadata.modification_date = now

        # Synchronize XMP with document info
        metadata.save_to_docinfo(pdf.docinfo)

        pdf.save('xmp_updated.pdf')
        print("XMP metadata updated successfully")

    except pikepdf.DependencyError:
        print("XMP processing libraries not available - using basic metadata only")

        # Fall back to basic document info
        docinfo = pdf.docinfo
        docinfo['/Title'] = pikepdf.String("Comprehensive PDF Guide")
        docinfo['/Author'] = pikepdf.String("Technical Writing Team")
        pdf.save('basic_metadata_updated.pdf')

293

```

294

295

### PDF/A Compliance and Metadata

296

297

```python

298

import pikepdf

299

from datetime import datetime

300

301

def create_pdfa_compliant_document():
    """
    Create a new one-page PDF and populate the metadata PDF/A expects.

    Writes 'pdfa_compliant.pdf' to the working directory. Metadata alone
    does not make a file PDF/A conformant; see the list of remaining
    requirements in the body. If the XMP libraries are unavailable, a
    message is printed and nothing is saved.
    """
    # The context manager closes the document even if saving fails
    with pikepdf.new() as pdf:
        page = pdf.add_blank_page()

        # Add minimal content
        # NOTE(review): the stream selects font /F1, but no font resource is
        # registered on the page in this example - confirm viewers render it.
        content = """
        BT
        /F1 12 Tf
        100 700 Td
        (PDF/A Compliant Document) Tj
        ET
        """
        page['/Contents'] = pikepdf.Stream(pdf, content.encode())

        try:
            # Set up XMP metadata for PDF/A compliance
            metadata = pikepdf.PdfMetadata(pdf)

            # Required metadata for PDF/A
            metadata.title = "PDF/A Compliant Document"
            metadata.author = "Document Generator"
            metadata.subject = "Sample PDF/A document with complete metadata"
            metadata.keywords = "PDF/A, compliance, archival, standard"
            metadata.creator = "Python pikepdf library"
            metadata.producer = f"pikepdf {pikepdf.__version__}"

            # Set required dates (one timestamp shared by both fields)
            now = datetime.now().isoformat()
            metadata.creation_date = now
            metadata.modification_date = now

            # Synchronize with document info
            metadata.save_to_docinfo(pdf.docinfo)

            # Additional PDF/A requirements would include:
            # - Embedded fonts
            # - Color profile
            # - Proper XMP packet
            # - No encryption
            # - No external dependencies

            pdf.save('pdfa_compliant.pdf')
            print("Created PDF/A compliant document with metadata")
            print(f"PDF/A Status: {metadata.pdfa_status}")

        except pikepdf.DependencyError:
            print("XMP libraries not available - cannot create full PDF/A compliance")

353

354

create_pdfa_compliant_document()

355

```

356

357

### Metadata Analysis and Reporting

358

359

```python

360

import pikepdf

361

from pathlib import Path

362

from datetime import datetime

363

364

def analyze_pdf_metadata(pdf_path):
    """
    Analyze metadata in a PDF file.

    Parameters:
    - pdf_path (str | Path): Location of the PDF to inspect

    Returns:
        dict: On success, the keys 'file', 'file_size', 'pages',
        'pdf_version', 'is_encrypted', 'docinfo' and 'has_xmp', plus 'xmp'
        when XMP could be read or 'xmp_error' when it could not. On any
        failure, a dict with only 'file' and 'error'.
    """
    try:
        # Accept plain strings as well as Path objects; stat() needs a Path
        path = Path(pdf_path)

        # The context manager closes the file even if analysis raises midway
        with pikepdf.open(path) as pdf:
            analysis = {
                'file': str(pdf_path),
                'file_size': path.stat().st_size,
                'pages': len(pdf.pages),
                'pdf_version': pdf.pdf_version,
                'is_encrypted': pdf.is_encrypted,
            }

            # Document info metadata; absent entries become empty strings
            docinfo = pdf.docinfo
            docinfo_keys = {
                'title': '/Title',
                'author': '/Author',
                'subject': '/Subject',
                'keywords': '/Keywords',
                'creator': '/Creator',
                'producer': '/Producer',
                'creation_date': '/CreationDate',
                'modification_date': '/ModDate',
                'trapped': '/Trapped',
            }
            analysis['docinfo'] = {
                label: str(docinfo.get(pdf_key, ''))
                for label, pdf_key in docinfo_keys.items()
            }

            # Try XMP metadata; failure here is recorded, not fatal
            try:
                metadata = pikepdf.PdfMetadata(pdf)
                analysis['xmp'] = {
                    'title': metadata.title,
                    'author': metadata.author,
                    'subject': metadata.subject,
                    'keywords': metadata.keywords,
                    'creator': metadata.creator,
                    'producer': metadata.producer,
                    'creation_date': metadata.creation_date,
                    'modification_date': metadata.modification_date,
                    'pdfa_status': metadata.pdfa_status,
                }
                analysis['has_xmp'] = True
            except pikepdf.DependencyError:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = "XMP libraries not available"
            except Exception as e:
                analysis['has_xmp'] = False
                analysis['xmp_error'] = str(e)

            return analysis

    except Exception as e:
        # Best-effort reporting: one unreadable file must not abort a batch
        return {'file': str(pdf_path), 'error': str(e)}

418

419

def metadata_report(directory_path):
    """
    Generate a comprehensive metadata report for PDFs in a directory.

    Prints one section per '*.pdf' file directly inside the directory,
    using analyze_pdf_metadata() for each file. Files that could not be
    analyzed are listed with their error message.

    Parameters:
    - directory_path (str | Path): Directory to scan (not recursive)
    """
    directory = Path(directory_path)
    # Sort so the report order is stable across runs and platforms
    pdf_files = sorted(directory.glob('*.pdf'))

    print(f"PDF Metadata Report for: {directory}")
    print("=" * 80)

    for pdf_file in pdf_files:
        analysis = analyze_pdf_metadata(pdf_file)

        if 'error' in analysis:
            print(f"\n❌ {pdf_file.name}: {analysis['error']}")
            continue

        print(f"\n📄 {pdf_file.name}")
        print(f" Size: {analysis['file_size']:,} bytes, "
              f"Pages: {analysis['pages']}, "
              f"Version: {analysis['pdf_version']}")

        if analysis['is_encrypted']:
            print(" 🔒 ENCRYPTED")

        # Document Info metadata: print only the populated fields
        docinfo = analysis['docinfo']
        if any(docinfo.values()):
            print(" Document Info:")
            docinfo_rows = (
                ('Title', 'title'),
                ('Author', 'author'),
                ('Creator', 'creator'),
                ('Producer', 'producer'),
                ('Created', 'creation_date'),
                ('Modified', 'modification_date'),
            )
            for label, key in docinfo_rows:
                if docinfo[key]:
                    print(f" {label}: {docinfo[key]}")
        else:
            print(" 📋 No Document Info metadata")

        # XMP metadata
        if analysis['has_xmp']:
            xmp = analysis['xmp']
            if any([xmp['title'], xmp['author'], xmp['subject']]):
                print(" XMP Metadata:")
                xmp_rows = (
                    ('Title', 'title'),
                    ('Author', 'author'),
                    ('Subject', 'subject'),
                    ('PDF/A', 'pdfa_status'),
                )
                for label, key in xmp_rows:
                    if xmp[key]:
                        print(f" {label}: {xmp[key]}")
            else:
                print(" 📋 XMP present but minimal")
        elif 'xmp_error' in analysis:
            print(f" ⚠️ XMP: {analysis['xmp_error']}")

469

470

# Generate metadata report

471

# metadata_report('.')

472

```

473

474

### Batch Metadata Operations

475

476

```python

477

import pikepdf

478

from pathlib import Path

479

from datetime import datetime

480

481

def standardize_metadata(directory_path, template_metadata):
    """
    Standardize metadata across multiple PDF files.

    Every '*.pdf' directly inside the directory is updated in place:
    author, creator and producer are taken from the template when present,
    the modification date is refreshed, and a missing title is derived
    from the file name. Encrypted files are skipped.

    Parameters:
    - directory_path (str | Path): Directory containing the PDFs
    - template_metadata (dict): Optional 'author', 'creator' and
      'producer' values; falsy or missing entries are left untouched

    Returns:
        dict: 'updated' (list of paths), 'failed' (list of (path, error))
        and 'skipped' (list of (path, reason))
    """
    directory = Path(directory_path)
    results = {'updated': [], 'failed': [], 'skipped': []}
    template_fields = (
        ('author', '/Author'),
        ('creator', '/Creator'),
        ('producer', '/Producer'),
    )

    for pdf_file in sorted(directory.glob('*.pdf')):
        try:
            # allow_overwriting_input is required for the argument-less
            # save() below to write back to the file that was opened. The
            # context manager closes the handle on every path, so a failed
            # open can never leave a stale `pdf` from a previous iteration.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                # Skip encrypted files
                if pdf.is_encrypted:
                    results['skipped'].append((str(pdf_file), "Encrypted"))
                    continue

                docinfo = pdf.docinfo
                now = datetime.now().astimezone()

                # Apply template metadata
                for key, pdf_name in template_fields:
                    if template_metadata.get(key):
                        docinfo[pdf_name] = pikepdf.String(template_metadata[key])

                # Update modification date
                docinfo['/ModDate'] = pikepdf.String(_pdf_timestamp(now))

                # Preserve existing title if present, otherwise use filename
                if not docinfo.get('/Title'):
                    title = pdf_file.stem.replace('_', ' ').replace('-', ' ').title()
                    docinfo['/Title'] = pikepdf.String(title)

                # Try XMP update if available
                try:
                    metadata = pikepdf.PdfMetadata(pdf)
                    for key, _ in template_fields:
                        if template_metadata.get(key):
                            setattr(metadata, key, template_metadata[key])

                    metadata.modification_date = now.isoformat()
                    metadata.save_to_docinfo(docinfo)
                except pikepdf.DependencyError:
                    pass  # XMP not available, document info is sufficient

                # Save changes back to the original file
                pdf.save()

            results['updated'].append(str(pdf_file))

        except Exception as e:
            # Best-effort batch: record the failure and keep going
            results['failed'].append((str(pdf_file), str(e)))

    print("Metadata standardization complete:")
    print(f" Updated: {len(results['updated'])} files")
    print(f" Failed: {len(results['failed'])} files")
    print(f" Skipped: {len(results['skipped'])} files")

    return results


def _pdf_timestamp(moment):
    """Format a datetime as a PDF date string: D:YYYYMMDDHHmmSS[+HH'mm']."""
    stamp = moment.strftime("D:%Y%m%d%H%M%S")
    offset = moment.strftime("%z")  # empty for naive datetimes
    if offset:
        # PDF writes the offset as +HH'mm' rather than strftime's +HHMM
        stamp += f"{offset[:3]}'{offset[3:5]}'"
    return stamp

550

551

# Standardize metadata with template

552

# Values applied to every file by standardize_metadata()
template = {
    'author': 'Corporate Documentation Team',
    'creator': 'Document Management System',
    'producer': f'pikepdf {pikepdf.__version__}',
}

557

558

# results = standardize_metadata('.', template)

559

```

560

561

### Custom Metadata Fields

562

563

```python

564

import pikepdf

565

566

def add_custom_metadata(pdf_path, custom_fields):
    """
    Add custom metadata fields to a PDF, saving the file in place.

    Parameters:
    - pdf_path (str | Path): PDF to modify
    - custom_fields (dict): Field name -> value; values are stored as
      strings. Names may be given with or without the leading '/'.
    """
    # allow_overwriting_input is required for the argument-less save()
    # below; the context manager closes the file even if saving fails.
    with pikepdf.open(pdf_path, allow_overwriting_input=True) as pdf:
        docinfo = pdf.docinfo

        # Add custom fields to document info
        for field_name, field_value in custom_fields.items():
            # Custom fields should use proper PDF name format; do not
            # double the slash when the caller already supplied one
            name = str(field_name)
            pdf_field_name = name if name.startswith('/') else f'/{name}'
            docinfo[pdf_field_name] = pikepdf.String(str(field_value))

        # Also try to add to XMP if available
        try:
            metadata = pikepdf.PdfMetadata(pdf)

            # Custom XMP properties would require namespace registration
            # For basic use, document info is sufficient
            metadata.save_to_docinfo(docinfo)

        except pikepdf.DependencyError:
            pass  # XMP not available, document info is sufficient

        pdf.save()

    print(f"Added custom metadata to {pdf_path}")

592

593

# Add custom metadata

594

# Example custom fields; each key becomes a '/<Key>' docinfo entry
custom_metadata = {
    'Department': 'Engineering',
    'Project': 'API Documentation',
    'Version': '2.1.0',
    'Status': 'Final',
    'ReviewedBy': 'Technical Lead',
    'ApprovalDate': '2024-09-10',
    'DocumentID': 'DOC-2024-001',
    'SecurityClass': 'Internal',
}

604

605

# add_custom_metadata('document.pdf', custom_metadata)

606

607

def extract_custom_metadata(pdf_path):
    """
    Extract and display all metadata including custom fields.

    Prints the standard document info entries that are present, followed
    by every other key found in the document info dictionary.

    Parameters:
    - pdf_path (str | Path): PDF to inspect
    """
    # Standard fields, in display order
    standard_fields = ('/Title', '/Author', '/Subject', '/Keywords',
                       '/Creator', '/Producer', '/CreationDate', '/ModDate',
                       '/Trapped')

    # The context manager closes the file even if printing a value raises
    with pikepdf.open(pdf_path) as pdf:
        docinfo = pdf.docinfo

        print(f"All metadata for: {pdf_path}")
        print("=" * 50)

        print("Standard Fields:")
        for field in standard_fields:
            if field in docinfo:
                print(f" {field[1:]}: {docinfo[field]}")

        # Custom fields (anything not in standard list), sorted for stable output
        custom_fields = sorted(key for key in docinfo.keys()
                               if key not in standard_fields)

        if custom_fields:
            print("\nCustom Fields:")
            for field in custom_fields:
                print(f" {field[1:]}: {docinfo[field]}")
        else:
            print("\nNo custom fields found")

636

637

# Extract all metadata including custom fields

638

# extract_custom_metadata('document.pdf')

639

```