or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

annotations.md form-fields.md index.md metadata.md page-operations.md reading-writing.md text-extraction.md utilities.md

docs/metadata.md

0

# Metadata

1

2

Access and manipulation of PDF metadata, document properties, XMP information, and custom document attributes. pypdf provides comprehensive metadata handling for both reading existing information and setting new properties.

3

4

## Capabilities

5

6

### Document Information

7

8

The DocumentInformation class provides access to standard PDF metadata fields with both processed and raw value access.

9

10

```python { .api }

11

class DocumentInformation:

12

@property

13

def title(self) -> str | None:

14

"""Get the document title (processed)."""

15

16

@property

17

def title_raw(self) -> str | None:

18

"""Get the raw document title."""

19

20

@property

21

def author(self) -> str | None:

22

"""Get the document author (processed)."""

23

24

@property

25

def author_raw(self) -> str | None:

26

"""Get the raw document author."""

27

28

@property

29

def subject(self) -> str | None:

30

"""Get the document subject (processed)."""

31

32

@property

33

def subject_raw(self) -> str | None:

34

"""Get the raw document subject."""

35

36

@property

37

def creator(self) -> str | None:

38

"""Get the creating application (processed)."""

39

40

@property

41

def creator_raw(self) -> str | None:

42

"""Get the raw creating application."""

43

44

@property

45

def producer(self) -> str | None:

46

"""Get the PDF producer (processed)."""

47

48

@property

49

def producer_raw(self) -> str | None:

50

"""Get the raw PDF producer."""

51

52

@property

53

def creation_date(self) -> datetime | None:

54

"""Get the creation date as datetime object."""

55

56

@property

57

def creation_date_raw(self) -> str | None:

58

"""Get the raw creation date string."""

59

60

@property

61

def modification_date(self) -> datetime | None:

62

"""Get the modification date as datetime object."""

63

64

@property

65

def modification_date_raw(self) -> str | None:

66

"""Get the raw modification date string."""

67

68

@property

69

def keywords(self) -> str | None:

70

"""Get the document keywords (processed)."""

71

72

@property

73

def keywords_raw(self) -> str | None:

74

"""Get the raw document keywords."""

75

```

76

77

### XMP Metadata

78

79

Extended metadata support through XMP (Extensible Metadata Platform) for advanced metadata handling.

80

81

```python { .api }

82

class XmpInformation:
    """XMP metadata information class for advanced metadata handling."""

    def get_element(self, about_uri: str, namespace: str, name: str):
        """
        Get the XMP metadata nodes matching a namespaced name.

        Args:
            about_uri: URI identifying the resource
            namespace: XML namespace
            name: Element name

        Returns:
            Iterator over the matching XML nodes; it is empty when nothing
            matches, so wrap it in ``list(...)`` before printing or indexing.
        """

    def get_nodes_in_namespace(self, about_uri: str, namespace: str) -> list:
        """
        Get all nodes in a specific namespace.

        Args:
            about_uri: URI identifying the resource
            namespace: XML namespace

        Returns:
            Nodes in the namespace.
            NOTE(review): upstream pypdf appears to implement this as a
            generator rather than returning a list; confirm against the
            installed version before relying on ``len()`` or indexing.
        """

109

```

110

111

## Usage Examples

112

113

### Reading Basic Metadata

114

115

```python

116

from pypdf import PdfReader

reader = PdfReader("document.pdf")
metadata = reader.metadata

# Guard first: every field below is read from the metadata object.
if metadata:
    print(f"Title: {metadata.title}")
    print(f"Author: {metadata.author}")
    print(f"Subject: {metadata.subject}")
    print(f"Creator: {metadata.creator}")
    print(f"Producer: {metadata.producer}")
    print(f"Creation Date: {metadata.creation_date}")
    print(f"Modification Date: {metadata.modification_date}")
    print(f"Keywords: {metadata.keywords}")
else:
    print("No metadata available")

132

```

133

134

### Reading Raw Metadata

135

136

```python

137

from pypdf import PdfReader

reader = PdfReader("document.pdf")
metadata = reader.metadata

if metadata:
    # Compare processed vs raw values
    print("Processed values:")
    print(f" Title: {metadata.title}")
    print(f" Author: {metadata.author}")

    print("\nRaw values:")
    print(f" Title: {metadata.title_raw}")
    print(f" Author: {metadata.author_raw}")

151

```

152

153

### Writing Metadata

154

155

```python

156

from datetime import datetime

from pypdf import PdfReader, PdfWriter

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy all pages
for page in reader.pages:
    writer.add_page(page)

# Dates are passed as PDF date strings (D:YYYYMMDDHHmmSS), following pypdf's
# documented usage. A datetime object would be written in its str() form,
# which is not a PDF date and cannot be read back as one.
timestamp = datetime.now().strftime("D:%Y%m%d%H%M%S")

# Set metadata
writer.add_metadata({
    "/Title": "Updated Document Title",
    "/Author": "John Doe",
    "/Subject": "Updated document subject",
    "/Creator": "My Application",
    "/Producer": "pypdf",
    "/Keywords": "PDF, metadata, pypdf",
    "/CreationDate": timestamp,
    "/ModDate": timestamp,
})

with open("output_with_metadata.pdf", "wb") as output:
    writer.write(output)

180

```

181

182

### Copying and Modifying Metadata

183

184

```python

185

from datetime import datetime

from pypdf import PdfReader, PdfWriter

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy pages
for page in reader.pages:
    writer.add_page(page)

# Get existing metadata
existing_metadata = reader.metadata

# Create updated metadata dictionary
new_metadata = {}
if existing_metadata:
    # Copy existing metadata, skipping fields that are missing or empty
    copied_fields = (
        ("/Title", existing_metadata.title),
        ("/Author", existing_metadata.author),
        ("/Subject", existing_metadata.subject),
        ("/Creator", existing_metadata.creator),
        ("/Keywords", existing_metadata.keywords),
    )
    for key, value in copied_fields:
        if value:
            new_metadata[key] = value

# Update specific fields
new_metadata["/Producer"] = "pypdf 6.0.0"
# Use a PDF date string (D:YYYYMMDDHHmmSS); a datetime object would be
# written in its str() form, which is not a valid PDF date.
new_metadata["/ModDate"] = datetime.now().strftime("D:%Y%m%d%H%M%S")

# Add custom metadata
new_metadata["/Custom"] = "Custom metadata value"

writer.add_metadata(new_metadata)

with open("updated_metadata.pdf", "wb") as output:
    writer.write(output)

224

```

225

226

### Working with XMP Metadata

227

228

```python

229

from pypdf import PdfReader

reader = PdfReader("document_with_xmp.pdf")

# Check if XMP metadata exists (read the property once and reuse the object)
xmp = reader.xmp_metadata
if xmp is not None:
    print("XMP metadata found")

    # Get Dublin Core elements
    dc_namespace = "http://purl.org/dc/elements/1.1/"
    about_uri = ""

    try:
        # The dc_* properties return the parsed Dublin Core values.
        title = xmp.dc_title
        creator = xmp.dc_creator
        description = xmp.dc_description

        print(f"DC Title: {title}")
        print(f"DC Creator: {creator}")
        print(f"DC Description: {description}")

        # get_element() gives lower-level access to the XML nodes. Its result
        # is iterated, so collect it into a list before counting or printing.
        title_nodes = list(xmp.get_element(about_uri, dc_namespace, "title"))
        print(f"Raw dc:title nodes: {len(title_nodes)}")

    except Exception as e:
        print(f"Error reading XMP metadata: {e}")

else:
    print("No XMP metadata found")

255

```

256

257

### Metadata Extraction Report

258

259

```python

260

from pypdf import PdfReader

261

from datetime import datetime

262

import json

263

264

def _iso_date_or_none(metadata, attribute: str):
    """Return the ISO-8601 text of a parsed date property, or None if unset or unparsable."""
    try:
        value = getattr(metadata, attribute)
    except ValueError:
        # NOTE(review): pypdf is expected to raise ValueError for a malformed
        # date string; the raw string is still reported under "raw_metadata".
        return None
    return value.isoformat() if value else None


def extract_metadata_report(pdf_path: str) -> dict:
    """
    Extract comprehensive metadata report from a PDF.

    The report always has the keys ``file_path``, ``extraction_time``,
    ``basic_metadata``, ``raw_metadata``, ``xmp_metadata`` and
    ``document_info``. If reading fails part-way, the sections collected so
    far are kept and the exception message is stored under ``error``.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Dictionary containing all metadata information
    """
    report = {
        "file_path": pdf_path,
        "extraction_time": datetime.now().isoformat(),
        "basic_metadata": {},
        "raw_metadata": {},
        "xmp_metadata": {},
        "document_info": {},
    }

    try:
        reader = PdfReader(pdf_path)

        # Basic document information
        report["document_info"] = {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "pdf_header": reader.pdf_header,
        }

        # Standard metadata
        metadata = reader.metadata
        if metadata:
            # Raw metadata is collected first so it is already in the report
            # if reading a processed value fails afterwards.
            report["raw_metadata"] = {
                "title_raw": metadata.title_raw,
                "author_raw": metadata.author_raw,
                "subject_raw": metadata.subject_raw,
                "creator_raw": metadata.creator_raw,
                "producer_raw": metadata.producer_raw,
                "creation_date_raw": metadata.creation_date_raw,
                "modification_date_raw": metadata.modification_date_raw,
                "keywords_raw": metadata.keywords_raw,
            }

            # Processed metadata
            report["basic_metadata"] = {
                "title": metadata.title,
                "author": metadata.author,
                "subject": metadata.subject,
                "creator": metadata.creator,
                "producer": metadata.producer,
                "creation_date": _iso_date_or_none(metadata, "creation_date"),
                "modification_date": _iso_date_or_none(metadata, "modification_date"),
                "keywords": metadata.keywords,
            }

        # XMP metadata
        # Note: XMP parsing would require more specific implementation
        # based on the actual XMP structure in the document
        report["xmp_metadata"]["present"] = reader.xmp_metadata is not None

    except Exception as e:
        # Best-effort report: record the failure instead of raising.
        report["error"] = str(e)

    return report

333

334

# Generate metadata report

335

report = extract_metadata_report("document.pdf")

336

print(json.dumps(report, indent=2))

337

```

338

339

### Batch Metadata Processing

340

341

```python

342

from pypdf import PdfReader, PdfWriter

343

from pathlib import Path

344

import csv

345

from datetime import datetime

346

347

def extract_metadata_to_csv(pdf_directory: str, csv_output: str):
    """
    Extract metadata from all PDFs in a directory to CSV.

    Files matching ``*.pdf`` directly inside the directory are processed in
    sorted order. A file that cannot be read is reported on stdout and
    skipped. No CSV is written when no file could be processed.

    Args:
        pdf_directory: Directory containing PDF files
        csv_output: Output CSV file path
    """
    metadata_records = []

    # Sorted so the CSV row order is the same on every run.
    for pdf_path in sorted(Path(pdf_directory).glob("*.pdf")):
        try:
            reader = PdfReader(str(pdf_path))
            metadata = reader.metadata

            record = {
                "filename": pdf_path.name,
                "title": metadata.title if metadata else "",
                "author": metadata.author if metadata else "",
                "subject": metadata.subject if metadata else "",
                "creator": metadata.creator if metadata else "",
                "producer": metadata.producer if metadata else "",
                "creation_date": metadata.creation_date if metadata else "",
                "modification_date": metadata.modification_date if metadata else "",
                "keywords": metadata.keywords if metadata else "",
                "page_count": len(reader.pages),
                "is_encrypted": reader.is_encrypted,
                "pdf_version": reader.pdf_header,
            }

            metadata_records.append(record)

        except Exception as e:
            # Best-effort batch: one unreadable file must not stop the run.
            print(f"Error processing {pdf_path.name}: {e}")

    # Write to CSV
    if metadata_records:
        with open(csv_output, 'w', newline='', encoding='utf-8') as csvfile:
            # Every record is built with the same keys, so the first one
            # defines the column order.
            fieldnames = list(metadata_records[0])
            csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            csv_writer.writeheader()
            csv_writer.writerows(metadata_records)

        print(f"Metadata extracted to {csv_output}")
        print(f"Processed {len(metadata_records)} PDF files")

395

396

# Extract metadata from all PDFs to CSV

397

extract_metadata_to_csv("pdf_collection/", "pdf_metadata.csv")

398

```

399

400

### Setting Custom Metadata Fields

401

402

```python

403

from datetime import datetime

from pypdf import PdfReader, PdfWriter

# PDF date strings look like D:YYYYMMDDHHmmSS. Dates are formatted explicitly
# because a datetime object would be written in its str() form, which is not
# a valid PDF date.
PDF_DATE_FORMAT = "D:%Y%m%d%H%M%S"

reader = PdfReader("input.pdf")
writer = PdfWriter()

# Copy pages
for page in reader.pages:
    writer.add_page(page)

now = datetime.now().strftime(PDF_DATE_FORMAT)

# Set comprehensive metadata with custom fields
metadata = {
    # Standard fields
    "/Title": "My Document",
    "/Author": "Jane Smith",
    "/Subject": "Important Document",
    "/Creator": "My Application v2.0",
    "/Producer": "pypdf 6.0.0",
    "/Keywords": "important, document, processing",
    "/CreationDate": now,
    "/ModDate": now,

    # Custom fields
    "/Department": "Engineering",
    "/ProjectCode": "PROJ-2024-001",
    "/Classification": "Internal",
    "/ReviewDate": datetime(2024, 12, 31).strftime(PDF_DATE_FORMAT),
    "/Version": "1.0",
    "/ApprovedBy": "Manager Name",
}

writer.add_metadata(metadata)

with open("document_with_custom_metadata.pdf", "wb") as output:
    writer.write(output)

438

```