or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md content-processing.md detection.md embedded-extraction.md embedding.md exceptions.md index.md io-utilities.md language.md metadata.md mime-types.md parsing.md pipes.md process-forking.md rendering.md

docs/metadata.md

0

# Metadata Management

1

2

A comprehensive metadata system for extracting, storing, and manipulating document properties. It supports standard metadata schemas, custom properties, and metadata filtering.

3

4

## Capabilities

5

6

### Metadata Container

7

8

`Metadata` is the central container class for document metadata. It provides a flexible key-value store that supports multiple values per key and works with the standard property interfaces.

9

10

```java { .api }

11

/**

12

* Container for document metadata properties

13

*/

14

public class Metadata implements Serializable {

15

/**

16

* Creates an empty Metadata container

17

*/

18

public Metadata();

19

20

/**

21

* Gets the first value associated with the given property name

22

* @param name Property name to retrieve

23

* @return First value for the property, or null if not set

24

*/

25

public String get(String name);

26

27

/**

28

* Gets all values associated with the given property name

29

* @param name Property name to retrieve

30

* @return Array of all values for the property, never null but may be empty

31

*/

32

public String[] getValues(String name);

33

34

/**

35

* Sets a single value for the given property, replacing any existing values

36

* @param name Property name to set

37

* @param value Value to set for the property

38

*/

39

public void set(String name, String value);

40

41

/**

42

* Adds a value to the given property, preserving existing values

43

* @param name Property name to add to

44

* @param value Value to add for the property

45

*/

46

public void add(String name, String value);

47

48

/**

49

* Removes all values for the given property

50

* @param name Property name to remove

51

*/

52

public void remove(String name);

53

54

/**

55

* Gets all property names that have been set

56

* @return Array of property names with values

57

*/

58

public String[] names();

59

60

/**

61

* Gets the number of properties with values

62

* @return Number of properties that have been set

63

*/

64

public int size();

65

66

/**

67

* Checks whether the container is empty, i.e. no properties have been set

68

* @return true if no properties have values

69

*/

70

public boolean isEmpty();

71

}

72

```

73

74

**Usage Examples:**

75

76

```java

77

import org.apache.tika.metadata.Metadata;

78

import org.apache.tika.metadata.TikaCoreProperties;

79

import org.apache.tika.metadata.DublinCore;

80

81

// Basic metadata operations

82

Metadata metadata = new Metadata();

83

84

// Set standard properties

85

metadata.set(TikaCoreProperties.TITLE, "Document Title");

86

metadata.set(DublinCore.CREATOR, "John Doe");

87

metadata.set(TikaCoreProperties.CREATED, "2023-01-15T10:30:00Z");

88

89

// Add multiple values for same property

90

metadata.add(DublinCore.SUBJECT, "Technology");

91

metadata.add(DublinCore.SUBJECT, "Programming");

92

93

// Retrieve values

94

String title = metadata.get(TikaCoreProperties.TITLE);

95

String[] subjects = metadata.getValues(DublinCore.SUBJECT);

96

97

// Iterate through all properties

98

for (String name : metadata.names()) {

99

String[] values = metadata.getValues(name);

100

System.out.println(name + ": " + java.util.Arrays.toString(values));

101

}

102

```

103

104

### Property Interfaces

105

106

Standard property definitions organized by metadata schemas and document types.

107

108

```java { .api }

109

/**

110

* Interface defining property constants

111

*/

112

public interface Property {

113

/**

114

* Gets the property name

115

* @return String name of the property

116

*/

117

String getName();

118

119

/**

120

* Checks if this property allows multiple values

121

* @return true if multiple values are allowed

122

*/

123

boolean isMultiValuePermitted();

124

}

125

126

/**

127

* Core Tika metadata properties

128

*/

129

public interface TikaCoreProperties {

130

/** Document title */

131

Property TITLE = Property.internalText("title");

132

133

/** Document creator/author */

134

Property CREATOR = Property.internalText("dc:creator");

135

136

/** Document subject/description */

137

Property SUBJECT = Property.internalText("subject");

138

139

/** Document creation date */

140

Property CREATED = Property.internalDate("dcterms:created");

141

142

/** Document modification date */

143

Property MODIFIED = Property.internalDate("dcterms:modified");

144

145

/** Content type/MIME type */

146

Property CONTENT_TYPE = Property.internalText("Content-Type");

147

148

/** Character encoding */

149

Property CONTENT_ENCODING = Property.internalText("Content-Encoding");

150

151

/** Document language */

152

Property LANGUAGE = Property.internalText("language");

153

154

/** Resource name (filename) */

155

Property RESOURCE_NAME_KEY = Property.internalText("resourceName");

156

157

/** Number of pages */

158

Property PAGE_COUNT = Property.internalInteger("xmpTPg:NPages");

159

160

/** Number of words */

161

Property WORD_COUNT = Property.internalInteger("meta:word-count");

162

163

/** Number of characters */

164

Property CHARACTER_COUNT = Property.internalInteger("meta:character-count");

165

}

166

```

167

168

### Dublin Core Properties

169

170

Standard Dublin Core metadata elements for bibliographic information.

171

172

```java { .api }

173

/**

174

* Dublin Core metadata properties

175

*/

176

public interface DublinCore {

177

/** Document contributor */

178

Property CONTRIBUTOR = Property.internalTextBag("dc:contributor");

179

180

/** Document coverage */

181

Property COVERAGE = Property.internalText("dc:coverage");

182

183

/** Document creator */

184

Property CREATOR = Property.internalTextBag("dc:creator");

185

186

/** Document date */

187

Property DATE = Property.internalDate("dc:date");

188

189

/** Document description */

190

Property DESCRIPTION = Property.internalText("dc:description");

191

192

/** Document format */

193

Property FORMAT = Property.internalText("dc:format");

194

195

/** Document identifier */

196

Property IDENTIFIER = Property.internalText("dc:identifier");

197

198

/** Document language */

199

Property LANGUAGE = Property.internalText("dc:language");

200

201

/** Document publisher */

202

Property PUBLISHER = Property.internalText("dc:publisher");

203

204

/** Document relation */

205

Property RELATION = Property.internalText("dc:relation");

206

207

/** Document rights */

208

Property RIGHTS = Property.internalText("dc:rights");

209

210

/** Document source */

211

Property SOURCE = Property.internalText("dc:source");

212

213

/** Document subject */

214

Property SUBJECT = Property.internalTextBag("dc:subject");

215

216

/** Document title */

217

Property TITLE = Property.internalText("dc:title");

218

219

/** Document type */

220

Property TYPE = Property.internalText("dc:type");

221

}

222

```

223

224

### Office Document Properties

225

226

Properties specific to office documents (Microsoft Office, LibreOffice, etc.).

227

228

```java { .api }

229

/**

230

* Generic office document properties

231

*/

232

public interface Office {

233

/** Application name that created the document */

234

Property APPLICATION = Property.internalText("Application-Name");

235

236

/** Application version */

237

Property APPLICATION_VERSION = Property.internalText("Application-Version");

238

239

/** Document category */

240

Property CATEGORY = Property.internalText("Category");

241

242

/** Document company */

243

Property COMPANY = Property.internalText("Company");

244

245

/** Document keywords */

246

Property KEYWORDS = Property.internalTextBag("Keywords");

247

248

/** Document manager */

249

Property MANAGER = Property.internalText("Manager");

250

251

/** Document comments */

252

Property COMMENTS = Property.internalText("Comments");

253

254

/** Document template */

255

Property TEMPLATE = Property.internalText("Template");

256

257

/** Total editing time */

258

Property TOTAL_TIME = Property.internalInteger("Total-Time");

259

260

/** Document revision number */

261

Property REVISION_NUMBER = Property.internalText("Revision-Number");

262

263

/** Document security level */

264

Property SECURITY = Property.internalInteger("Security");

265

266

/** Number of slides (presentations) */

267

Property SLIDE_COUNT = Property.internalInteger("Slide-Count");

268

269

/** Number of paragraphs */

270

Property PARAGRAPH_COUNT = Property.internalInteger("Paragraph-Count");

271

272

/** Number of lines */

273

Property LINE_COUNT = Property.internalInteger("Line-Count");

274

}

275

```

276

277

### PDF-Specific Properties

278

279

Properties specific to PDF documents.

280

281

```java { .api }

282

/**

283

* PDF document properties

284

*/

285

public interface PDF {

286

/** PDF version */

287

Property PDF_VERSION = Property.internalText("pdf:PDFVersion");

288

289

/** PDF producer */

290

Property PRODUCER = Property.internalText("producer");

291

292

/** PDF encryption status */

293

Property ENCRYPTED = Property.internalBoolean("pdf:encrypted");

294

295

/** PDF permissions */

296

Property PERMISSIONS = Property.internalInteger("access_permission:extract_content");

297

298

/** PDF optimization */

299

Property OPTIMIZED = Property.internalBoolean("pdf:optimized");

300

301

/** PDF tagged */

302

Property TAGGED = Property.internalBoolean("pdf:tagged");

303

304

/** Number of characters with spaces */

305

Property CHARACTERS_WITH_SPACES = Property.internalInteger("pdf:charsWithSpaces");

306

307

/** PDF/A conformance */

308

Property PDFA_VERSION = Property.internalText("pdfa:version");

309

310

/** PDF/UA compliance */

311

Property PDFUA_VERSION = Property.internalText("pdfua:version");

312

313

/** Document ID */

314

Property DOC_INFO_ID_1 = Property.internalText("pdf:docinfo:id1");

315

316

/** Modification date from PDF info */

317

Property DOC_INFO_MODIFICATION_DATE = Property.internalDate("pdf:docinfo:modified");

318

319

/** Creation date from PDF info */

320

Property DOC_INFO_CREATION_DATE = Property.internalDate("pdf:docinfo:created");

321

}

322

```

323

324

### Image Properties

325

326

Properties for image documents and embedded images.

327

328

```java { .api }

329

/**

330

* TIFF image properties

331

*/

332

public interface TIFF {

333

/** Image width in pixels */

334

Property IMAGE_WIDTH = Property.internalInteger("tiff:ImageWidth");

335

336

/** Image height in pixels */

337

Property IMAGE_LENGTH = Property.internalInteger("tiff:ImageLength");

338

339

/** Bits per sample */

340

Property BITS_PER_SAMPLE = Property.internalIntegerSequence("tiff:BitsPerSample");

341

342

/** Compression type */

343

Property COMPRESSION = Property.internalInteger("tiff:Compression");

344

345

/** Color space */

346

Property COLOR_SPACE = Property.internalText("ColorSpace");

347

348

/** Resolution unit */

349

Property RESOLUTION_UNIT = Property.internalInteger("tiff:ResolutionUnit");

350

351

/** X resolution */

352

Property X_RESOLUTION = Property.internalRational("tiff:XResolution");

353

354

/** Y resolution */

355

Property Y_RESOLUTION = Property.internalRational("tiff:YResolution");

356

357

/** Orientation */

358

Property ORIENTATION = Property.internalInteger("tiff:Orientation");

359

}

360

361

/**

362

* JPEG image properties

363

*/

364

public interface JPEG {

365

/** JPEG compression quality */

366

Property COMPRESSION_QUALITY = Property.internalReal("JPEG Compression Quality");

367

368

/** Color components */

369

Property COLOR_COMPONENTS = Property.internalInteger("Number of Components");

370

371

/** Image width */

372

Property IMAGE_WIDTH = Property.internalInteger("Image Width");

373

374

/** Image height */

375

Property IMAGE_HEIGHT = Property.internalInteger("Image Height");

376

}

377

```

378

379

### Metadata Filtering

380

381

System for filtering and transforming metadata during extraction and processing.

382

383

```java { .api }

384

/**

385

* Interface for filtering metadata

386

*/

387

public interface MetadataFilter {

388

/**

389

* Filters the given metadata

390

* @param metadata Metadata to filter

391

* @param context Parse context for configuration

392

*/

393

void filter(Metadata metadata, ParseContext context) throws TikaException;

394

}

395

396

/**

397

* Composite metadata filter combining multiple filters

398

*/

399

public class CompositeMetadataFilter implements MetadataFilter {

400

/**

401

* Creates a CompositeMetadataFilter with the specified filters

402

* @param filters Array of MetadataFilter instances to combine

403

*/

404

public CompositeMetadataFilter(MetadataFilter... filters);

405

406

/**

407

* Gets the list of filters

408

* @return List of MetadataFilter instances

409

*/

410

public List<MetadataFilter> getFilters();

411

}

412

413

/**

414

* Filter that normalizes date formats

415

*/

416

public class DateNormalizingMetadataFilter implements MetadataFilter {

417

/**

418

* Creates a DateNormalizingMetadataFilter with default configuration

419

*/

420

public DateNormalizingMetadataFilter();

421

422

/**

423

* Filters metadata by normalizing date formats

424

* @param metadata Metadata to process

425

* @param context Parse context (unused)

426

*/

427

public void filter(Metadata metadata, ParseContext context) throws TikaException;

428

}

429

430

/**

431

* Filter that clears metadata based on MIME type

432

*/

433

public class ClearByMimeMetadataFilter implements MetadataFilter {

434

/**

435

* Creates a filter that clears metadata for specified MIME types

436

* @param mimeTypes Set of MediaType objects to clear metadata for

437

*/

438

public ClearByMimeMetadataFilter(Set<MediaType> mimeTypes);

439

440

/**

441

* Filters metadata by clearing it for matching MIME types

442

* @param metadata Metadata to process

443

* @param context Parse context containing MIME type information

444

*/

445

public void filter(Metadata metadata, ParseContext context) throws TikaException;

446

}

447

```

448

449

### Write Filtering

450

451

System for filtering metadata during write operations to prevent sensitive information leakage.

452

453

```java { .api }

454

/**

455

* Interface for filtering metadata during write operations

456

*/

457

public interface MetadataWriteFilter {

458

/**

459

* Filters metadata before writing

460

* @param metadata Metadata to filter

461

* @param context Write context

462

* @return Filtered metadata safe for writing

463

*/

464

Metadata filterMetadata(Metadata metadata, WriteContext context);

465

}

466

467

/**

468

* Standard write filter with common filtering rules

469

*/

470

public class StandardWriteFilter implements MetadataWriteFilter {

471

/**

472

* Creates a StandardWriteFilter with default rules

473

*/

474

public StandardWriteFilter();

475

476

/**

477

* Filters sensitive metadata before writing

478

* @param metadata Original metadata

479

* @param context Write context

480

* @return Filtered metadata

481

*/

482

public Metadata filterMetadata(Metadata metadata, WriteContext context);

483

484

/**

485

* Adds a property to the exclusion list

486

* @param property Property to exclude from output

487

*/

488

public void excludeProperty(Property property);

489

490

/**

491

* Adds a property pattern to the exclusion list

492

* @param pattern Regular expression pattern for property names to exclude

493

*/

494

public void excludePattern(String pattern);

495

}

496

```

497

498

### List Filtering

499

500

Specialized filtering for metadata containing list values.

501

502

```java { .api }

503

/**

504

* Interface for filtering metadata lists

505

*/

506

public interface MetadataListFilter {

507

/**

508

* Filters a list of metadata objects

509

* @param metadataList List of Metadata objects to filter

510

* @param context Processing context

511

* @return Filtered list of Metadata objects

512

*/

513

List<Metadata> filter(List<Metadata> metadataList, ParseContext context) throws TikaException;

514

}

515

```

516

517

## Metadata Schemas and Standards

518

519

### Standard Property Mappings

520

521

Common metadata property mappings across different standards:

522

523

```java

524

// Document title mappings

525

TikaCoreProperties.TITLE // Generic title

526

DublinCore.TITLE // Dublin Core title

527

Office.TITLE // Office document title

528

PDF.TITLE // PDF document title

529

530

// Author/Creator mappings

531

TikaCoreProperties.CREATOR // Generic creator

532

DublinCore.CREATOR // Dublin Core creator

533

Office.AUTHOR // Office document author

534

PDF.AUTHOR // PDF document author

535

536

// Date mappings

537

TikaCoreProperties.CREATED // Generic creation date

538

TikaCoreProperties.MODIFIED // Generic modification date

539

DublinCore.DATE // Dublin Core date

540

Office.CREATION_DATE // Office creation date

541

PDF.DOC_INFO_CREATION_DATE // PDF creation date

542

```

543

544

### Custom Properties

545

546

```java

547

// Working with custom properties

548

Metadata metadata = new Metadata();

549

550

// Set custom properties

551

metadata.set("custom:department", "Engineering");

552

metadata.set("custom:project", "Atlas");

553

metadata.add("custom:tags", "important");

554

metadata.add("custom:tags", "review-needed");

555

556

// Define custom property interfaces

557

public interface CustomProperties {

558

Property DEPARTMENT = Property.internalText("custom:department");

559

Property PROJECT = Property.internalText("custom:project");

560

Property TAGS = Property.internalTextBag("custom:tags");

561

}

562

```

563

564

## Advanced Metadata Operations

565

566

### Metadata Merging

567

568

```java

569

// Merge metadata from multiple sources

570

Metadata combined = new Metadata();

571

572

// Copy all properties from source metadata

573

for (String name : sourceMetadata.names()) {

574

String[] values = sourceMetadata.getValues(name);

575

for (String value : values) {

576

combined.add(name, value);

577

}

578

}

579

```

580

581

### Type-Safe Property Access

582

583

```java

584

// Type-safe property operations using Property interfaces

585

Metadata metadata = new Metadata();

586

587

// Set using Property constants

588

metadata.set(TikaCoreProperties.TITLE, "Document Title");

589

metadata.set(TikaCoreProperties.PAGE_COUNT, "150");

590

591

// Get with type conversion

592

String title = metadata.get(TikaCoreProperties.TITLE);

593

Integer pageCount = metadata.getInt(TikaCoreProperties.PAGE_COUNT);

594

Date created = metadata.getDate(TikaCoreProperties.CREATED);

595

```

596

597

## Performance and Memory Considerations

598

599

- **Property Interning**: Property names are interned to reduce memory usage

600

- **Value Storage**: Multiple values per property are stored efficiently

601

- **Filtering Performance**: Metadata filters should be lightweight operations

602

- **Memory Footprint**: Large metadata sets may require streaming processing