0
# Metadata Management
1
2
A comprehensive metadata system for extracting, storing, and manipulating document properties, with support for standard metadata schemas, custom properties, and metadata filtering operations.
3
4
## Capabilities
5
6
### Metadata Container
7
8
The central container class for document metadata, providing a flexible key-value store with support for multiple values per key and standard property interfaces.
9
10
```java { .api }
11
/**
12
* Container for document metadata properties
13
*/
14
public class Metadata implements Serializable {
15
/**
16
* Creates an empty Metadata container
17
*/
18
public Metadata();
19
20
/**
21
* Gets the first value associated with the given property name
22
* @param name Property name to retrieve
23
* @return First value for the property, or null if not set
24
*/
25
public String get(String name);
26
27
/**
28
* Gets all values associated with the given property name
29
* @param name Property name to retrieve
30
* @return Array of all values for the property, never null but may be empty
31
*/
32
public String[] getValues(String name);
33
34
/**
35
* Sets a single value for the given property, replacing any existing values
36
* @param name Property name to set
37
* @param value Value to set for the property
38
*/
39
public void set(String name, String value);
40
41
/**
42
* Adds a value to the given property, preserving existing values
43
* @param name Property name to add to
44
* @param value Value to add for the property
45
*/
46
public void add(String name, String value);
47
48
/**
49
* Removes all values for the given property
50
* @param name Property name to remove
51
*/
52
public void remove(String name);
53
54
/**
55
* Gets all property names that have been set
56
* @return Array of property names with values
57
*/
58
public String[] names();
59
60
/**
61
* Gets the number of properties with values
62
* @return Number of properties that have been set
63
*/
64
public int size();
65
66
/**
67
* Checks whether the container is empty, i.e. no properties have been set
68
* @return true if no properties have values
69
*/
70
public boolean isEmpty();
71
}
72
```
73
74
**Usage Examples:**
75
76
```java
77
import org.apache.tika.metadata.Metadata;
78
import org.apache.tika.metadata.TikaCoreProperties;
79
import org.apache.tika.metadata.DublinCore;
80
81
// Basic metadata operations
82
Metadata metadata = new Metadata();
83
84
// Set standard properties
85
metadata.set(TikaCoreProperties.TITLE, "Document Title");
86
metadata.set(DublinCore.CREATOR, "John Doe");
87
metadata.set(TikaCoreProperties.CREATED, "2023-01-15T10:30:00Z");
88
89
// Add multiple values for the same property
90
metadata.add(DublinCore.SUBJECT, "Technology");
91
metadata.add(DublinCore.SUBJECT, "Programming");
92
93
// Retrieve values
94
String title = metadata.get(TikaCoreProperties.TITLE);
95
String[] subjects = metadata.getValues(DublinCore.SUBJECT);
96
97
// Iterate through all properties
98
for (String name : metadata.names()) {
99
String[] values = metadata.getValues(name);
100
System.out.println(name + ": " + Arrays.toString(values));
101
}
102
```
103
104
### Property Interfaces
105
106
Standard property definitions organized by metadata schemas and document types.
107
108
```java { .api }
109
/**
110
* Interface defining property constants
111
*/
112
public interface Property {
113
/**
114
* Gets the property name
115
* @return String name of the property
116
*/
117
String getName();
118
119
/**
120
* Checks if this property allows multiple values
121
* @return true if multiple values are allowed
122
*/
123
boolean isMultiValuePermitted();
124
}
125
126
/**
127
* Core Tika metadata properties
128
*/
129
public interface TikaCoreProperties {
130
/** Document title */
131
Property TITLE = Property.internalText("title");
132
133
/** Document creator/author */
134
Property CREATOR = Property.internalText("dc:creator");
135
136
/** Document subject/description */
137
Property SUBJECT = Property.internalText("subject");
138
139
/** Document creation date */
140
Property CREATED = Property.internalDate("dcterms:created");
141
142
/** Document modification date */
143
Property MODIFIED = Property.internalDate("dcterms:modified");
144
145
/** Content type/MIME type */
146
Property CONTENT_TYPE = Property.internalText("Content-Type");
147
148
/** Character encoding */
149
Property CONTENT_ENCODING = Property.internalText("Content-Encoding");
150
151
/** Document language */
152
Property LANGUAGE = Property.internalText("language");
153
154
/** Resource name (filename) */
155
Property RESOURCE_NAME_KEY = Property.internalText("resourceName");
156
157
/** Number of pages */
158
Property PAGE_COUNT = Property.internalInteger("xmpTPg:NPages");
159
160
/** Number of words */
161
Property WORD_COUNT = Property.internalInteger("meta:word-count");
162
163
/** Number of characters */
164
Property CHARACTER_COUNT = Property.internalInteger("meta:character-count");
165
}
166
```
167
168
### Dublin Core Properties
169
170
Standard Dublin Core metadata elements for bibliographic information.
171
172
```java { .api }
173
/**
174
* Dublin Core metadata properties
175
*/
176
public interface DublinCore {
177
/** Document contributor */
178
Property CONTRIBUTOR = Property.internalTextBag("dc:contributor");
179
180
/** Document coverage */
181
Property COVERAGE = Property.internalText("dc:coverage");
182
183
/** Document creator */
184
Property CREATOR = Property.internalTextBag("dc:creator");
185
186
/** Document date */
187
Property DATE = Property.internalDate("dc:date");
188
189
/** Document description */
190
Property DESCRIPTION = Property.internalText("dc:description");
191
192
/** Document format */
193
Property FORMAT = Property.internalText("dc:format");
194
195
/** Document identifier */
196
Property IDENTIFIER = Property.internalText("dc:identifier");
197
198
/** Document language */
199
Property LANGUAGE = Property.internalText("dc:language");
200
201
/** Document publisher */
202
Property PUBLISHER = Property.internalText("dc:publisher");
203
204
/** Document relation */
205
Property RELATION = Property.internalText("dc:relation");
206
207
/** Document rights */
208
Property RIGHTS = Property.internalText("dc:rights");
209
210
/** Document source */
211
Property SOURCE = Property.internalText("dc:source");
212
213
/** Document subject */
214
Property SUBJECT = Property.internalTextBag("dc:subject");
215
216
/** Document title */
217
Property TITLE = Property.internalText("dc:title");
218
219
/** Document type */
220
Property TYPE = Property.internalText("dc:type");
221
}
222
```
223
224
### Office Document Properties
225
226
Properties specific to office documents (Microsoft Office, LibreOffice, etc.).
227
228
```java { .api }
229
/**
230
* Generic office document properties
231
*/
232
public interface Office {
233
/** Application name that created the document */
234
Property APPLICATION = Property.internalText("Application-Name");
235
236
/** Application version */
237
Property APPLICATION_VERSION = Property.internalText("Application-Version");
238
239
/** Document category */
240
Property CATEGORY = Property.internalText("Category");
241
242
/** Document company */
243
Property COMPANY = Property.internalText("Company");
244
245
/** Document keywords */
246
Property KEYWORDS = Property.internalTextBag("Keywords");
247
248
/** Document manager */
249
Property MANAGER = Property.internalText("Manager");
250
251
/** Document comments */
252
Property COMMENTS = Property.internalText("Comments");
253
254
/** Document template */
255
Property TEMPLATE = Property.internalText("Template");
256
257
/** Total editing time */
258
Property TOTAL_TIME = Property.internalInteger("Total-Time");
259
260
/** Document revision number */
261
Property REVISION_NUMBER = Property.internalText("Revision-Number");
262
263
/** Document security level */
264
Property SECURITY = Property.internalInteger("Security");
265
266
/** Number of slides (presentations) */
267
Property SLIDE_COUNT = Property.internalInteger("Slide-Count");
268
269
/** Number of paragraphs */
270
Property PARAGRAPH_COUNT = Property.internalInteger("Paragraph-Count");
271
272
/** Number of lines */
273
Property LINE_COUNT = Property.internalInteger("Line-Count");
274
}
275
```
276
277
### PDF-Specific Properties
278
279
Properties specific to PDF documents.
280
281
```java { .api }
282
/**
283
* PDF document properties
284
*/
285
public interface PDF {
286
/** PDF version */
287
Property PDF_VERSION = Property.internalText("pdf:PDFVersion");
288
289
/** PDF producer */
290
Property PRODUCER = Property.internalText("producer");
291
292
/** PDF encryption status */
293
Property ENCRYPTED = Property.internalBoolean("pdf:encrypted");
294
295
/** PDF permissions */
296
Property PERMISSIONS = Property.internalInteger("access_permission:extract_content");
297
298
/** PDF optimization */
299
Property OPTIMIZED = Property.internalBoolean("pdf:optimized");
300
301
/** PDF tagged */
302
Property TAGGED = Property.internalBoolean("pdf:tagged");
303
304
/** Number of characters with spaces */
305
Property CHARACTERS_WITH_SPACES = Property.internalInteger("pdf:charsWithSpaces");
306
307
/** PDF/A conformance */
308
Property PDFA_VERSION = Property.internalText("pdfa:version");
309
310
/** PDF/UA compliance */
311
Property PDFUA_VERSION = Property.internalText("pdfua:version");
312
313
/** Document ID */
314
Property DOC_INFO_ID_1 = Property.internalText("pdf:docinfo:id1");
315
316
/** Modification date from PDF info */
317
Property DOC_INFO_MODIFICATION_DATE = Property.internalDate("pdf:docinfo:modified");
318
319
/** Creation date from PDF info */
320
Property DOC_INFO_CREATION_DATE = Property.internalDate("pdf:docinfo:created");
321
}
322
```
323
324
### Image Properties
325
326
Properties for image documents and embedded images.
327
328
```java { .api }
329
/**
330
* TIFF image properties
331
*/
332
public interface TIFF {
333
/** Image width in pixels */
334
Property IMAGE_WIDTH = Property.internalInteger("tiff:ImageWidth");
335
336
/** Image height in pixels */
337
Property IMAGE_LENGTH = Property.internalInteger("tiff:ImageLength");
338
339
/** Bits per sample */
340
Property BITS_PER_SAMPLE = Property.internalIntegerSequence("tiff:BitsPerSample");
341
342
/** Compression type */
343
Property COMPRESSION = Property.internalInteger("tiff:Compression");
344
345
/** Color space */
346
Property COLOR_SPACE = Property.internalText("ColorSpace");
347
348
/** Resolution unit */
349
Property RESOLUTION_UNIT = Property.internalInteger("tiff:ResolutionUnit");
350
351
/** X resolution */
352
Property X_RESOLUTION = Property.internalRational("tiff:XResolution");
353
354
/** Y resolution */
355
Property Y_RESOLUTION = Property.internalRational("tiff:YResolution");
356
357
/** Orientation */
358
Property ORIENTATION = Property.internalInteger("tiff:Orientation");
359
}
360
361
/**
362
* JPEG image properties
363
*/
364
public interface JPEG {
365
/** JPEG compression quality */
366
Property COMPRESSION_QUALITY = Property.internalReal("JPEG Compression Quality");
367
368
/** Color components */
369
Property COLOR_COMPONENTS = Property.internalInteger("Number of Components");
370
371
/** Image width */
372
Property IMAGE_WIDTH = Property.internalInteger("Image Width");
373
374
/** Image height */
375
Property IMAGE_HEIGHT = Property.internalInteger("Image Height");
376
}
377
```
378
379
### Metadata Filtering
380
381
System for filtering and transforming metadata during extraction and processing.
382
383
```java { .api }
384
/**
385
* Interface for filtering metadata
386
*/
387
public interface MetadataFilter {
388
/**
389
* Filters the given metadata
390
* @param metadata Metadata to filter
391
* @param context Parse context for configuration
392
*/
393
void filter(Metadata metadata, ParseContext context) throws TikaException;
394
}
395
396
/**
397
* Composite metadata filter combining multiple filters
398
*/
399
public class CompositeMetadataFilter implements MetadataFilter {
400
/**
401
* Creates a CompositeMetadataFilter with the specified filters
402
* @param filters Array of MetadataFilter instances to combine
403
*/
404
public CompositeMetadataFilter(MetadataFilter... filters);
405
406
/**
407
* Gets the list of filters
408
* @return List of MetadataFilter instances
409
*/
410
public List<MetadataFilter> getFilters();
411
}
412
413
/**
414
* Filter that normalizes date formats
415
*/
416
public class DateNormalizingMetadataFilter implements MetadataFilter {
417
/**
418
* Creates a DateNormalizingMetadataFilter with default configuration
419
*/
420
public DateNormalizingMetadataFilter();
421
422
/**
423
* Filters metadata by normalizing date formats
424
* @param metadata Metadata to process
425
* @param context Parse context (unused)
426
*/
427
public void filter(Metadata metadata, ParseContext context) throws TikaException;
428
}
429
430
/**
431
* Filter that clears metadata based on MIME type
432
*/
433
public class ClearByMimeMetadataFilter implements MetadataFilter {
434
/**
435
* Creates a filter that clears metadata for specified MIME types
436
* @param mimeTypes Set of MediaType objects to clear metadata for
437
*/
438
public ClearByMimeMetadataFilter(Set<MediaType> mimeTypes);
439
440
/**
441
* Filters metadata by clearing it for matching MIME types
442
* @param metadata Metadata to process
443
* @param context Parse context containing MIME type information
444
*/
445
public void filter(Metadata metadata, ParseContext context) throws TikaException;
446
}
447
```
448
449
### Write Filtering
450
451
System for filtering metadata during write operations to prevent sensitive information leakage.
452
453
```java { .api }
454
/**
455
* Interface for filtering metadata during write operations
456
*/
457
public interface MetadataWriteFilter {
458
/**
459
* Filters metadata before writing
460
* @param metadata Metadata to filter
461
* @param context Write context
462
* @return Filtered metadata safe for writing
463
*/
464
Metadata filterMetadata(Metadata metadata, WriteContext context);
465
}
466
467
/**
468
* Standard write filter with common filtering rules
469
*/
470
public class StandardWriteFilter implements MetadataWriteFilter {
471
/**
472
* Creates a StandardWriteFilter with default rules
473
*/
474
public StandardWriteFilter();
475
476
/**
477
* Filters sensitive metadata before writing
478
* @param metadata Original metadata
479
* @param context Write context
480
* @return Filtered metadata
481
*/
482
public Metadata filterMetadata(Metadata metadata, WriteContext context);
483
484
/**
485
* Adds a property to the exclusion list
486
* @param property Property to exclude from output
487
*/
488
public void excludeProperty(Property property);
489
490
/**
491
* Adds a property pattern to the exclusion list
492
* @param pattern Regular expression pattern for property names to exclude
493
*/
494
public void excludePattern(String pattern);
495
}
496
```
497
498
### List Filtering
499
500
Specialized filtering for metadata containing list values.
501
502
```java { .api }
503
/**
504
* Interface for filtering metadata lists
505
*/
506
public interface MetadataListFilter {
507
/**
508
* Filters a list of metadata objects
509
* @param metadataList List of Metadata objects to filter
510
* @param context Processing context
511
* @return Filtered list of Metadata objects
512
*/
513
List<Metadata> filter(List<Metadata> metadataList, ParseContext context) throws TikaException;
514
}
515
```
516
517
## Metadata Schemas and Standards
518
519
### Standard Property Mappings
520
521
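Common metadata property mappings across different standards. Note that some constants shown below (for example `Office.TITLE`, `Office.AUTHOR`, `Office.CREATION_DATE`, `PDF.TITLE`, and `PDF.AUTHOR`) do not appear in the interface listings earlier in this document, so verify them against the Tika Javadoc for your version before use: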
522
523
```java
524
// Document title mappings
525
TikaCoreProperties.TITLE // Generic title
526
DublinCore.TITLE // Dublin Core title
527
Office.TITLE // Office document title
528
PDF.TITLE // PDF document title
529
530
// Author/Creator mappings
531
TikaCoreProperties.CREATOR // Generic creator
532
DublinCore.CREATOR // Dublin Core creator
533
Office.AUTHOR // Office document author
534
PDF.AUTHOR // PDF document author
535
536
// Date mappings
537
TikaCoreProperties.CREATED // Generic creation date
538
TikaCoreProperties.MODIFIED // Generic modification date
539
DublinCore.DATE // Dublin Core date
540
Office.CREATION_DATE // Office creation date
541
PDF.DOC_INFO_CREATION_DATE // PDF creation date
542
```
543
544
### Custom Properties
545
546
```java
547
// Working with custom properties
548
Metadata metadata = new Metadata();
549
550
// Set custom properties
551
metadata.set("custom:department", "Engineering");
552
metadata.set("custom:project", "Atlas");
553
metadata.add("custom:tags", "important");
554
metadata.add("custom:tags", "review-needed");
555
556
// Define custom property interfaces
557
public interface CustomProperties {
558
Property DEPARTMENT = Property.internalText("custom:department");
559
Property PROJECT = Property.internalText("custom:project");
560
Property TAGS = Property.internalTextBag("custom:tags");
561
}
562
```
563
564
## Advanced Metadata Operations
565
566
### Metadata Merging
567
568
```java
569
// Merge metadata from multiple sources
570
Metadata combined = new Metadata();
571
572
// Copy all properties from source metadata
573
for (String name : sourceMetadata.names()) {
574
String[] values = sourceMetadata.getValues(name);
575
for (String value : values) {
576
combined.add(name, value);
577
}
578
}
579
```
580
581
### Type-Safe Property Access
582
583
```java
584
// Type-safe property operations using Property interfaces
585
Metadata metadata = new Metadata();
586
587
// Set using Property constants
588
metadata.set(TikaCoreProperties.TITLE, "Document Title");
589
metadata.set(TikaCoreProperties.PAGE_COUNT, "150");
590
591
// Get with type conversion
592
String title = metadata.get(TikaCoreProperties.TITLE);
593
Integer pageCount = metadata.getInt(TikaCoreProperties.PAGE_COUNT);
594
Date created = metadata.getDate(TikaCoreProperties.CREATED);
595
```
596
597
## Performance and Memory Considerations
598
599
- **Property Interning**: Property names are interned to reduce memory usage
600
- **Value Storage**: Multiple values per property are stored efficiently
601
- **Filtering Performance**: Metadata filters should be lightweight operations
602
- **Memory Footprint**: Large metadata sets may require streaming processing