0
# Document Embedding
1
2
Framework for embedding metadata into documents, allowing modification and insertion of metadata properties into existing files. This system provides both programmatic interfaces and external tool integration for embedding metadata into various document formats while preserving document structure and content.
3
4
## Capabilities
5
6
### Embedder Interface
7
8
Core interface for embedding metadata into documents with support for different document formats and metadata containers.
9
10
```java { .api }
11
/**
12
* Interface for embedding metadata into documents
13
*/
14
public interface Embedder extends Serializable {
15
/**
16
* Returns supported media types for embedding operations
17
* @param context parse context for embedder configuration
18
* @return immutable set of supported media types
19
*/
20
Set<MediaType> getSupportedEmbedTypes(ParseContext context);
21
22
/**
23
* Embeds metadata from Metadata object into document stream
24
* @param metadata document metadata to embed (input and output)
25
* @param originalStream source document stream
26
* @param outputStream target stream for document with embedded metadata
27
* @param context parse context for embedding configuration
28
* @throws IOException if document cannot be read or written
29
* @throws TikaException if embedding operation fails
30
*/
31
void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream,
32
ParseContext context) throws IOException, TikaException;
33
}
34
```
35
36
### External Embedder
37
38
Implementation that uses external command-line tools for embedding metadata into documents.
39
40
```java { .api }
41
/**
42
* Embedder using external programs for metadata embedding
43
*/
44
public class ExternalEmbedder implements Embedder {
45
/** Token replaced with metadata command arguments in command templates */
46
public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";
47
48
/** Token replaced with serialized metadata arguments in command templates */
49
public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}";
50
51
/**
52
* Creates external embedder with default settings
53
*/
54
public ExternalEmbedder();
55
56
/**
57
* Gets supported embed types
58
* @return set of supported media types
59
*/
60
public Set<MediaType> getSupportedEmbedTypes();
61
62
/**
63
* Sets supported embed types for this embedder
64
* @param supportedEmbedTypes set of media types to support
65
*/
66
public void setSupportedEmbedTypes(Set<MediaType> supportedEmbedTypes);
67
68
/**
69
* Gets external command to execute
70
* @return command array with tokens for file paths
71
*/
72
public String[] getCommand();
73
74
/**
75
* Sets external command to execute for embedding
76
* @param command command array supporting INPUT_FILE_TOKEN and OUTPUT_FILE_TOKEN
77
*/
78
public void setCommand(String... command);
79
80
/**
81
* Gets assignment operator for metadata (e.g., "=")
82
* @return assignment operator string
83
*/
84
public String getCommandAssignmentOperator();
85
86
/**
87
* Sets assignment operator for metadata
88
* @param commandAssignmentOperator operator string
89
*/
90
public void setCommandAssignmentOperator(String commandAssignmentOperator);
91
92
/**
93
* Gets delimiter for multiple metadata assignments (e.g., ", ")
94
* @return assignment delimiter string
95
*/
96
public String getCommandAssignmentDelimeter();
97
98
/**
99
* Sets delimiter for multiple metadata assignments
100
* @param commandAssignmentDelimeter delimiter string
101
*/
102
public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter);
103
104
/**
105
* Gets append operator for metadata (e.g., "+=")
106
* @return append operator string
107
*/
108
public String getCommandAppendOperator();
109
110
/**
111
* Sets append operator for multi-valued metadata
112
* @param commandAppendOperator append operator string
113
*/
114
public void setCommandAppendOperator(String commandAppendOperator);
115
116
/**
117
* Gets whether to quote assignment values
118
* @return true if values should be quoted
119
*/
120
public boolean isQuoteAssignmentValues();
121
122
/**
123
* Sets whether to quote assignment values (e.g., tag='value')
124
* @param quoteAssignmentValues true to quote values
125
*/
126
public void setQuoteAssignmentValues(boolean quoteAssignmentValues);
127
128
/**
129
* Gets metadata property to command line parameter mapping
130
* @return mapping of Tika properties to command arguments
131
*/
132
public Map<Property, String[]> getMetadataCommandArguments();
133
134
/**
135
* Sets metadata property to command line parameter mapping
136
* @param arguments mapping of properties to command line parameters
137
*/
138
public void setMetadataCommandArguments(Map<Property, String[]> arguments);
139
}
140
```
141
142
### Utility Methods
143
144
Static utility methods for working with external embedders.
145
146
```java { .api }
147
/**
148
* Utility methods for external embedder operations
149
*/
150
public class ExternalEmbedder {
151
/**
152
* Checks if external command is available and functional
153
* @param checkCmd command to test (e.g., "exiftool --version")
154
* @param errorValue error codes that indicate failure
155
* @return true if command executes successfully
156
*/
157
public static boolean check(String checkCmd, int... errorValue);
158
159
/**
160
* Checks if external command array is available and functional
161
* @param checkCmd command array to test
162
* @param errorValue error codes that indicate failure
163
* @return true if command executes successfully
164
*/
165
public static boolean check(String[] checkCmd, int... errorValue);
166
}
167
```
168
169
## Usage Examples
170
171
**Basic Metadata Embedding with ExifTool:**
172
173
```java
174
import org.apache.tika.embedder.*;
175
import org.apache.tika.metadata.*;
176
import org.apache.tika.mime.MediaType;
177
import org.apache.tika.parser.ParseContext;
178
import java.io.*;
179
import java.util.*;
180
181
// Check if exiftool is available
182
if (ExternalEmbedder.check("exiftool", "-ver")) {
183
// Create embedder for JPEG images
184
ExternalEmbedder embedder = new ExternalEmbedder();
185
186
// Configure supported types
187
Set<MediaType> supportedTypes = new HashSet<>();
188
supportedTypes.add(MediaType.image("jpeg"));
189
supportedTypes.add(MediaType.image("tiff"));
190
embedder.setSupportedEmbedTypes(supportedTypes);
191
192
// Configure exiftool command
193
embedder.setCommand("exiftool",
194
"-overwrite_original",
195
"${METADATA}",
196
"${INPUT_FILE}");
197
198
// Map Tika metadata to exiftool parameters
199
Map<Property, String[]> metadataMapping = new HashMap<>();
200
metadataMapping.put(TikaCoreProperties.TITLE, new String[]{"-Title"});
201
metadataMapping.put(TikaCoreProperties.CREATOR, new String[]{"-Artist", "-Author"});
202
metadataMapping.put(TikaCoreProperties.SUBJECT, new String[]{"-Subject"});
203
metadataMapping.put(TikaCoreProperties.DESCRIPTION, new String[]{"-Description"});
204
embedder.setMetadataCommandArguments(metadataMapping);
205
206
// Prepare metadata to embed
207
Metadata metadata = new Metadata();
208
metadata.set(TikaCoreProperties.TITLE, "Sunset Over Mountains");
209
metadata.set(TikaCoreProperties.CREATOR, "John Photographer");
210
metadata.set(TikaCoreProperties.SUBJECT, "Nature Photography");
211
metadata.set(TikaCoreProperties.DESCRIPTION, "Beautiful sunset captured in the Rocky Mountains");
212
213
// Embed metadata into image
214
try (InputStream input = new FileInputStream("original.jpg");
215
OutputStream output = new FileOutputStream("with_metadata.jpg")) {
216
217
embedder.embed(metadata, input, output, new ParseContext());
218
System.out.println("Metadata successfully embedded");
219
}
220
}
221
```
222
223
**PDF Metadata Embedding with pdftk:**
224
225
```java
226
// Configure embedder for PDF documents
227
ExternalEmbedder pdfEmbedder = new ExternalEmbedder();
228
229
// Set supported type
230
pdfEmbedder.setSupportedEmbedTypes(Set.of(MediaType.application("pdf")));
231
232
// Configure pdftk command with metadata file approach
233
pdfEmbedder.setCommand("pdftk", "${INPUT_FILE}",
234
"update_info_utf8", "metadata.txt",
235
"output", "${OUTPUT_FILE}");
236
237
// Configure metadata mapping for PDF
238
Map<Property, String[]> pdfMapping = new HashMap<>();
239
pdfMapping.put(TikaCoreProperties.TITLE, new String[]{"InfoKey: Title\nInfoValue: "});
240
pdfMapping.put(TikaCoreProperties.CREATOR, new String[]{"InfoKey: Author\nInfoValue: "});
241
pdfMapping.put(TikaCoreProperties.SUBJECT, new String[]{"InfoKey: Subject\nInfoValue: "});
242
pdfEmbedder.setMetadataCommandArguments(pdfMapping);
243
244
// Prepare PDF metadata
245
Metadata pdfMetadata = new Metadata();
246
pdfMetadata.set(TikaCoreProperties.TITLE, "Technical Documentation");
247
pdfMetadata.set(TikaCoreProperties.CREATOR, "Engineering Team");
248
pdfMetadata.set(TikaCoreProperties.SUBJECT, "API Reference Manual");
249
250
// Embed metadata
251
try (InputStream input = new FileInputStream("document.pdf");
252
OutputStream output = new FileOutputStream("document_with_metadata.pdf")) {
253
254
pdfEmbedder.embed(pdfMetadata, input, output, new ParseContext());
255
}
256
```
257
258
**Custom Embedder Implementation:**
259
260
```java
261
/**
262
* Custom embedder for a specific document format
263
*/
264
public class CustomDocumentEmbedder implements Embedder {
265
private final Set<MediaType> supportedTypes;
266
267
public CustomDocumentEmbedder() {
268
this.supportedTypes = Set.of(MediaType.parse("application/x-custom"));
269
}
270
271
@Override
272
public Set<MediaType> getSupportedEmbedTypes(ParseContext context) {
273
return supportedTypes;
274
}
275
276
@Override
277
public void embed(Metadata metadata, InputStream originalStream,
278
OutputStream outputStream, ParseContext context)
279
throws IOException, TikaException {
280
281
// Read original document
282
byte[] originalData = originalStream.readAllBytes();
283
284
// Create metadata section
285
StringBuilder metadataSection = new StringBuilder();
286
for (String name : metadata.names()) {
287
String[] values = metadata.getValues(name);
288
for (String value : values) {
289
metadataSection.append(name).append("=").append(value).append("\n");
290
}
291
}
292
293
// Write document with embedded metadata
294
outputStream.write("METADATA_START\n".getBytes());
295
outputStream.write(metadataSection.toString().getBytes());
296
outputStream.write("METADATA_END\n".getBytes());
297
outputStream.write(originalData);
298
299
System.out.println("Custom metadata embedding completed");
300
}
301
}
302
303
// Usage
304
CustomDocumentEmbedder customEmbedder = new CustomDocumentEmbedder();
305
Metadata customMetadata = new Metadata();
306
customMetadata.set("custom-field", "custom-value");
307
customMetadata.set(TikaCoreProperties.TITLE, "Custom Document");
308
309
try (InputStream input = new FileInputStream("custom.doc");
310
OutputStream output = new FileOutputStream("custom_with_metadata.doc")) {
311
customEmbedder.embed(customMetadata, input, output, new ParseContext());
312
}
313
```
314
315
**Advanced External Tool Configuration:**
316
317
```java
318
// Configure embedder with complex command structure
319
ExternalEmbedder advancedEmbedder = new ExternalEmbedder();
320
321
// Set multiple supported formats
322
Set<MediaType> formats = new HashSet<>();
323
formats.add(MediaType.image("jpeg"));
324
formats.add(MediaType.image("png"));
325
formats.add(MediaType.image("tiff"));
326
advancedEmbedder.setSupportedEmbedTypes(formats);
327
328
// Configure advanced exiftool command with serialized metadata
329
advancedEmbedder.setCommand("exiftool",
330
"-config", "custom.config",
331
"-overwrite_original",
332
"-charset", "utf8",
333
"${METADATA_SERIALIZED}",
334
"${INPUT_FILE}");
335
336
// Configure quote handling and operators
337
advancedEmbedder.setQuoteAssignmentValues(true);
338
advancedEmbedder.setCommandAssignmentOperator("=");
339
advancedEmbedder.setCommandAppendOperator("+=");
340
advancedEmbedder.setCommandAssignmentDelimeter(" ");
341
342
// Create comprehensive metadata mapping
343
Map<Property, String[]> comprehensiveMapping = new HashMap<>();
344
comprehensiveMapping.put(TikaCoreProperties.TITLE, new String[]{"-Title", "-XMP:Title"});
345
comprehensiveMapping.put(TikaCoreProperties.CREATOR, new String[]{"-Artist", "-XMP:Creator"});
346
comprehensiveMapping.put(TikaCoreProperties.KEYWORDS, new String[]{"-Keywords", "-XMP:Keywords"});
347
comprehensiveMapping.put(Geographic.LATITUDE, new String[]{"-GPSLatitude"});
348
comprehensiveMapping.put(Geographic.LONGITUDE, new String[]{"-GPSLongitude"});
349
advancedEmbedder.setMetadataCommandArguments(comprehensiveMapping);
350
351
// Embed comprehensive metadata
352
Metadata richMetadata = new Metadata();
353
richMetadata.set(TikaCoreProperties.TITLE, "Mountain Landscape");
354
richMetadata.set(TikaCoreProperties.CREATOR, "Nature Photographer");
355
richMetadata.add(TikaCoreProperties.KEYWORDS, "mountain");
356
richMetadata.add(TikaCoreProperties.KEYWORDS, "landscape");
357
richMetadata.add(TikaCoreProperties.KEYWORDS, "nature");
358
richMetadata.set(Geographic.LATITUDE, "40.7128");
359
richMetadata.set(Geographic.LONGITUDE, "-74.0060");
360
361
try (InputStream input = new FileInputStream("landscape.jpg");
362
OutputStream output = new FileOutputStream("landscape_enriched.jpg")) {
363
364
advancedEmbedder.embed(richMetadata, input, output, new ParseContext());
365
System.out.println("Rich metadata embedding completed");
366
}
367
```
368
369
The embedding framework provides flexible metadata insertion capabilities with support for external tools, custom implementations, and comprehensive metadata mapping strategies while preserving document integrity and supporting various file formats.