or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md content-processing.md detection.md embedded-extraction.md embedding.md exceptions.md index.md io-utilities.md language.md metadata.md mime-types.md parsing.md pipes.md process-forking.md rendering.md

embedding.mddocs/

0

# Document Embedding

1

2

Framework for embedding metadata into documents, allowing modification and insertion of metadata properties into existing files. This system provides both programmatic interfaces and external tool integration for embedding metadata into various document formats while preserving document structure and content.

3

4

## Capabilities

5

6

### Embedder Interface

7

8

Core interface for embedding metadata into documents with support for different document formats and metadata containers.

9

10

```java { .api }

11

/**

12

* Interface for embedding metadata into documents

13

*/

14

public interface Embedder extends Serializable {

15

/**

16

* Returns supported media types for embedding operations

17

* @param context parse context for embedder configuration

18

* @return immutable set of supported media types

19

*/

20

Set<MediaType> getSupportedEmbedTypes(ParseContext context);

21

22

/**

23

* Embeds metadata from Metadata object into document stream

24

* @param metadata document metadata to embed (input and output)

25

* @param originalStream source document stream

26

* @param outputStream target stream for document with embedded metadata

27

* @param context parse context for embedding configuration

28

* @throws IOException if document cannot be read or written

29

* @throws TikaException if embedding operation fails

30

*/

31

void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream,

32

ParseContext context) throws IOException, TikaException;

33

}

34

```

35

36

### External Embedder

37

38

Implementation that uses external command-line tools for embedding metadata into documents.

39

40

```java { .api }

41

/**

42

* Embedder using external programs for metadata embedding

43

*/

44

public class ExternalEmbedder implements Embedder {

45

/** Token replaced with metadata command arguments in command templates */

46

public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";

47

48

/** Token replaced with serialized metadata arguments in command templates */

49

public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}";

50

51

/**

52

* Creates external embedder with default settings

53

*/

54

public ExternalEmbedder();

55

56

/**

57

* Gets supported embed types

58

* @return set of supported media types

59

*/

60

public Set<MediaType> getSupportedEmbedTypes();

61

62

/**

63

* Sets supported embed types for this embedder

64

* @param supportedEmbedTypes set of media types to support

65

*/

66

public void setSupportedEmbedTypes(Set<MediaType> supportedEmbedTypes);

67

68

/**

69

* Gets external command to execute

70

* @return command array with tokens for file paths

71

*/

72

public String[] getCommand();

73

74

/**

75

* Sets external command to execute for embedding

76

* @param command command array; may contain ExternalParser.INPUT_FILE_TOKEN ("${INPUT}") and ExternalParser.OUTPUT_FILE_TOKEN ("${OUTPUT}") placeholders

77

*/

78

public void setCommand(String... command);

79

80

/**

81

* Gets assignment operator for metadata (e.g., "=")

82

* @return assignment operator string

83

*/

84

public String getCommandAssignmentOperator();

85

86

/**

87

* Sets assignment operator for metadata

88

* @param commandAssignmentOperator operator string

89

*/

90

public void setCommandAssignmentOperator(String commandAssignmentOperator);

91

92

/**

93

* Gets delimiter for multiple metadata assignments (e.g., ", ")

94

* @return assignment delimiter string

95

*/

96

public String getCommandAssignmentDelimeter();

97

98

/**

99

* Sets delimiter for multiple metadata assignments

100

* @param commandAssignmentDelimeter delimiter string

101

*/

102

public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter);

103

104

/**

105

* Gets append operator for metadata (e.g., "+=")

106

* @return append operator string

107

*/

108

public String getCommandAppendOperator();

109

110

/**

111

* Sets append operator for multi-valued metadata

112

* @param commandAppendOperator append operator string

113

*/

114

public void setCommandAppendOperator(String commandAppendOperator);

115

116

/**

117

* Gets whether to quote assignment values

118

* @return true if values should be quoted

119

*/

120

public boolean isQuoteAssignmentValues();

121

122

/**

123

* Sets whether to quote assignment values (e.g., tag='value')

124

* @param quoteAssignmentValues true to quote values

125

*/

126

public void setQuoteAssignmentValues(boolean quoteAssignmentValues);

127

128

/**

129

* Gets metadata property to command line parameter mapping

130

* @return mapping of Tika properties to command arguments

131

*/

132

public Map<Property, String[]> getMetadataCommandArguments();

133

134

/**

135

* Sets metadata property to command line parameter mapping

136

* @param arguments mapping of properties to command line parameters

137

*/

138

public void setMetadataCommandArguments(Map<Property, String[]> arguments);

139

}

140

```

141

142

### Utility Methods

143

144

Static utility methods for working with external embedders.

145

146

```java { .api }

147

/**

148

* Utility methods for external embedder operations

149

*/

150

public class ExternalEmbedder {

151

/**

152

* Checks if external command is available and functional

153

* @param checkCmd command to test (e.g., "exiftool -ver")

154

* @param errorValue error codes that indicate failure

155

* @return true if command executes successfully

156

*/

157

public static boolean check(String checkCmd, int... errorValue);

158

159

/**

160

* Checks if external command array is available and functional

161

* @param checkCmd command array to test

162

* @param errorValue error codes that indicate failure

163

* @return true if command executes successfully

164

*/

165

public static boolean check(String[] checkCmd, int... errorValue);

166

}

167

```

168

169

## Usage Examples

170

171

**Basic Metadata Embedding with ExifTool:**

172

173

```java

174

import org.apache.tika.embedder.*;

175

import org.apache.tika.metadata.*;

176

import org.apache.tika.mime.MediaType;

177

import org.apache.tika.parser.ParseContext;

178

import java.io.*;

179

import java.util.*;

180

181

// Check if exiftool is available

182

if (ExternalEmbedder.check(new String[] {"exiftool", "-ver"})) {

183

// Create embedder for JPEG images

184

ExternalEmbedder embedder = new ExternalEmbedder();

185

186

// Configure supported types

187

Set<MediaType> supportedTypes = new HashSet<>();

188

supportedTypes.add(MediaType.image("jpeg"));

189

supportedTypes.add(MediaType.image("tiff"));

190

embedder.setSupportedEmbedTypes(supportedTypes);

191

192

// Configure exiftool command

193

embedder.setCommand("exiftool",

194

"-overwrite_original",

195

"${METADATA}",

196

"${INPUT_FILE}");

197

198

// Map Tika metadata to exiftool parameters

199

Map<Property, String[]> metadataMapping = new HashMap<>();

200

metadataMapping.put(TikaCoreProperties.TITLE, new String[]{"-Title"});

201

metadataMapping.put(TikaCoreProperties.CREATOR, new String[]{"-Artist", "-Author"});

202

metadataMapping.put(TikaCoreProperties.SUBJECT, new String[]{"-Subject"});

203

metadataMapping.put(TikaCoreProperties.DESCRIPTION, new String[]{"-Description"});

204

embedder.setMetadataCommandArguments(metadataMapping);

205

206

// Prepare metadata to embed

207

Metadata metadata = new Metadata();

208

metadata.set(TikaCoreProperties.TITLE, "Sunset Over Mountains");

209

metadata.set(TikaCoreProperties.CREATOR, "John Photographer");

210

metadata.set(TikaCoreProperties.SUBJECT, "Nature Photography");

211

metadata.set(TikaCoreProperties.DESCRIPTION, "Beautiful sunset captured in the Rocky Mountains");

212

213

// Embed metadata into image

214

try (InputStream input = new FileInputStream("original.jpg");

215

OutputStream output = new FileOutputStream("with_metadata.jpg")) {

216

217

embedder.embed(metadata, input, output, new ParseContext());

218

System.out.println("Metadata successfully embedded");

219

}

220

}

221

```

222

223

**PDF Metadata Embedding with pdftk:**

224

225

```java

226

// Configure embedder for PDF documents

227

ExternalEmbedder pdfEmbedder = new ExternalEmbedder();

228

229

// Set supported type

230

pdfEmbedder.setSupportedEmbedTypes(Set.of(MediaType.application("pdf")));

231

232

// Configure pdftk command with metadata file approach

233

pdfEmbedder.setCommand("pdftk", "${INPUT}",

234

"update_info_utf8", "metadata.txt",

235

"output", "${OUTPUT}");

236

237

// Configure metadata mapping for PDF

238

Map<Property, String[]> pdfMapping = new HashMap<>();

239

pdfMapping.put(TikaCoreProperties.TITLE, new String[]{"InfoKey: Title\nInfoValue: "});

240

pdfMapping.put(TikaCoreProperties.CREATOR, new String[]{"InfoKey: Author\nInfoValue: "});

241

pdfMapping.put(TikaCoreProperties.SUBJECT, new String[]{"InfoKey: Subject\nInfoValue: "});

242

pdfEmbedder.setMetadataCommandArguments(pdfMapping);

243

244

// Prepare PDF metadata

245

Metadata pdfMetadata = new Metadata();

246

pdfMetadata.set(TikaCoreProperties.TITLE, "Technical Documentation");

247

pdfMetadata.set(TikaCoreProperties.CREATOR, "Engineering Team");

248

pdfMetadata.set(TikaCoreProperties.SUBJECT, "API Reference Manual");

249

250

// Embed metadata

251

try (InputStream input = new FileInputStream("document.pdf");

252

OutputStream output = new FileOutputStream("document_with_metadata.pdf")) {

253

254

pdfEmbedder.embed(pdfMetadata, input, output, new ParseContext());

255

}

256

```

257

258

**Custom Embedder Implementation:**

259

260

```java

261

/**

262

* Custom embedder for a specific document format

263

*/

264

public class CustomDocumentEmbedder implements Embedder {

265

private final Set<MediaType> supportedTypes;

266

267

public CustomDocumentEmbedder() {

268

this.supportedTypes = Set.of(MediaType.parse("application/x-custom"));

269

}

270

271

@Override

272

public Set<MediaType> getSupportedEmbedTypes(ParseContext context) {

273

return supportedTypes;

274

}

275

276

@Override

277

public void embed(Metadata metadata, InputStream originalStream,

278

OutputStream outputStream, ParseContext context)

279

throws IOException, TikaException {

280

281

// Read original document

282

byte[] originalData = originalStream.readAllBytes();

283

284

// Create metadata section

285

StringBuilder metadataSection = new StringBuilder();

286

for (String name : metadata.names()) {

287

String[] values = metadata.getValues(name);

288

for (String value : values) {

289

metadataSection.append(name).append("=").append(value).append("\n");

290

}

291

}

292

293

// Write document with embedded metadata

294

outputStream.write("METADATA_START\n".getBytes());

295

outputStream.write(metadataSection.toString().getBytes());

296

outputStream.write("METADATA_END\n".getBytes());

297

outputStream.write(originalData);

298

299

System.out.println("Custom metadata embedding completed");

300

}

301

}

302

303

// Usage

304

CustomDocumentEmbedder customEmbedder = new CustomDocumentEmbedder();

305

Metadata customMetadata = new Metadata();

306

customMetadata.set("custom-field", "custom-value");

307

customMetadata.set(TikaCoreProperties.TITLE, "Custom Document");

308

309

try (InputStream input = new FileInputStream("custom.doc");

310

OutputStream output = new FileOutputStream("custom_with_metadata.doc")) {

311

customEmbedder.embed(customMetadata, input, output, new ParseContext());

312

}

313

```

314

315

**Advanced External Tool Configuration:**

316

317

```java

318

// Configure embedder with complex command structure

319

ExternalEmbedder advancedEmbedder = new ExternalEmbedder();

320

321

// Set multiple supported formats

322

Set<MediaType> formats = new HashSet<>();

323

formats.add(MediaType.image("jpeg"));

324

formats.add(MediaType.image("png"));

325

formats.add(MediaType.image("tiff"));

326

advancedEmbedder.setSupportedEmbedTypes(formats);

327

328

// Configure advanced exiftool command with serialized metadata

329

advancedEmbedder.setCommand("exiftool",

330

"-config", "custom.config",

331

"-overwrite_original",

332

"-charset", "utf8",

333

"${METADATA_SERIALIZED}",

334

"${INPUT}");

335

336

// Configure quote handling and operators

337

advancedEmbedder.setQuoteAssignmentValues(true);

338

advancedEmbedder.setCommandAssignmentOperator("=");

339

advancedEmbedder.setCommandAppendOperator("+=");

340

advancedEmbedder.setCommandAssignmentDelimeter(" ");

341

342

// Create comprehensive metadata mapping

343

Map<Property, String[]> comprehensiveMapping = new HashMap<>();

344

comprehensiveMapping.put(TikaCoreProperties.TITLE, new String[]{"-Title", "-XMP:Title"});

345

comprehensiveMapping.put(TikaCoreProperties.CREATOR, new String[]{"-Artist", "-XMP:Creator"});

346

comprehensiveMapping.put(TikaCoreProperties.KEYWORDS, new String[]{"-Keywords", "-XMP:Keywords"});

347

comprehensiveMapping.put(Geographic.LATITUDE, new String[]{"-GPSLatitude"});

348

comprehensiveMapping.put(Geographic.LONGITUDE, new String[]{"-GPSLongitude"});

349

advancedEmbedder.setMetadataCommandArguments(comprehensiveMapping);

350

351

// Embed comprehensive metadata

352

Metadata richMetadata = new Metadata();

353

richMetadata.set(TikaCoreProperties.TITLE, "Mountain Landscape");

354

richMetadata.set(TikaCoreProperties.CREATOR, "Nature Photographer");

355

richMetadata.add(TikaCoreProperties.KEYWORDS, "mountain");

356

richMetadata.add(TikaCoreProperties.KEYWORDS, "landscape");

357

richMetadata.add(TikaCoreProperties.KEYWORDS, "nature");

358

richMetadata.set(Geographic.LATITUDE, "40.7128");

359

richMetadata.set(Geographic.LONGITUDE, "-74.0060");

360

361

try (InputStream input = new FileInputStream("landscape.jpg");

362

OutputStream output = new FileOutputStream("landscape_enriched.jpg")) {

363

364

advancedEmbedder.embed(richMetadata, input, output, new ParseContext());

365

System.out.println("Rich metadata embedding completed");

366

}

367

```

368

369

The embedding framework provides flexible metadata insertion capabilities with support for external tools, custom implementations, and comprehensive metadata mapping strategies while preserving document integrity and supporting various file formats.