or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md content-processing.md detection.md embedded-extraction.md embedding.md exceptions.md index.md io-utilities.md language.md metadata.md mime-types.md parsing.md pipes.md process-forking.md rendering.md

docs/embedded-extraction.md

0

# Embedded Document Extraction

1

2

Framework for extracting embedded documents and resources from container formats such as ZIP archives, Microsoft Office documents, and other compound document formats. Provides both high-level extraction APIs and low-level container processing capabilities with support for nested containers, selective extraction, and custom processing strategies.

3

4

## Capabilities

5

6

### Embedded Document Extractor

7

8

Core interface for extracting and processing embedded documents within container formats.

9

10

```java { .api }

11

/**

12

* Interface for extracting embedded documents from container formats

13

*/

14

public interface EmbeddedDocumentExtractor {

15

/**

16

* Determines whether an embedded document should be parsed

17

* @param metadata metadata of the embedded document

18

* @return true if the document should be processed

19

*/

20

boolean shouldParseEmbedded(Metadata metadata);

21

22

/**

23

* Processes embedded resource with appropriate parsing

24

* @param stream input stream containing embedded document

25

* @param handler SAX content handler for output

26

* @param metadata metadata for the embedded resource

27

* @param outputHtml whether to output HTML format

28

* @throws SAXException if SAX processing fails

29

* @throws IOException if I/O error occurs

30

*/

31

void parseEmbedded(InputStream stream, ContentHandler handler,

32

Metadata metadata, boolean outputHtml)

33

throws SAXException, IOException;

34

}

35

36

/**

37

* Factory interface for creating embedded document extractors

38

*/

39

public interface EmbeddedDocumentExtractorFactory extends Serializable {

40

/**

41

* Creates new embedded document extractor instance

42

* @param metadata parent document metadata

43

* @param parseContext parsing context

44

* @return configured extractor instance

45

*/

46

EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext);

47

}

48

```

49

50

### Parsing Embedded Document Extractor

51

52

Default implementation that uses Tika parsers to process embedded documents.

53

54

```java { .api }

55

/**

56

* Parser-based embedded document extractor for compound documents

57

*/

58

public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {

59

/**

60

* Creates extractor with parsing context

61

* @param context parse context containing configuration

62

*/

63

public ParsingEmbeddedDocumentExtractor(ParseContext context);

64

65

/**

66

* Sets whether to write filename to content output

67

* @param writeFileNameToContent true to include filenames in output

68

*/

69

public void setWriteFileNameToContent(boolean writeFileNameToContent);

70

71

/**

72

* Gets filename writing configuration

73

* @return true if filenames are written to content

74

*/

75

public boolean getWriteFileNameToContent();

76

}

77

78

/**

79

* Factory for creating parsing embedded document extractors

80

*/

81

public class ParsingEmbeddedDocumentExtractorFactory

82

implements EmbeddedDocumentExtractorFactory {

83

/**

84

* Creates factory instance

85

*/

86

public ParsingEmbeddedDocumentExtractorFactory();

87

}

88

```

89

90

### Container Extractor

91

92

Low-level interface for extracting resources from container formats.

93

94

```java { .api }

95

/**

96

* Interface for extracting embedded resources from container formats

97

*/

98

public interface ContainerExtractor extends Serializable {

99

/**

100

* Checks if extractor supports the container format

101

* @param input Tika input stream to examine

102

* @return true if this extractor can process the container

103

* @throws IOException if stream cannot be read

104

*/

105

boolean isSupported(TikaInputStream input) throws IOException;

106

107

/**

108

* Extracts all embedded resources from container

109

* @param stream document stream to process

110

* @param recurseExtractor extractor for nested containers

111

* @param handler handler for processing extracted resources

112

* @throws IOException if stream cannot be read

113

* @throws TikaException if container cannot be parsed

114

*/

115

void extract(TikaInputStream stream, ContainerExtractor recurseExtractor,

116

EmbeddedResourceHandler handler) throws IOException, TikaException;

117

}

118

119

/**

120

* Parser-based container extractor implementation

121

*/

122

public class ParserContainerExtractor implements ContainerExtractor {

123

/**

124

* Creates parser-based container extractor

125

*/

126

public ParserContainerExtractor();

127

128

/**

129

* Creates parser-based extractor with custom parser

130

* @param parser parser to use for extraction

131

*/

132

public ParserContainerExtractor(Parser parser);

133

}

134

```

135

136

### Resource Handlers

137

138

Interfaces for processing extracted embedded resources.

139

140

```java { .api }

141

/**

142

* Callback interface for handling extracted embedded resources

143

*/

144

public interface EmbeddedResourceHandler {

145

/**

146

* Processes an embedded resource

147

* @param filename filename of embedded resource (if known)

148

* @param mediaType media type of resource (if known)

149

* @param stream input stream containing resource content

150

*/

151

void handle(String filename, MediaType mediaType, InputStream stream);

152

}

153

154

/**

155

* Interface for handling embedded document bytes

156

*/

157

public interface EmbeddedDocumentBytesHandler {

158

/**

159

* Processes bytes from embedded document

160

* @param embeddedDocumentBytes bytes from embedded document

161

* @param metadata metadata for the embedded document

162

* @throws IOException if processing fails

163

* @throws TikaException if document processing fails

164

*/

165

void handleEmbeddedDocumentBytes(byte[] embeddedDocumentBytes, Metadata metadata)

166

throws IOException, TikaException;

167

}

168

169

/**

170

* Abstract base class for embedded document bytes handlers

171

*/

172

public abstract class AbstractEmbeddedDocumentBytesHandler

173

implements EmbeddedDocumentBytesHandler {

174

/**

175

* Creates handler with temporary resources

176

* @param temporaryResources temporary resource manager

177

*/

178

public AbstractEmbeddedDocumentBytesHandler(TemporaryResources temporaryResources);

179

}

180

181

/**

182

* Basic implementation of embedded document bytes handler

183

*/

184

public class BasicEmbeddedDocumentBytesHandler

185

extends AbstractEmbeddedDocumentBytesHandler {

186

/**

187

* Creates basic bytes handler

188

* @param temporaryResources temporary resource manager

189

*/

190

public BasicEmbeddedDocumentBytesHandler(TemporaryResources temporaryResources);

191

}

192

```

193

194

### Document Selection

195

196

Interfaces for controlling which embedded documents to process.

197

198

```java { .api }

199

/**

200

* Interface for document selection strategies

201

*/

202

public interface DocumentSelector {

203

/**

204

* Determines if document should be selected for processing

205

* @param metadata document metadata to evaluate

206

* @return true if document matches selection criteria

207

*/

208

boolean select(Metadata metadata);

209

}

210

211

/**

212

* Interface for selecting embedded bytes to process

213

*/

214

public interface EmbeddedBytesSelector {

215

/**

216

* Determines if embedded bytes should be selected

217

* @param metadata metadata for embedded content

218

* @return true if bytes should be processed

219

*/

220

boolean select(Metadata metadata);

221

}

222

223

/**

224

* Basic implementation of embedded bytes selector

225

*/

226

public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector {

227

/**

228

* Creates basic embedded bytes selector

229

*/

230

public BasicEmbeddedBytesSelector();

231

}

232

```

233

234

### Stream Translation

235

236

Interfaces for translating embedded streams during extraction.

237

238

```java { .api }

239

/**

240

* Interface for translating embedded streams

241

*/

242

public interface EmbeddedStreamTranslator {

243

/**

244

* Translates embedded stream content

245

* @param is input stream to translate

246

* @param embeddedMetadata metadata for embedded content

247

* @return translated input stream

248

* @throws IOException if translation fails

249

*/

250

InputStream translate(InputStream is, Metadata embeddedMetadata) throws IOException;

251

}

252

253

/**

254

* Default implementation of embedded stream translator

255

*/

256

public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator {

257

/**

258

* Creates default stream translator

259

*/

260

public DefaultEmbeddedStreamTranslator();

261

}

262

```

263

264

### Utility Classes

265

266

Helper classes for embedded document processing.

267

268

```java { .api }

269

/**

270

* Utility methods for embedded document processing

271

*/

272

public class EmbeddedDocumentUtil {

273

/**

274

* Gets file extension from metadata

275

* @param metadata document metadata

276

* @return file extension or null

277

*/

278

public static String getExtension(Metadata metadata);

279

280

/**

281

* Tries to determine file extension from content type

282

* @param metadata document metadata containing content type

283

* @return likely file extension or null

284

*/

285

public static String tryToGetExtensionFromContentType(Metadata metadata);

286

}

287

```

288

289

## Usage Examples

290

291

**Basic Embedded Document Extraction:**

292

293

```java

294

import org.apache.tika.extractor.*;

295

import org.apache.tika.parser.*;

296

import org.apache.tika.metadata.Metadata;

297

import org.apache.tika.sax.BodyContentHandler;

298

import java.io.FileInputStream;

299

import java.io.InputStream;

300

301

// Setup parsing context with embedded extractor

302

ParseContext context = new ParseContext();

303

EmbeddedDocumentExtractor extractor = new ParsingEmbeddedDocumentExtractor(context);

304

context.set(EmbeddedDocumentExtractor.class, extractor);

305

306

// Parse document with embedded content

307

Parser parser = new AutoDetectParser();

308

BodyContentHandler handler = new BodyContentHandler();

309

Metadata metadata = new Metadata();

310

311

try (InputStream stream = new FileInputStream("compound_document.docx")) {

312

parser.parse(stream, handler, metadata, context);

313

314

// Extracted content includes embedded documents

315

String content = handler.toString();

316

System.out.println("Content with embedded documents: " + content);

317

}

318

```

319

320

**Container Extraction with Custom Handler:**

321

322

```java

323

import org.apache.tika.extractor.*;

324

import org.apache.tika.io.TikaInputStream;

325

import org.apache.tika.mime.MediaType;

326

import java.io.FileInputStream;

327

import java.io.IOException;

328

import java.util.ArrayList;

329

import java.util.List;

330

331

// Custom resource handler to collect extracted files

332

class ResourceCollector implements EmbeddedResourceHandler {

333

private List<ExtractedResource> resources = new ArrayList<>();

334

335

@Override

336

public void handle(String filename, MediaType mediaType, InputStream stream) {

337

try {

338

byte[] content = stream.readAllBytes();

339

resources.add(new ExtractedResource(filename, mediaType, content));

340

System.out.println("Extracted: " + filename + " (" + mediaType + ")");

341

} catch (IOException e) {

342

System.err.println("Failed to read: " + filename);

343

}

344

}

345

346

public List<ExtractedResource> getResources() { return resources; }

347

}

348

349

// Extract from ZIP container

350

ContainerExtractor extractor = new ParserContainerExtractor();

351

ResourceCollector collector = new ResourceCollector();

352

353

try (TikaInputStream stream = TikaInputStream.get(new FileInputStream("archive.zip"))) {

354

if (extractor.isSupported(stream)) {

355

extractor.extract(stream, extractor, collector);

356

357

// Process extracted resources

358

for (ExtractedResource resource : collector.getResources()) {

359

System.out.println("Found: " + resource.getFilename() +

360

" (" + resource.getContent().length + " bytes)");

361

}

362

}

363

}

364

```

365

366

**Selective Embedded Document Processing:**

367

368

```java

369

// Custom document selector for specific file types

370

class PDFSelector implements DocumentSelector {

371

@Override

372

public boolean select(Metadata metadata) {

373

String contentType = metadata.get(Metadata.CONTENT_TYPE);

374

return "application/pdf".equals(contentType);

375

}

376

}

377

378

// Configure selective extraction

379

ParseContext context = new ParseContext();

380

context.set(DocumentSelector.class, new PDFSelector());

381

382

EmbeddedDocumentExtractor extractor = new ParsingEmbeddedDocumentExtractor(context);

383

context.set(EmbeddedDocumentExtractor.class, extractor);

384

385

// Only PDF embedded documents will be processed

386

Parser parser = new AutoDetectParser();

387

// ... continue with parsing

388

```

389

390

**Embedded Document Bytes Handling:**

391

392

```java

393

import org.apache.tika.extractor.*;

394

import org.apache.tika.io.TemporaryResources;

395

396

// Custom bytes handler for processing embedded document bytes

397

class CustomBytesHandler extends AbstractEmbeddedDocumentBytesHandler {

398

public CustomBytesHandler(TemporaryResources temp) {

399

super(temp);

400

}

401

402

@Override

403

public void handleEmbeddedDocumentBytes(byte[] bytes, Metadata metadata)

404

throws IOException, TikaException {

405

String filename = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);

406

System.out.println("Processing embedded bytes for: " + filename +

407

" (" + bytes.length + " bytes)");

408

409

// Custom processing logic for embedded bytes

410

// e.g., save to file, analyze content, etc.

411

}

412

}

413

414

// Use custom bytes handler in parsing context

415

TemporaryResources temp = new TemporaryResources();

416

EmbeddedDocumentBytesHandler bytesHandler = new CustomBytesHandler(temp);

417

ParseContext context = new ParseContext();

418

context.set(EmbeddedDocumentBytesHandler.class, bytesHandler);

419

```

420

421

The embedded extraction framework provides comprehensive support for handling compound documents, from high-level automatic extraction to low-level container processing with custom handlers and selective processing strategies.