or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md content-processing.md detection.md embedded-extraction.md embedding.md exceptions.md index.md io-utilities.md language.md metadata.md mime-types.md parsing.md pipes.md process-forking.md rendering.md

docs/parsing.md

0

# Document Parsing

1

2

Core document parsing functionality using the Parser interface and implementations for extracting content and metadata from various document formats with automatic format detection and flexible parsing contexts.

3

4

## Capabilities

5

6

### Parser Interface

7

8

The fundamental interface for all document parsers in Tika, defining the contract for parsing documents into structured content with metadata extraction.

9

10

```java { .api }

11

/**

12

* Interface for document parsers that extract content and metadata from input streams

13

*/

14

public interface Parser {

15

/**

16

* Parses a document from the given input stream

17

* @param stream Input stream containing the document to parse

18

* @param handler Content handler to receive parsed content events

19

* @param metadata Metadata object to populate with extracted metadata

20

* @param context Parse context containing parser configuration and state

21

* @throws IOException If an I/O error occurs during parsing

22

* @throws SAXException If a SAX parsing error occurs

23

* @throws TikaException If a Tika-specific parsing error occurs

24

*/

25

void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)

26

throws IOException, SAXException, TikaException;

27

28

/**

29

* Returns the set of media types supported by this parser

30

* @param context Parse context for configuration-dependent type support

31

* @return Set of supported MediaType objects

32

*/

33

Set<MediaType> getSupportedTypes(ParseContext context);

34

}

35

```

36

37

### AutoDetectParser

38

39

The default parser implementation that automatically detects document type and delegates to appropriate specialized parsers, providing the most convenient entry point for parsing unknown document formats.

40

41

```java { .api }

42

/**

43

* Parser that automatically detects document type and delegates to appropriate parsers

44

*/

45

public class AutoDetectParser implements Parser {

46

/**

47

* Creates an AutoDetectParser with default configuration

48

*/

49

public AutoDetectParser();

50

51

/**

52

* Creates an AutoDetectParser with the specified Tika configuration

53

* @param config TikaConfig instance containing parser and detector configuration

54

*/

55

public AutoDetectParser(TikaConfig config);

56

57

/**

58

* Sets the fallback parser used when no suitable parser is found

59

* @param fallback Parser to use as fallback, or null to disable fallback

60

*/

61

public void setFallback(Parser fallback);

62

63

/**

64

* Gets the current fallback parser

65

* @return The fallback parser, or null if no fallback is configured

66

*/

67

public Parser getFallback();

68

69

/**

70

* Gets the detector used for content type detection

71

* @return The Detector instance used by this parser

72

*/

73

public Detector getDetector();

74

75

/**

76

* Gets the map of parsers by media type

77

* @return Map from MediaType to Parser instances

78

*/

79

public Map<MediaType, Parser> getParsers();

80

}

81

```

82

83

**Usage Examples:**

84

85

```java

86

import org.apache.tika.parser.AutoDetectParser;

87

import org.apache.tika.parser.ParseContext;

88

import org.apache.tika.metadata.Metadata;

89

import org.apache.tika.sax.BodyContentHandler;

90

import java.io.FileInputStream;

91

import java.io.InputStream;

92

93

// Basic parsing with auto-detection

94

AutoDetectParser parser = new AutoDetectParser();

95

Metadata metadata = new Metadata();

96

ParseContext context = new ParseContext();

97

BodyContentHandler handler = new BodyContentHandler();

98

99

try (InputStream stream = new FileInputStream("document.pdf")) {

100

parser.parse(stream, handler, metadata, context);

101

String content = handler.toString();

102

String title = metadata.get("dc:title");

103

String author = metadata.get("dc:creator");

104

}

105

106

// Custom configuration with fallback

107

AutoDetectParser customParser = new AutoDetectParser();

108

customParser.setFallback(new EmptyParser()); // Use empty parser as fallback

109

```

110

111

### CompositeParser

112

113

A parser that delegates parsing to a collection of sub-parsers based on media type, allowing for modular parser composition and custom parser configurations.

114

115

```java { .api }

116

/**

117

* Parser that delegates to different parsers based on media type

118

*/

119

public class CompositeParser extends AbstractParser {

120

/**

121

* Creates an empty CompositeParser

122

*/

123

public CompositeParser();

124

125

/**

126

* Creates a CompositeParser with the specified parser mappings

127

* @param parsers Map from MediaType to Parser instances

128

*/

129

public CompositeParser(Map<MediaType, Parser> parsers);

130

131

/**

132

* Gets the map of parsers by media type

133

* @return Map from MediaType to Parser instances

134

*/

135

public Map<MediaType, Parser> getParsers();

136

137

/**

138

* Sets the parser mappings

139

* @param parsers Map from MediaType to Parser instances

140

*/

141

public void setParsers(Map<MediaType, Parser> parsers);

142

143

/**

144

* Gets all media types supported by the configured parsers

145

* @param context Parse context for configuration

146

* @return Set of supported MediaType objects

147

*/

148

public Set<MediaType> getSupportedTypes(ParseContext context);

149

}

150

```

151

152

### ParseContext

153

154

Context object that carries configuration and state information during parsing operations, allowing parsers to share resources and configuration.

155

156

```java { .api }

157

/**

158

* Context object for parser configuration and state sharing

159

*/

160

public class ParseContext {

161

/**

162

* Creates an empty ParseContext

163

*/

164

public ParseContext();

165

166

/**

167

* Sets a context object of the specified type

168

* @param type Class type of the context object

169

* @param context The context object to set

170

* @param <T> Type parameter for the context object

171

*/

172

public <T> void set(Class<T> type, T context);

173

174

/**

175

* Gets a context object of the specified type

176

* @param type Class type of the context object to retrieve

177

* @param <T> Type parameter for the context object

178

* @return The context object, or null if not set

179

*/

180

public <T> T get(Class<T> type);

181

182

/**

183

* Gets a context object of the specified type with a default value

184

* @param type Class type of the context object to retrieve

185

* @param defaultValue Default value to return if not set

186

* @param <T> Type parameter for the context object

187

* @return The context object, or defaultValue if not set

188

*/

189

public <T> T get(Class<T> type, T defaultValue);

190

}

191

```

192

193

### ParsingReader

194

195

A Reader implementation that parses documents on-demand, providing character-based access to parsed content with automatic format detection.

196

197

```java { .api }

198

/**

199

* Reader that parses documents on-demand and provides character access to content

200

*/

201

public class ParsingReader extends Reader {

202

/**

203

* Creates a ParsingReader for the specified input stream

204

* @param stream Input stream containing the document to parse

205

*/

206

public ParsingReader(InputStream stream);

207

208

/**

209

* Creates a ParsingReader with custom parser and metadata

210

* @param parser Parser to use for document parsing

211

* @param stream Input stream containing the document

212

* @param metadata Metadata object to populate during parsing

213

* @param context Parse context for configuration

214

*/

215

public ParsingReader(Parser parser, InputStream stream, Metadata metadata, ParseContext context);

216

217

/**

218

* Gets the metadata populated during parsing

219

* @return Metadata object containing extracted metadata

220

*/

221

public Metadata getMetadata();

222

}

223

```

224

225

### DefaultParser

226

227

A preconfigured parser with common settings and reasonable defaults for most parsing scenarios.

228

229

```java { .api }

230

/**

231

* Parser with common configurations and reasonable defaults

232

*/

233

public class DefaultParser extends CompositeParser {

234

/**

235

* Creates a DefaultParser with standard configuration

236

*/

237

public DefaultParser();

238

239

/**

240

* Creates a DefaultParser with the specified configuration

241

* @param config TikaConfig instance for parser configuration

242

*/

243

public DefaultParser(TikaConfig config);

244

}

245

```

246

247

### AutoDetectParserConfig

248

249

Configuration class for customizing AutoDetectParser behavior with various parsing options and limits.

250

251

```java { .api }

252

/**

253

* Configuration options for AutoDetectParser

254

*/

255

public class AutoDetectParserConfig {

256

/**

257

* Creates default configuration

258

*/

259

public AutoDetectParserConfig();

260

261

/**

262

* Sets the maximum string length for text extraction

263

* @param maxStringLength Maximum length in characters

264

*/

265

public void setMaxStringLength(int maxStringLength);

266

267

/**

268

* Gets the maximum string length for text extraction

269

* @return Maximum length in characters

270

*/

271

public int getMaxStringLength();

272

}

273

```

274

275

### AbstractParser

276

277

Base class for parser implementations providing common functionality and utilities for custom parser development.

278

279

```java { .api }

280

/**

281

* Abstract base class for parser implementations

282

*/

283

public abstract class AbstractParser implements Parser {

284

/**

285

* Gets the supported types for this parser

286

* @param context Parse context for configuration

287

* @return Set of supported MediaType objects

288

*/

289

public abstract Set<MediaType> getSupportedTypes(ParseContext context);

290

291

/**

292

* Parses the document with the given parameters

293

* @param stream Input stream containing the document

294

* @param handler Content handler to receive parsed content

295

* @param metadata Metadata object to populate

296

* @param context Parse context for configuration

297

*/

298

public abstract void parse(InputStream stream, ContentHandler handler,

299

Metadata metadata, ParseContext context)

300

throws IOException, SAXException, TikaException;

301

}

302

```

303

304

## Advanced Parsing Features

305

306

### Parse Limits and Resource Management

307

308

```java { .api }

309

/**

310

* Configures parsing limits to prevent resource exhaustion

311

*/

312

ParseContext context = new ParseContext();

313

// Set maximum text extraction length

314

context.set(WriteOutContentHandler.class, new WriteOutContentHandler(100000));

315

316

// Configure memory limits for embedded document extraction

317

context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

318


319

```

320

321

### Custom Parser Integration

322

323

```java

324

// Example of custom parser configuration

325

Map<MediaType, Parser> parsers = new HashMap<>();

326

parsers.put(MediaType.parse("application/custom"), new CustomParser());

327

328

CompositeParser compositeParser = new CompositeParser(parsers);

329

AutoDetectParser parser = new AutoDetectParser();

330

// Configure parser with custom types

331

```

332

333

## Error Handling

334

335

Common exceptions thrown during parsing operations:

336

337

- **TikaException**: General parsing errors

338

- **EncryptedDocumentException**: Document is password-protected

339

- **UnsupportedFormatException**: No parser available for document format

340

- **CorruptedFileException**: Document structure is corrupted

341

- **WriteLimitReachedException**: Content extraction limit exceeded