or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md content-processing.md detection.md embedded-extraction.md embedding.md exceptions.md index.md io-utilities.md language.md metadata.md mime-types.md parsing.md pipes.md process-forking.md rendering.md

docs/configuration.md

0

# Configuration

1

2

Configuration system for managing Tika parsers, detectors, and service loading with XML-based configuration files, parameter management, and service discovery mechanisms.

3

4

## Capabilities

5

6

### TikaConfig Class

7

8

Central configuration class that manages parser, detector, and translator configurations with support for custom configurations and service loading.

9

10

```java { .api }

11

/**

12

* Main configuration class for Tika components and services

13

*/

14

public class TikaConfig {

15

/**

16

* Gets the default Tika configuration with standard parsers and detectors

17

* @return TikaConfig instance with default settings

18

*/

19

public static TikaConfig getDefaultConfig();

20

21

/**

22

* Creates TikaConfig from XML configuration file

23

* @param file XML configuration file

24

*        whose settings the new TikaConfig instance is built from

25

* @throws TikaException if configuration is invalid

26

* @throws IOException if file cannot be read

27

*/

28

public TikaConfig(File file) throws TikaException, IOException;

29

30

/**

31

* Creates TikaConfig from XML configuration stream

32

* @param stream InputStream containing XML configuration

33

*        from which the new TikaConfig instance is built

34

* @throws TikaException if configuration is invalid

35

* @throws IOException if stream cannot be read

36

*/

37

public TikaConfig(InputStream stream) throws TikaException, IOException;

38

39

/**

40

* Creates TikaConfig from XML configuration at URL

41

* @param url URL pointing to XML configuration

42

* @throws TikaException if configuration is invalid

43

* @throws IOException if URL cannot be accessed

44

*/

45

public TikaConfig(URL url) throws TikaException, IOException;

46

47

/**

48

* Creates TikaConfig from classpath resource

49

* @param resource Resource path in classpath

50

* @throws TikaException if configuration is invalid

51

*/

52

public TikaConfig(String resource) throws TikaException;

53

54

/**

55

* Creates TikaConfig with custom class loader

56

* @param loader ClassLoader for service discovery

57

*/

58

public TikaConfig(ClassLoader loader);

59

60

/**

61

* Gets the configured composite parser

62

* @return Parser instance configured with all registered parsers

63

*/

64

public Parser getParser();

65

66

/**

67

* Gets parser for specific media type

68

* @param mimeType MediaType to get parser for

69

* @return Parser that handles the specified media type

70

*/

71

public Parser getParser(MediaType mimeType);

72

73

/**

74

* Gets all configured parsers mapped by media type

75

* @return Map of MediaType to Parser instances

76

*/

77

public Map<MediaType, Parser> getParsers();

78

79

/**

80

* Gets the configured composite detector

81

* @return Detector instance configured with all registered detectors

82

*/

83

public Detector getDetector();

84

85

/**

86

* Gets the configured translator

87

* @return Translator instance for text translation

88

*/

89

public Translator getTranslator();

90

91

/**

92

* Gets the MIME types registry

93

* @return MimeTypes instance with registered type definitions

94

*/

95

public MimeTypes getMimeRepository();

96

97

/**

98

* Gets the media type registry for type relationships

99

* @return MediaTypeRegistry for managing type hierarchies

100

*/

101

public MediaTypeRegistry getMediaTypeRegistry();

102

103

/**

104

* Gets configuration for specific parser class

105

* @param parserClass Class of parser to get configuration for

106

* @return Map of configuration parameters for the parser

107

*/

108

public Map<String, Param> getParserConfig(Class<? extends Parser> parserClass);

109

110

/**

111

* Gets configuration for specific detector class

112

* @param detectorClass Class of detector to get configuration for

113

* @return Map of configuration parameters for the detector

114

*/

115

public Map<String, Param> getDetectorConfig(Class<? extends Detector> detectorClass);

116

117

/**

118

* Gets the service loader configuration

119

* @return ServiceLoader instance used for dynamic service discovery

120

*/

121

public ServiceLoader getServiceLoader();

122

}

123

```

124

125

### ServiceLoader Class

126

127

Service loading utility for dynamic discovery and instantiation of Tika components.

128

129

```java { .api }

130

/**

131

* Service loader for dynamic discovery of Tika components

132

*/

133

public class ServiceLoader {

134

/**

135

* Creates ServiceLoader with default class loader

136

*/

137

public ServiceLoader();

138

139

/**

140

* Creates ServiceLoader with custom class loader

141

* @param loader ClassLoader to use for service discovery

142

*/

143

public ServiceLoader(ClassLoader loader);

144

145

/**

146

* Creates ServiceLoader with class loader and dynamic loading flag

147

* @param loader ClassLoader for service discovery

148

* @param dynamic Whether to enable dynamic loading

149

*/

150

public ServiceLoader(ClassLoader loader, boolean dynamic);

151

152

/**

153

* Loads all available services of specified type

154

* @param iface Interface or class type to load

155

* @return List of service instances implementing the interface

156

*/

157

public <T> List<T> loadServiceProviders(Class<T> iface);

158

159

/**

160

* Loads static services from META-INF/services files

161

* @param iface Interface or class type to load

162

* @return List of statically declared service instances

163

*/

164

public <T> List<T> loadStaticServiceProviders(Class<T> iface);

165

166

/**

167

* Loads dynamic services from configuration

168

* @param iface Interface or class type to load

169

* @return List of dynamically configured service instances

170

*/

171

public <T> List<T> loadDynamicServiceProviders(Class<T> iface);

172

173

/**

174

* Gets the class loader used by this service loader

175

* @return ClassLoader instance used for loading services

176

*/

177

public ClassLoader getLoader();

178

179

/**

180

* Checks if dynamic loading is enabled

181

* @return true if dynamic loading is enabled

182

*/

183

public boolean isDynamic();

184

}

185

```

186

187

### Configuration Parameters

188

189

#### Param Class

190

191

Represents a configuration parameter with name, value, and type information.

192

193

```java { .api }

194

/**

195

* Configuration parameter with name, value, and type information

196

*/

197

public class Param<T> {

198

/**

199

* Creates Param with name and value

200

* @param name Parameter name

201

* @param value Parameter value

202

*/

203

public Param(String name, T value);

204

205

/**

206

* Creates Param with name, value, and type

207

* @param name Parameter name

208

* @param value Parameter value

209

* @param type Parameter type class

210

*/

211

public Param(String name, T value, Class<T> type);

212

213

/**

214

* Gets parameter name

215

* @return String containing parameter name

216

*/

217

public String getName();

218

219

/**

220

* Gets parameter value

221

* @return Parameter value of type T

222

*/

223

public T getValue();

224

225

/**

226

* Gets parameter type

227

* @return Class representing parameter type

228

*/

229

public Class<T> getType();

230

231

/**

232

* Sets parameter value

233

* @param value New parameter value

234

*/

235

public void setValue(T value);

236

237

/**

238

* Gets string representation of value

239

* @return String representation of parameter value

240

*/

241

@Override

242

public String toString();

243

}

244

```

245

246

#### ParamField Class

247

248

Descriptor for parameter fields with metadata about configuration parameters.

249

250

```java { .api }

251

/**

252

* Field descriptor for configuration parameters with metadata

253

*/

254

public class ParamField {

255

/**

256

* Creates ParamField for specified field

257

* @param field Field to create descriptor for

258

*/

259

public ParamField(Field field);

260

261

/**

262

* Gets the field name

263

* @return String containing field name

264

*/

265

public String getName();

266

267

/**

268

* Gets the field type

269

* @return Class representing field type

270

*/

271

public Class<?> getType();

272

273

/**

274

* Checks if field is required

275

* @return true if field is required for configuration

276

*/

277

public boolean isRequired();

278

279

/**

280

* Gets default value for field

281

* @return Default value or null if no default

282

*/

283

public Object getDefaultValue();

284

285

/**

286

* Gets field description from annotations

287

* @return String describing field purpose

288

*/

289

public String getDescription();

290

291

/**

292

* Sets field value on target object

293

* @param target Object to set field value on

294

* @param value Value to set

295

* @throws IllegalAccessException if field is not accessible

296

*/

297

public void setValue(Object target, Object value) throws IllegalAccessException;

298

299

/**

300

* Gets field value from target object

301

* @param target Object to get field value from

302

* @return Field value

303

* @throws IllegalAccessException if field is not accessible

304

*/

305

public Object getValue(Object target) throws IllegalAccessException;

306

}

307

```

308

309

### Configuration Base Classes

310

311

#### ConfigBase Class

312

313

Base class for configurable Tika components with parameter injection support.

314

315

```java { .api }

316

/**

317

* Base class for configurable components with parameter injection

318

*/

319

public abstract class ConfigBase {

320

/**

321

* Initializes component with configuration parameters

322

* @param params Map of parameter names to Param objects

323

* @throws TikaConfigException if initialization fails

324

*/

325

public void initialize(Map<String, Param> params) throws TikaConfigException;

326

327

/**

328

* Checks current configuration state

329

* @param handler Problem handler for reporting issues

330

*/

331

public void checkInitialization(InitializableProblemHandler handler);

332

333

/**

334

* Gets all configurable fields for this component

335

* @return List of ParamField descriptors for configurable fields

336

*/

337

public List<ParamField> getConfigurableFields();

338

339

/**

340

* Gets configuration parameter by name

341

* @param name Parameter name

342

* @return Param object or null if not found

343

*/

344

protected Param getParam(String name);

345

346

/**

347

* Sets configuration parameter

348

* @param name Parameter name

349

* @param value Parameter value

350

*/

351

protected void setParam(String name, Object value);

352

353

/**

354

* Validates configuration parameters

355

* @throws TikaConfigException if validation fails

356

*/

357

protected void validateConfig() throws TikaConfigException;

358

}

359

```

360

361

### Problem Handling

362

363

#### InitializableProblemHandler Interface

364

365

Interface for handling problems that occur during component initialization.

366

367

```java { .api }

368

/**

369

* Handler for problems encountered during component initialization

370

*/

371

public interface InitializableProblemHandler {

372

/**

373

* Handles a problem encountered during initialization

374

* @param clazz Class where problem occurred

375

* @param problem Description of the problem

376

*/

377

void handleInitializableProblem(Class<?> clazz, String problem);

378

}

379

```

380

381

#### ParsingProblemHandler Implementation

382

383

Default implementation that collects initialization problems for later analysis.

384

385

```java { .api }

386

/**

387

* Default problem handler that collects initialization issues

388

*/

389

public class ParsingProblemHandler implements InitializableProblemHandler {

390

/**

391

* Creates problem handler for collecting issues

392

*/

393

public ParsingProblemHandler();

394

395

/**

396

* Handles initialization problem by recording it

397

* @param clazz Class where problem occurred

398

* @param problem Description of the problem

399

*/

400

@Override

401

public void handleInitializableProblem(Class<?> clazz, String problem);

402

403

/**

404

* Gets all recorded problems

405

* @return List of problems encountered during initialization

406

*/

407

public List<String> getProblems();

408

409

/**

410

* Checks if any problems were recorded

411

* @return true if problems were encountered

412

*/

413

public boolean hasProblems();

414

415

/**

416

* Gets problems for specific class

417

* @param clazz Class to get problems for

418

* @return List of problems for the specified class

419

*/

420

public List<String> getProblems(Class<?> clazz);

421

}

422

```

423

424

## Configuration File Format

425

426

### XML Configuration Structure

427

428

```xml { .api }

429

<?xml version="1.0" encoding="UTF-8"?>

430

<properties>

431

<!-- MIME Types Configuration -->

432

<mimeTypeRepository resource="custom-mimetypes.xml"/>

433

434

<!-- Detectors Configuration -->

435

<detectors>

436

<detector class="org.apache.tika.detect.DefaultDetector"/>

437

<detector class="org.example.CustomDetector">

438

<params>

439

<param name="threshold" type="int">90</param>

440

<param name="enabled" type="boolean">true</param>

441

</params>

442

</detector>

443

</detectors>

444

445

<!-- Parsers Configuration -->

446

<parsers>

447

<parser class="org.apache.tika.parser.AutoDetectParser"/>

448

<parser class="org.apache.tika.parser.pdf.PDFParser">

449

<params>

450

<param name="extractInlineImages" type="boolean">false</param>

451

<param name="sortByPosition" type="boolean">true</param>

452

</params>

453

</parser>

454

</parsers>

455

456

<!-- Translator Configuration -->

457

<translator class="org.apache.tika.language.translate.DefaultTranslator">

458

<params>

459

<param name="maxStringLength" type="int">10000</param>

460

</params>

461

</translator>

462

463

<!-- Service Loader Configuration -->

464

<service-loader dynamic="true" loadErrorHandler="IGNORE"/>

465

</properties>

466

```

467

468

## Usage Examples

469

470

### Basic Configuration Usage

471

472

```java { .api }

473

// Use default configuration

474

TikaConfig config = TikaConfig.getDefaultConfig();

475

Parser parser = config.getParser();

476

Detector detector = config.getDetector();

477

478

// Parse with configured components

479

Metadata metadata = new Metadata();

480

try (InputStream input = new FileInputStream("document.pdf")) {

481

parser.parse(input, new BodyContentHandler(), metadata, new ParseContext());

482

}

483

```

484

485

### Custom Configuration Loading

486

487

```java { .api }

488

// Load configuration from file

489

try {

490

TikaConfig config = new TikaConfig("tika-config.xml");

491

492

// Get configured components

493

Parser parser = config.getParser();

494

Detector detector = config.getDetector();

495

Translator translator = config.getTranslator();

496

497

} catch (TikaException | IOException e) {

498

System.err.println("Configuration error: " + e.getMessage());

499

}

500

501

// Load from classpath resource

502

TikaConfig config = new TikaConfig("/org/example/custom-tika.xml");

503

```

504

505

### Working with Service Loader

506

507

```java { .api }

508

// Create service loader with custom class loader

509

ClassLoader customLoader = Thread.currentThread().getContextClassLoader();

510

ServiceLoader serviceLoader = new ServiceLoader(customLoader, true);

511

512

// Load parser services

513

List<Parser> parsers = serviceLoader.loadServiceProviders(Parser.class);

514

System.out.println("Found " + parsers.size() + " parser services");

515

516

// Load detector services

517

List<Detector> detectors = serviceLoader.loadServiceProviders(Detector.class);

518

for (Detector detector : detectors) {

519

System.out.println("Detector: " + detector.getClass().getName());

520

}

521

```

522

523

### Parameter Configuration

524

525

```java { .api }

526

// Get parser configuration

527

TikaConfig config = TikaConfig.getDefaultConfig();

528

Map<String, Param> pdfConfig = config.getParserConfig(PDFParser.class);

529

530

// Check specific parameter

531

Param extractImages = pdfConfig.get("extractInlineImages");

532

if (extractImages != null) {

533

System.out.println("Extract images: " + extractImages.getValue());

534

}

535

536

// Create custom parameters

537

Map<String, Param> customParams = new HashMap<>();

538

customParams.put("maxStringLength", new Param<>("maxStringLength", 100000, Integer.class));

539

customParams.put("enableOCR", new Param<>("enableOCR", true, Boolean.class));

540

```

541

542

### Configurable Component Implementation

543

544

```java { .api }

545

public class CustomParser extends ConfigBase implements Parser {

546

private int maxDocuments = 1000;

547

private boolean verbose = false;

548

private String outputFormat = "text";

549

550

@Override

551

public void initialize(Map<String, Param> params) throws TikaConfigException {

552

super.initialize(params);

553

554

Param maxDocs = getParam("maxDocuments");

555

if (maxDocs != null) {

556

this.maxDocuments = (Integer) maxDocs.getValue();

557

}

558

559

Param verboseParam = getParam("verbose");

560

if (verboseParam != null) {

561

this.verbose = (Boolean) verboseParam.getValue();

562

}

563

564

validateConfig();

565

}

566

567

@Override

568

protected void validateConfig() throws TikaConfigException {

569

if (maxDocuments <= 0) {

570

throw new TikaConfigException("maxDocuments must be positive");

571

}

572

}

573

574

@Override

575

public void parse(InputStream stream, ContentHandler handler,

576

Metadata metadata, ParseContext context)

577

throws IOException, SAXException, TikaException {

578

// Implementation using configured parameters

579

if (verbose) {

580

System.out.println("Parsing with maxDocuments=" + maxDocuments);

581

}

582

}

583

584

@Override

585

public Set<MediaType> getSupportedTypes(ParseContext context) {

586

return Collections.singleton(MediaType.TEXT_PLAIN);

587

}

588

}

589

```

590

591

### Problem Handling

592

593

```java { .api }

594

// Handle initialization problems

595

ParsingProblemHandler problemHandler = new ParsingProblemHandler();

596

597

try {

598

TikaConfig config = new TikaConfig("config-with-issues.xml");

599

600

// Check for initialization problems

601

config.getParser(); // This might trigger initialization

602

603

if (problemHandler.hasProblems()) {

604

for (String problem : problemHandler.getProblems()) {

605

System.err.println("Configuration issue: " + problem);

606

}

607

}

608

609

} catch (TikaException e) {

610

System.err.println("Fatal configuration error: " + e.getMessage());

611

}

612

```