0
# Configuration
1
2
Configuration system for managing Tika parsers, detectors, and service loading with XML-based configuration files, parameter management, and service discovery mechanisms.
3
4
## Capabilities
5
6
### TikaConfig Class
7
8
Central configuration class that manages parser, detector, and translator configurations with support for custom configurations and service loading.
9
10
```java { .api }
11
/**
12
* Main configuration class for Tika components and services
13
*/
14
public class TikaConfig {
15
/**
16
* Gets the default Tika configuration with standard parsers and detectors
17
* @return TikaConfig instance with default settings
18
*/
19
public static TikaConfig getDefaultConfig();
20
21
/**
22
* Creates TikaConfig from XML configuration file.
23
* The new instance is configured from the contents of the file.
24
* @param file XML configuration file
25
* @throws TikaException if configuration is invalid
26
* @throws IOException if file cannot be read
27
*/
28
public TikaConfig(File file) throws TikaException, IOException;
29
30
/**
31
* Creates TikaConfig from XML configuration stream.
32
* The new instance is configured from the contents of the stream.
33
* @param stream InputStream containing XML configuration
34
* @throws TikaException if configuration is invalid
35
* @throws IOException if stream cannot be read
36
*/
37
public TikaConfig(InputStream stream) throws TikaException, IOException;
38
39
/**
40
* Creates TikaConfig from XML configuration at URL
41
* @param url URL pointing to XML configuration
42
* @throws TikaException if configuration is invalid
43
* @throws IOException if URL cannot be accessed
44
*/
45
public TikaConfig(URL url) throws TikaException, IOException;
46
47
/**
48
* Creates TikaConfig from classpath resource
49
* @param resource Resource path in classpath
50
* @throws TikaException if configuration is invalid
51
*/
52
public TikaConfig(String resource) throws TikaException;
53
54
/**
55
* Creates TikaConfig with custom class loader
56
* @param loader ClassLoader for service discovery
57
*/
58
public TikaConfig(ClassLoader loader);
59
60
/**
61
* Gets the configured composite parser
62
* @return Parser instance configured with all registered parsers
63
*/
64
public Parser getParser();
65
66
/**
67
* Gets parser for specific media type
68
* @param mimeType MediaType to get parser for
69
* @return Parser that handles the specified media type
70
*/
71
public Parser getParser(MediaType mimeType);
72
73
/**
74
* Gets all configured parsers mapped by media type
75
* @return Map of MediaType to Parser instances
76
*/
77
public Map<MediaType, Parser> getParsers();
78
79
/**
80
* Gets the configured composite detector
81
* @return Detector instance configured with all registered detectors
82
*/
83
public Detector getDetector();
84
85
/**
86
* Gets the configured translator
87
* @return Translator instance for text translation
88
*/
89
public Translator getTranslator();
90
91
/**
92
* Gets the MIME types registry
93
* @return MimeTypes instance with registered type definitions
94
*/
95
public MimeTypes getMimeRepository();
96
97
/**
98
* Gets the media type registry for type relationships
99
* @return MediaTypeRegistry for managing type hierarchies
100
*/
101
public MediaTypeRegistry getMediaTypeRegistry();
102
103
/**
104
* Gets configuration for specific parser class
105
* @param parserClass Class of parser to get configuration for
106
* @return Map of configuration parameters for the parser
107
*/
108
public Map<String, Param> getParserConfig(Class<? extends Parser> parserClass);
109
110
/**
111
* Gets configuration for specific detector class
112
* @param detectorClass Class of detector to get configuration for
113
* @return Map of configuration parameters for the detector
114
*/
115
public Map<String, Param> getDetectorConfig(Class<? extends Detector> detectorClass);
116
117
/**
118
* Gets the service loader configuration
119
* @return ServiceLoader instance used for dynamic service discovery
120
*/
121
public ServiceLoader getServiceLoader();
122
}
123
```
124
125
### ServiceLoader Class
126
127
Service loading utility for dynamic discovery and instantiation of Tika components.
128
129
```java { .api }
130
/**
131
* Service loader for dynamic discovery of Tika components
132
*/
133
public class ServiceLoader {
134
/**
135
* Creates ServiceLoader with default class loader
136
*/
137
public ServiceLoader();
138
139
/**
140
* Creates ServiceLoader with custom class loader
141
* @param loader ClassLoader to use for service discovery
142
*/
143
public ServiceLoader(ClassLoader loader);
144
145
/**
146
* Creates ServiceLoader with class loader and dynamic loading flag
147
* @param loader ClassLoader for service discovery
148
* @param dynamic Whether to enable dynamic loading
149
*/
150
public ServiceLoader(ClassLoader loader, boolean dynamic);
151
152
/**
153
* Loads all available services of specified type
154
* @param iface Interface or class type to load
155
* @return List of service instances implementing the interface
156
*/
157
public <T> List<T> loadServiceProviders(Class<T> iface);
158
159
/**
160
* Loads static services from META-INF/services files
161
* @param iface Interface or class type to load
162
* @return List of statically declared service instances
163
*/
164
public <T> List<T> loadStaticServiceProviders(Class<T> iface);
165
166
/**
167
* Loads dynamic services from configuration
168
* @param iface Interface or class type to load
169
* @return List of dynamically configured service instances
170
*/
171
public <T> List<T> loadDynamicServiceProviders(Class<T> iface);
172
173
/**
174
* Gets the class loader used by this service loader
175
* @return ClassLoader instance used for loading services
176
*/
177
public ClassLoader getLoader();
178
179
/**
180
* Checks if dynamic loading is enabled
181
* @return true if dynamic loading is enabled
182
*/
183
public boolean isDynamic();
184
}
185
```
186
187
### Configuration Parameters
188
189
#### Param Class
190
191
Represents a configuration parameter with name, value, and type information.
192
193
```java { .api }
194
/**
195
* Configuration parameter with name, value, and type information
196
*/
197
public class Param<T> {
198
/**
199
* Creates Param with name and value
200
* @param name Parameter name
201
* @param value Parameter value
202
*/
203
public Param(String name, T value);
204
205
/**
206
* Creates Param with name, value, and type
207
* @param name Parameter name
208
* @param value Parameter value
209
* @param type Parameter type class
210
*/
211
public Param(String name, T value, Class<T> type);
212
213
/**
214
* Gets parameter name
215
* @return String containing parameter name
216
*/
217
public String getName();
218
219
/**
220
* Gets parameter value
221
* @return Parameter value of type T
222
*/
223
public T getValue();
224
225
/**
226
* Gets parameter type
227
* @return Class representing parameter type
228
*/
229
public Class<T> getType();
230
231
/**
232
* Sets parameter value
233
* @param value New parameter value
234
*/
235
public void setValue(T value);
236
237
/**
238
* Gets string representation of value
239
* @return String representation of parameter value
240
*/
241
@Override
242
public String toString();
243
}
244
```
245
246
#### ParamField Class
247
248
Descriptor for parameter fields with metadata about configuration parameters.
249
250
```java { .api }
251
/**
252
* Field descriptor for configuration parameters with metadata
253
*/
254
public class ParamField {
255
/**
256
* Creates ParamField for specified field
257
* @param field Field to create descriptor for
258
*/
259
public ParamField(Field field);
260
261
/**
262
* Gets the field name
263
* @return String containing field name
264
*/
265
public String getName();
266
267
/**
268
* Gets the field type
269
* @return Class representing field type
270
*/
271
public Class<?> getType();
272
273
/**
274
* Checks if field is required
275
* @return true if field is required for configuration
276
*/
277
public boolean isRequired();
278
279
/**
280
* Gets default value for field
281
* @return Default value or null if no default
282
*/
283
public Object getDefaultValue();
284
285
/**
286
* Gets field description from annotations
287
* @return String describing field purpose
288
*/
289
public String getDescription();
290
291
/**
292
* Sets field value on target object
293
* @param target Object to set field value on
294
* @param value Value to set
295
* @throws IllegalAccessException if field is not accessible
296
*/
297
public void setValue(Object target, Object value) throws IllegalAccessException;
298
299
/**
300
* Gets field value from target object
301
* @param target Object to get field value from
302
* @return Field value
303
* @throws IllegalAccessException if field is not accessible
304
*/
305
public Object getValue(Object target) throws IllegalAccessException;
306
}
307
```
308
309
### Configuration Base Classes
310
311
#### ConfigBase Class
312
313
Base class for configurable Tika components with parameter injection support.
314
315
```java { .api }
316
/**
317
* Base class for configurable components with parameter injection
318
*/
319
public abstract class ConfigBase {
320
/**
321
* Initializes component with configuration parameters
322
* @param params Map of parameter names to Param objects
323
* @throws TikaConfigException if initialization fails
324
*/
325
public void initialize(Map<String, Param> params) throws TikaConfigException;
326
327
/**
328
* Checks current configuration state
329
* @param handler Problem handler for reporting issues
330
*/
331
public void checkInitialization(InitializableProblemHandler handler);
332
333
/**
334
* Gets all configurable fields for this component
335
* @return List of ParamField descriptors for configurable fields
336
*/
337
public List<ParamField> getConfigurableFields();
338
339
/**
340
* Gets configuration parameter by name
341
* @param name Parameter name
342
* @return Param object or null if not found
343
*/
344
protected Param getParam(String name);
345
346
/**
347
* Sets configuration parameter
348
* @param name Parameter name
349
* @param value Parameter value
350
*/
351
protected void setParam(String name, Object value);
352
353
/**
354
* Validates configuration parameters
355
* @throws TikaConfigException if validation fails
356
*/
357
protected void validateConfig() throws TikaConfigException;
358
}
359
```
360
361
### Problem Handling
362
363
#### InitializableProblemHandler Interface
364
365
Interface for handling problems that occur during component initialization.
366
367
```java { .api }
368
/**
369
* Handler for problems encountered during component initialization
370
*/
371
public interface InitializableProblemHandler {
372
/**
373
* Handles a problem encountered during initialization
374
* @param clazz Class where problem occurred
375
* @param problem Description of the problem
376
*/
377
void handleInitializableProblem(Class<?> clazz, String problem);
378
}
379
```
380
381
#### ParsingProblemHandler Implementation
382
383
Default implementation that collects initialization problems for later analysis.
384
385
```java { .api }
386
/**
387
* Default problem handler that collects initialization issues
388
*/
389
public class ParsingProblemHandler implements InitializableProblemHandler {
390
/**
391
* Creates problem handler for collecting issues
392
*/
393
public ParsingProblemHandler();
394
395
/**
396
* Handles initialization problem by recording it
397
* @param clazz Class where problem occurred
398
* @param problem Description of the problem
399
*/
400
@Override
401
public void handleInitializableProblem(Class<?> clazz, String problem);
402
403
/**
404
* Gets all recorded problems
405
* @return List of problems encountered during initialization
406
*/
407
public List<String> getProblems();
408
409
/**
410
* Checks if any problems were recorded
411
* @return true if problems were encountered
412
*/
413
public boolean hasProblems();
414
415
/**
416
* Gets problems for specific class
417
* @param clazz Class to get problems for
418
* @return List of problems for the specified class
419
*/
420
public List<String> getProblems(Class<?> clazz);
421
}
422
```
423
424
## Configuration File Format
425
426
### XML Configuration Structure
427
428
```xml { .api }
429
<?xml version="1.0" encoding="UTF-8"?>
430
<properties>
431
<!-- MIME Types Configuration -->
432
<mimeTypeRepository resource="custom-mimetypes.xml"/>
433
434
<!-- Detectors Configuration -->
435
<detectors>
436
<detector class="org.apache.tika.detect.DefaultDetector"/>
437
<detector class="org.example.CustomDetector">
438
<params>
439
<param name="threshold" type="int">90</param>
440
<param name="enabled" type="boolean">true</param>
441
</params>
442
</detector>
443
</detectors>
444
445
<!-- Parsers Configuration -->
446
<parsers>
447
<parser class="org.apache.tika.parser.AutoDetectParser"/>
448
<parser class="org.apache.tika.parser.pdf.PDFParser">
449
<params>
450
<param name="extractInlineImages" type="boolean">false</param>
451
<param name="sortByPosition" type="boolean">true</param>
452
</params>
453
</parser>
454
</parsers>
455
456
<!-- Translator Configuration -->
457
<translator class="org.apache.tika.language.translate.DefaultTranslator">
458
<params>
459
<param name="maxStringLength" type="int">10000</param>
460
</params>
461
</translator>
462
463
<!-- Service Loader Configuration -->
464
<service-loader dynamic="true" loadErrorHandler="IGNORE"/>
465
</properties>
466
```
467
468
## Usage Examples
469
470
### Basic Configuration Usage
471
472
```java { .api }
473
// Use default configuration
474
TikaConfig config = TikaConfig.getDefaultConfig();
475
Parser parser = config.getParser();
476
Detector detector = config.getDetector();
477
478
// Parse with configured components
479
Metadata metadata = new Metadata();
480
try (InputStream input = new FileInputStream("document.pdf")) {
481
parser.parse(input, new BodyContentHandler(), metadata, new ParseContext());
482
}
483
```
484
485
### Custom Configuration Loading
486
487
```java { .api }
488
// Load configuration from file
489
try {
490
TikaConfig config = new TikaConfig(new File("tika-config.xml"));
491
492
// Get configured components
493
Parser parser = config.getParser();
494
Detector detector = config.getDetector();
495
Translator translator = config.getTranslator();
496
497
} catch (TikaException | IOException e) {
498
System.err.println("Configuration error: " + e.getMessage());
499
}
500
501
// Load from classpath resource
502
TikaConfig config = new TikaConfig("/org/example/custom-tika.xml");
503
```
504
505
### Working with Service Loader
506
507
```java { .api }
508
// Create service loader with custom class loader
509
ClassLoader customLoader = Thread.currentThread().getContextClassLoader();
510
ServiceLoader serviceLoader = new ServiceLoader(customLoader, true);
511
512
// Load parser services
513
List<Parser> parsers = serviceLoader.loadServiceProviders(Parser.class);
514
System.out.println("Found " + parsers.size() + " parser services");
515
516
// Load detector services
517
List<Detector> detectors = serviceLoader.loadServiceProviders(Detector.class);
518
for (Detector detector : detectors) {
519
System.out.println("Detector: " + detector.getClass().getName());
520
}
521
```
522
523
### Parameter Configuration
524
525
```java { .api }
526
// Get parser configuration
527
TikaConfig config = TikaConfig.getDefaultConfig();
528
Map<String, Param> pdfConfig = config.getParserConfig(PDFParser.class);
529
530
// Check specific parameter
531
Param extractImages = pdfConfig.get("extractInlineImages");
532
if (extractImages != null) {
533
System.out.println("Extract images: " + extractImages.getValue());
534
}
535
536
// Create custom parameters
537
Map<String, Param> customParams = new HashMap<>();
538
customParams.put("maxStringLength", new Param<>("maxStringLength", 100000, Integer.class));
539
customParams.put("enableOCR", new Param<>("enableOCR", true, Boolean.class));
540
```
541
542
### Configurable Component Implementation
543
544
```java { .api }
545
public class CustomParser extends ConfigBase implements Parser {
546
private int maxDocuments = 1000;
547
private boolean verbose = false;
548
private String outputFormat = "text";
549
550
@Override
551
public void initialize(Map<String, Param> params) throws TikaConfigException {
552
super.initialize(params);
553
554
Param maxDocs = getParam("maxDocuments");
555
if (maxDocs != null) {
556
this.maxDocuments = (Integer) maxDocs.getValue();
557
}
558
559
Param verboseParam = getParam("verbose");
560
if (verboseParam != null) {
561
this.verbose = (Boolean) verboseParam.getValue();
562
}
563
564
validateConfig();
565
}
566
567
@Override
568
protected void validateConfig() throws TikaConfigException {
569
if (maxDocuments <= 0) {
570
throw new TikaConfigException("maxDocuments must be positive");
571
}
572
}
573
574
@Override
575
public void parse(InputStream stream, ContentHandler handler,
576
Metadata metadata, ParseContext context)
577
throws IOException, SAXException, TikaException {
578
// Implementation using configured parameters
579
if (verbose) {
580
System.out.println("Parsing with maxDocuments=" + maxDocuments);
581
}
582
}
583
584
@Override
585
public Set<MediaType> getSupportedTypes(ParseContext context) {
586
return Collections.singleton(MediaType.TEXT_PLAIN);
587
}
588
}
589
```
590
591
### Problem Handling
592
593
```java { .api }
594
// Handle initialization problems
595
ParsingProblemHandler problemHandler = new ParsingProblemHandler();
596
597
try {
598
TikaConfig config = new TikaConfig("config-with-issues.xml");
599
600
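// Check for initialization problems; NOTE: problemHandler must first be passed to a component's checkInitialization(...) to receive any reports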
601
config.getParser(); // This might trigger initialization
602
603
if (problemHandler.hasProblems()) {
604
for (String problem : problemHandler.getProblems()) {
605
System.err.println("Configuration issue: " + problem);
606
}
607
}
608
609
} catch (TikaException e) {
610
System.err.println("Fatal configuration error: " + e.getMessage());
611
}
612
```