# Document Processing

Convert various file formats to Haystack Document objects and preprocess text for optimal retrieval. Supports PDF, HTML, Office documents, images, and text preprocessing operations.

## Capabilities

### PDF Processing

Extract text and content from PDF files using different parsing backends.

```python { .api }
class PyPDFToDocument:
    def __init__(
        self,
        converter_name: str = "PyPDFToDocument",
        extract_images: bool = False
    ) -> None:
        """
        Initialize PyPDF document converter.

        Args:
            converter_name: Name identifier for the converter
            extract_images: Whether to extract images from PDFs
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert PDF files to Document objects.

        Args:
            sources: List of file paths, Path objects, or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class PDFMinerToDocument:
    def __init__(
        self,
        extract_images: bool = False,
        laparams: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Initialize PDFMiner document converter.

        Args:
            extract_images: Whether to extract images from PDFs
            laparams: LAParams configuration for PDFMiner
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert PDF files using PDFMiner backend."""
```

### Office Document Processing

Extract content from Microsoft Office documents and other office formats.

```python { .api }
class DOCXToDocument:
    def __init__(self) -> None:
        """Initialize DOCX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert DOCX files to Document objects.

        Args:
            sources: List of DOCX file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class PPTXToDocument:
    def __init__(self) -> None:
        """Initialize PPTX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert PowerPoint files to Document objects."""

class XLSXToDocument:
    def __init__(
        self,
        table_format: Literal["csv", "table"] = "csv"
    ) -> None:
        """
        Initialize XLSX document converter.

        Args:
            table_format: Format for table conversion ('csv' or 'table')
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert Excel files to Document objects."""

class MSGToDocument:
    def __init__(self) -> None:
        """Initialize MSG (Outlook message) document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert MSG files to Document objects."""
```

### Web Content Processing

Extract and convert web content and markup formats.

```python { .api }
class HTMLToDocument:
    def __init__(
        self,
        extractor_type: Literal["trafilatura", "default"] = "trafilatura",
        extraction_kwargs: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Initialize HTML document converter.

        Args:
            extractor_type: HTML extraction backend to use
            extraction_kwargs: Additional extraction parameters
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert HTML files to Document objects.

        Args:
            sources: List of HTML file paths, URLs, or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class MarkdownToDocument:
    def __init__(self) -> None:
        """Initialize Markdown document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert Markdown files to Document objects."""
```

### Text and Data Processing

Handle plain text files and structured data formats.

```python { .api }
class TextFileToDocument:
    def __init__(
        self,
        encoding: str = "utf-8"
    ) -> None:
        """
        Initialize text file converter.

        Args:
            encoding: Character encoding for text files
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert text files to Document objects.

        Args:
            sources: List of text file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class CSVToDocument:
    def __init__(
        self,
        delimiter: str = ",",
        quotechar: str = '"',
        encoding: str = "utf-8"
    ) -> None:
        """
        Initialize CSV document converter.

        Args:
            delimiter: CSV field delimiter
            quotechar: CSV quote character
            encoding: Character encoding for CSV files
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert CSV files to Document objects."""

class JSONConverter:
    def __init__(
        self,
        jq_schema: str = ".",
        content_key: Optional[str] = None,
        extra_meta_fields: Optional[List[str]] = None
    ) -> None:
        """
        Initialize JSON converter.

        Args:
            jq_schema: JQ query string for data extraction
            content_key: JSON key containing document content
            extra_meta_fields: Additional fields to extract as metadata
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert JSON files to Document objects."""
```

### Multi-Format Processing

Handle multiple file formats with automatic format detection.

```python { .api }
class MultiFileConverter:
    def __init__(
        self,
        file_converters: Dict[str, Any] = None,
        fallback_converter: Optional[Any] = None
    ) -> None:
        """
        Initialize multi-format file converter.

        Args:
            file_converters: Dictionary mapping file extensions to converter instances
            fallback_converter: Default converter for unrecognized file types
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert files using appropriate converters based on file type.

        Args:
            sources: List of file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """
```

### OCR and Advanced Processing

Extract text from images and scanned documents using OCR.

```python { .api }
class AzureOCRDocumentConverter:
    def __init__(
        self,
        endpoint: str,
        api_key: Secret,
        model_id: str = "prebuilt-read"
    ) -> None:
        """
        Initialize Azure OCR document converter.

        Args:
            endpoint: Azure Form Recognizer endpoint
            api_key: Azure Form Recognizer API key
            model_id: OCR model to use
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert images and scanned documents using Azure OCR.

        Args:
            sources: List of image file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing OCR-extracted text
        """

class TikaDocumentConverter:
    def __init__(
        self,
        tika_url: str = "http://localhost:9998/tika"
    ) -> None:
        """
        Initialize Apache Tika document converter.

        Args:
            tika_url: URL of the Tika server
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert various file formats using Apache Tika."""
```

### Document Splitting and Preprocessing

Split documents into smaller chunks and clean text for better retrieval performance.

```python { .api }
class DocumentSplitter:
    def __init__(
        self,
        split_by: Literal["word", "sentence", "passage", "page"] = "word",
        split_length: int = 200,
        split_overlap: int = 0
    ) -> None:
        """
        Initialize document splitter.

        Args:
            split_by: Unit to split by
            split_length: Length of each split
            split_overlap: Overlap between consecutive splits
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents into smaller chunks.

        Args:
            documents: List of documents to split

        Returns:
            Dictionary with 'documents' key containing split documents
        """

class RecursiveDocumentSplitter:
    def __init__(
        self,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        respect_sentence_boundary: bool = False,
        language: str = "en"
    ) -> None:
        """
        Initialize recursive document splitter.

        Args:
            chunk_size: Target size for each chunk
            chunk_overlap: Overlap between chunks
            separators: List of separators to try in order
            keep_separator: Whether to keep separators in chunks
            respect_sentence_boundary: Whether to respect sentence boundaries
            language: Language for sentence boundary detection
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Split documents recursively using multiple separators."""

class HierarchicalDocumentSplitter:
    def __init__(
        self,
        chunk_sizes: List[int] = None,
        chunk_overlap: int = 0,
        separators: Optional[Dict[int, List[str]]] = None
    ) -> None:
        """
        Initialize hierarchical document splitter.

        Args:
            chunk_sizes: List of chunk sizes for different hierarchy levels
            chunk_overlap: Overlap between chunks
            separators: Separators for each hierarchy level
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Split documents hierarchically at multiple levels."""

class DocumentCleaner:
    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None
    ) -> None:
        """
        Initialize document cleaner.

        Args:
            remove_extra_whitespaces: Remove extra whitespace characters
            remove_repeated_substrings: Remove repeated substrings
            remove_substrings: Specific substrings to remove
            remove_regex: Regex pattern for content removal
            unicode_normalization: Unicode normalization form
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Clean and normalize document content.

        Args:
            documents: List of documents to clean

        Returns:
            Dictionary with 'documents' key containing cleaned documents
        """

class TextCleaner:
    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None
    ) -> None:
        """Initialize text cleaner with same parameters as DocumentCleaner."""

    def run(self, text: str) -> Dict[str, str]:
        """
        Clean and normalize text content.

        Args:
            text: Input text to clean

        Returns:
            Dictionary with 'text' key containing cleaned text
        """
```

## Usage Examples

### Basic Document Conversion

```python
from haystack.components.converters import PyPDFToDocument
from pathlib import Path

# Initialize PDF converter
converter = PyPDFToDocument()

# Convert PDF files
pdf_files = ["document1.pdf", "document2.pdf"]
result = converter.run(sources=pdf_files)

documents = result["documents"]
for doc in documents:
    print(f"Content: {doc.content[:100]}...")
    print(f"Metadata: {doc.meta}")
    print()
```

### Multi-Format Processing Pipeline

```python
from haystack import Pipeline
from haystack.components.converters import MultiFileConverter, PyPDFToDocument, HTMLToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter

# Set up converters for different file types
file_converters = {
    ".pdf": PyPDFToDocument(),
    ".html": HTMLToDocument(),
    ".txt": TextFileToDocument()
}

# Create pipeline
pipeline = Pipeline()
pipeline.add_component("converter", MultiFileConverter(file_converters=file_converters))
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))

# Connect components
pipeline.connect("converter.documents", "splitter.documents")

# Process mixed file types
mixed_files = ["report.pdf", "webpage.html", "notes.txt"]
result = pipeline.run({"converter": {"sources": mixed_files}})

split_documents = result["splitter"]["documents"]
print(f"Processed {len(split_documents)} document chunks")
```

### Advanced Text Preprocessing

```python
from haystack import Pipeline
from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentCleaner, RecursiveDocumentSplitter

# Create preprocessing pipeline
pipeline = Pipeline()
pipeline.add_component("converter", HTMLToDocument())
pipeline.add_component("cleaner", DocumentCleaner(
    remove_extra_whitespaces=True,
    remove_repeated_substrings=True,
    remove_regex=r'\[.*?\]'  # Remove content in brackets
))
pipeline.add_component("splitter", RecursiveDocumentSplitter(
    chunk_size=300,
    chunk_overlap=50,
    respect_sentence_boundary=True
))

# Connect components
pipeline.connect("converter.documents", "cleaner.documents")
pipeline.connect("cleaner.documents", "splitter.documents")

# Process HTML content
html_files = ["article.html", "blog_post.html"]
result = pipeline.run({"converter": {"sources": html_files}})

processed_docs = result["splitter"]["documents"]
for doc in processed_docs[:3]:  # Show first 3 chunks
    print(f"Chunk: {doc.content}")
    print(f"Length: {len(doc.content)}")
    print("---")
```

### CSV Data Processing

```python
from haystack.components.converters import CSVToDocument

# Process CSV with custom parameters
csv_converter = CSVToDocument(
    delimiter=";",
    encoding="utf-8"
)

# Convert CSV files
result = csv_converter.run(sources=["data.csv"])
documents = result["documents"]

# Each row becomes a document
for doc in documents[:3]:
    print(f"Row data: {doc.content}")
    print(f"Metadata: {doc.meta}")
    print()
```

## Types

```python { .api }
from typing import Union, List, Dict, Any, Optional, Literal
from pathlib import Path
from haystack import Document
from haystack.dataclasses import ByteStream
from haystack.utils import Secret

class Span:
    start: int
    end: int
```