# Document Processing

Convert various file formats to Haystack Document objects and preprocess text for optimal retrieval. Supports PDF, HTML, Office documents, images, and text preprocessing operations.

## Capabilities

### PDF Processing

Extract text and content from PDF files using different parsing backends.

```python { .api }
class PyPDFToDocument:
    def __init__(
        self,
        converter_name: str = "PyPDFToDocument",
        extract_images: bool = False
    ) -> None:
        """
        Initialize PyPDF document converter.

        Args:
            converter_name: Name identifier for the converter
            extract_images: Whether to extract images from PDFs
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert PDF files to Document objects.

        Args:
            sources: List of file paths, Path objects, or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class PDFMinerToDocument:
    def __init__(
        self,
        extract_images: bool = False,
        laparams: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Initialize PDFMiner document converter.

        Args:
            extract_images: Whether to extract images from PDFs
            laparams: LAParams configuration for PDFMiner
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert PDF files using PDFMiner backend."""
```

### Office Document Processing

Extract content from Microsoft Office documents and other office formats.

```python { .api }
class DOCXToDocument:
    def __init__(self) -> None:
        """Initialize DOCX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert DOCX files to Document objects.

        Args:
            sources: List of DOCX file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class PPTXToDocument:
    def __init__(self) -> None:
        """Initialize PPTX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert PowerPoint files to Document objects."""

class XLSXToDocument:
    def __init__(
        self,
        table_format: Literal["csv", "table"] = "csv"
    ) -> None:
        """
        Initialize XLSX document converter.

        Args:
            table_format: Format for table conversion ('csv' or 'table')
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert Excel files to Document objects."""

class MSGToDocument:
    def __init__(self) -> None:
        """Initialize MSG (Outlook message) document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert MSG files to Document objects."""
```

### Web Content Processing

Extract and convert web content and markup formats.

```python { .api }
class HTMLToDocument:
    def __init__(
        self,
        extractor_type: Literal["trafilatura", "default"] = "trafilatura",
        extraction_kwargs: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Initialize HTML document converter.

        Args:
            extractor_type: HTML extraction backend to use
            extraction_kwargs: Additional extraction parameters
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert HTML files to Document objects.

        Args:
            sources: List of HTML file paths, URLs, or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class MarkdownToDocument:
    def __init__(self) -> None:
        """Initialize Markdown document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert Markdown files to Document objects."""
```

### Text and Data Processing

Handle plain text files and structured data formats.

```python { .api }
class TextFileToDocument:
    def __init__(
        self,
        encoding: str = "utf-8"
    ) -> None:
        """
        Initialize text file converter.

        Args:
            encoding: Character encoding for text files
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert text files to Document objects.

        Args:
            sources: List of text file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class CSVToDocument:
    def __init__(
        self,
        delimiter: str = ",",
        quotechar: str = '"',
        encoding: str = "utf-8"
    ) -> None:
        """
        Initialize CSV document converter.

        Args:
            delimiter: CSV field delimiter
            quotechar: CSV quote character
            encoding: Character encoding for CSV files
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert CSV files to Document objects."""

class JSONConverter:
    def __init__(
        self,
        jq_schema: str = ".",
        content_key: Optional[str] = None,
        extra_meta_fields: Optional[List[str]] = None
    ) -> None:
        """
        Initialize JSON converter.

        Args:
            jq_schema: JQ query string for data extraction
            content_key: JSON key containing document content
            extra_meta_fields: Additional fields to extract as metadata
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert JSON files to Document objects."""
```

### Multi-Format Processing

Handle multiple file formats with automatic format detection.

```python { .api }
class MultiFileConverter:
    def __init__(
        self,
        file_converters: Dict[str, Any] = None,
        fallback_converter: Optional[Any] = None
    ) -> None:
        """
        Initialize multi-format file converter.

        Args:
            file_converters: Dictionary mapping file extensions to converter instances
            fallback_converter: Default converter for unrecognized file types
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert files using appropriate converters based on file type.

        Args:
            sources: List of file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """
```

### OCR and Advanced Processing

Extract text from images and scanned documents using OCR.

```python { .api }
class AzureOCRDocumentConverter:
    def __init__(
        self,
        endpoint: str,
        api_key: Secret,
        model_id: str = "prebuilt-read"
    ) -> None:
        """
        Initialize Azure OCR document converter.

        Args:
            endpoint: Azure Form Recognizer endpoint
            api_key: Azure Form Recognizer API key
            model_id: OCR model to use
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert images and scanned documents using Azure OCR.

        Args:
            sources: List of image file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing OCR-extracted text
        """

class TikaDocumentConverter:
    def __init__(
        self,
        tika_url: str = "http://localhost:9998/tika"
    ) -> None:
        """
        Initialize Apache Tika document converter.

        Args:
            tika_url: URL of the Tika server
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert various file formats using Apache Tika."""
```

### Document Splitting and Preprocessing

Split documents into smaller chunks and clean text for better retrieval performance.

```python { .api }
class DocumentSplitter:
    def __init__(
        self,
        split_by: Literal["word", "sentence", "passage", "page"] = "word",
        split_length: int = 200,
        split_overlap: int = 0
    ) -> None:
        """
        Initialize document splitter.

        Args:
            split_by: Unit to split by
            split_length: Length of each split
            split_overlap: Overlap between consecutive splits
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents into smaller chunks.

        Args:
            documents: List of documents to split

        Returns:
            Dictionary with 'documents' key containing split documents
        """

class RecursiveDocumentSplitter:
    def __init__(
        self,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        respect_sentence_boundary: bool = False,
        language: str = "en"
    ) -> None:
        """
        Initialize recursive document splitter.

        Args:
            chunk_size: Target size for each chunk
            chunk_overlap: Overlap between chunks
            separators: List of separators to try in order
            keep_separator: Whether to keep separators in chunks
            respect_sentence_boundary: Whether to respect sentence boundaries
            language: Language for sentence boundary detection
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Split documents recursively using multiple separators."""

class HierarchicalDocumentSplitter:
    def __init__(
        self,
        chunk_sizes: List[int] = None,
        chunk_overlap: int = 0,
        separators: Optional[Dict[int, List[str]]] = None
    ) -> None:
        """
        Initialize hierarchical document splitter.

        Args:
            chunk_sizes: List of chunk sizes for different hierarchy levels
            chunk_overlap: Overlap between chunks
            separators: Separators for each hierarchy level
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Split documents hierarchically at multiple levels."""

class DocumentCleaner:
    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None
    ) -> None:
        """
        Initialize document cleaner.

        Args:
            remove_extra_whitespaces: Remove extra whitespace characters
            remove_repeated_substrings: Remove repeated substrings
            remove_substrings: Specific substrings to remove
            remove_regex: Regex pattern for content removal
            unicode_normalization: Unicode normalization form
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Clean and normalize document content.

        Args:
            documents: List of documents to clean

        Returns:
            Dictionary with 'documents' key containing cleaned documents
        """

class TextCleaner:
    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None
    ) -> None:
        """Initialize text cleaner with same parameters as DocumentCleaner."""

    def run(self, text: str) -> Dict[str, str]:
        """
        Clean and normalize text content.

        Args:
            text: Input text to clean

        Returns:
            Dictionary with 'text' key containing cleaned text
        """
```

## Usage Examples

### Basic Document Conversion

```python
from haystack.components.converters import PyPDFToDocument
from pathlib import Path

# Initialize PDF converter
converter = PyPDFToDocument()

# Convert PDF files
pdf_files = ["document1.pdf", "document2.pdf"]
result = converter.run(sources=pdf_files)

documents = result["documents"]
for doc in documents:
    print(f"Content: {doc.content[:100]}...")
    print(f"Metadata: {doc.meta}")
    print()
```

### Multi-Format Processing Pipeline

```python
from haystack import Pipeline
from haystack.components.converters import MultiFileConverter, PyPDFToDocument, HTMLToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter

# Set up converters for different file types
file_converters = {
    ".pdf": PyPDFToDocument(),
    ".html": HTMLToDocument(),
    ".txt": TextFileToDocument()
}

# Create pipeline
pipeline = Pipeline()
pipeline.add_component("converter", MultiFileConverter(file_converters=file_converters))
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))

# Connect components
pipeline.connect("converter.documents", "splitter.documents")

# Process mixed file types
mixed_files = ["report.pdf", "webpage.html", "notes.txt"]
result = pipeline.run({"converter": {"sources": mixed_files}})

split_documents = result["splitter"]["documents"]
print(f"Processed {len(split_documents)} document chunks")
```

### Advanced Text Preprocessing

```python
from haystack import Pipeline
from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentCleaner, RecursiveDocumentSplitter

# Create preprocessing pipeline
pipeline = Pipeline()
pipeline.add_component("converter", HTMLToDocument())
pipeline.add_component("cleaner", DocumentCleaner(
    remove_extra_whitespaces=True,
    remove_repeated_substrings=True,
    remove_regex=r'\[.*?\]'  # Remove content in brackets
))
pipeline.add_component("splitter", RecursiveDocumentSplitter(
    chunk_size=300,
    chunk_overlap=50,
    respect_sentence_boundary=True
))

# Connect components
pipeline.connect("converter.documents", "cleaner.documents")
pipeline.connect("cleaner.documents", "splitter.documents")

# Process HTML content
html_files = ["article.html", "blog_post.html"]
result = pipeline.run({"converter": {"sources": html_files}})

processed_docs = result["splitter"]["documents"]
for doc in processed_docs[:3]:  # Show first 3 chunks
    print(f"Chunk: {doc.content}")
    print(f"Length: {len(doc.content)}")
    print("---")
```

### CSV Data Processing

```python
from haystack.components.converters import CSVToDocument

# Process CSV with custom parameters
csv_converter = CSVToDocument(
    delimiter=";",
    encoding="utf-8"
)

# Convert CSV files
result = csv_converter.run(sources=["data.csv"])
documents = result["documents"]

# Each row becomes a document
for doc in documents[:3]:
    print(f"Row data: {doc.content}")
    print(f"Metadata: {doc.meta}")
    print()
```

## Types

```python { .api }
from typing import Union, List, Dict, Any, Optional, Literal
from pathlib import Path
from haystack import Document
from haystack.dataclasses import ByteStream
from haystack.utils import Secret

class Span:
    start: int
    end: int
```