<!-- docs/document-processing.md -->

# Document Processing

Convert various file formats to Haystack Document objects and preprocess text for optimal retrieval. Supports PDF, HTML, Office documents, images, and text preprocessing operations.

## Capabilities

### PDF Processing

Extract text and content from PDF files using different parsing backends.

```python { .api }
class PyPDFToDocument:
    def __init__(
        self,
        converter_name: str = "PyPDFToDocument",
        extract_images: bool = False
    ) -> None:
        """
        Initialize PyPDF document converter.

        Args:
            converter_name: Name identifier for the converter
            extract_images: Whether to extract images from PDFs
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert PDF files to Document objects.

        Args:
            sources: List of file paths, Path objects, or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class PDFMinerToDocument:
    def __init__(
        self,
        extract_images: bool = False,
        laparams: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Initialize PDFMiner document converter.

        Args:
            extract_images: Whether to extract images from PDFs
            laparams: LAParams configuration for PDFMiner
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert PDF files using PDFMiner backend."""
```

### Office Document Processing

Extract content from Microsoft Office documents and other office formats.

```python { .api }
class DOCXToDocument:
    def __init__(self) -> None:
        """Initialize DOCX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert DOCX files to Document objects.

        Args:
            sources: List of DOCX file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class PPTXToDocument:
    def __init__(self) -> None:
        """Initialize PPTX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert PowerPoint files to Document objects."""

class XLSXToDocument:
    def __init__(
        self,
        table_format: Literal["csv", "table"] = "csv"
    ) -> None:
        """
        Initialize XLSX document converter.

        Args:
            table_format: Format for table conversion ('csv' or 'table')
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert Excel files to Document objects."""

class MSGToDocument:
    def __init__(self) -> None:
        """Initialize MSG (Outlook message) document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert MSG files to Document objects."""
```

### Web Content Processing

Extract and convert web content and markup formats.

```python { .api }
class HTMLToDocument:
    def __init__(
        self,
        extractor_type: Literal["trafilatura", "default"] = "trafilatura",
        extraction_kwargs: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Initialize HTML document converter.

        Args:
            extractor_type: HTML extraction backend to use
            extraction_kwargs: Additional extraction parameters
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert HTML files to Document objects.

        Args:
            sources: List of HTML file paths, URLs, or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class MarkdownToDocument:
    def __init__(self) -> None:
        """Initialize Markdown document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert Markdown files to Document objects."""
```

### Text and Data Processing

Handle plain text files and structured data formats.

```python { .api }
class TextFileToDocument:
    def __init__(
        self,
        encoding: str = "utf-8"
    ) -> None:
        """
        Initialize text file converter.

        Args:
            encoding: Character encoding for text files
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert text files to Document objects.

        Args:
            sources: List of text file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """

class CSVToDocument:
    def __init__(
        self,
        delimiter: str = ",",
        quotechar: str = '"',
        encoding: str = "utf-8"
    ) -> None:
        """
        Initialize CSV document converter.

        Args:
            delimiter: CSV field delimiter
            quotechar: CSV quote character
            encoding: Character encoding for CSV files
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert CSV files to Document objects."""

class JSONConverter:
    def __init__(
        self,
        jq_schema: str = ".",
        content_key: Optional[str] = None,
        extra_meta_fields: Optional[List[str]] = None
    ) -> None:
        """
        Initialize JSON converter.

        Args:
            jq_schema: JQ query string for data extraction
            content_key: JSON key containing document content
            extra_meta_fields: Additional fields to extract as metadata
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert JSON files to Document objects."""
```

### Multi-Format Processing

Handle multiple file formats with automatic format detection.

```python { .api }
class MultiFileConverter:
    def __init__(
        self,
        file_converters: Optional[Dict[str, Any]] = None,
        fallback_converter: Optional[Any] = None
    ) -> None:
        """
        Initialize multi-format file converter.

        Args:
            file_converters: Dictionary mapping file extensions to converter instances
            fallback_converter: Default converter for unrecognized file types
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert files using appropriate converters based on file type.

        Args:
            sources: List of file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing converted documents
        """
```

### OCR and Advanced Processing

Extract text from images and scanned documents using OCR.

```python { .api }
class AzureOCRDocumentConverter:
    def __init__(
        self,
        endpoint: str,
        api_key: Secret,
        model_id: str = "prebuilt-read"
    ) -> None:
        """
        Initialize Azure OCR document converter.

        Args:
            endpoint: Azure Form Recognizer endpoint
            api_key: Azure Form Recognizer API key
            model_id: OCR model to use
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """
        Convert images and scanned documents using Azure OCR.

        Args:
            sources: List of image file paths or ByteStream objects
            meta: Optional metadata to attach to documents

        Returns:
            Dictionary with 'documents' key containing OCR-extracted text
        """

class TikaDocumentConverter:
    def __init__(
        self,
        tika_url: str = "http://localhost:9998/tika"
    ) -> None:
        """
        Initialize Apache Tika document converter.

        Args:
            tika_url: URL of the Tika server
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None
    ) -> Dict[str, List[Document]]:
        """Convert various file formats using Apache Tika."""
```

### Document Splitting and Preprocessing

Split documents into smaller chunks and clean text for better retrieval performance.

```python { .api }
class DocumentSplitter:
    def __init__(
        self,
        split_by: Literal["word", "sentence", "passage", "page"] = "word",
        split_length: int = 200,
        split_overlap: int = 0
    ) -> None:
        """
        Initialize document splitter.

        Args:
            split_by: Unit to split by
            split_length: Length of each split
            split_overlap: Overlap between consecutive splits
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents into smaller chunks.

        Args:
            documents: List of documents to split

        Returns:
            Dictionary with 'documents' key containing split documents
        """

class RecursiveDocumentSplitter:
    def __init__(
        self,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        respect_sentence_boundary: bool = False,
        language: str = "en"
    ) -> None:
        """
        Initialize recursive document splitter.

        Args:
            chunk_size: Target size for each chunk
            chunk_overlap: Overlap between chunks
            separators: List of separators to try in order
            keep_separator: Whether to keep separators in chunks
            respect_sentence_boundary: Whether to respect sentence boundaries
            language: Language for sentence boundary detection
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Split documents recursively using multiple separators."""

class HierarchicalDocumentSplitter:
    def __init__(
        self,
        chunk_sizes: Optional[List[int]] = None,
        chunk_overlap: int = 0,
        separators: Optional[Dict[int, List[str]]] = None
    ) -> None:
        """
        Initialize hierarchical document splitter.

        Args:
            chunk_sizes: List of chunk sizes for different hierarchy levels
            chunk_overlap: Overlap between chunks
            separators: Separators for each hierarchy level
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Split documents hierarchically at multiple levels."""

class DocumentCleaner:
    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None
    ) -> None:
        """
        Initialize document cleaner.

        Args:
            remove_extra_whitespaces: Remove extra whitespace characters
            remove_repeated_substrings: Remove repeated substrings
            remove_substrings: Specific substrings to remove
            remove_regex: Regex pattern for content removal
            unicode_normalization: Unicode normalization form
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Clean and normalize document content.

        Args:
            documents: List of documents to clean

        Returns:
            Dictionary with 'documents' key containing cleaned documents
        """

class TextCleaner:
    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None
    ) -> None:
        """Initialize text cleaner with same parameters as DocumentCleaner."""

    def run(self, text: str) -> Dict[str, str]:
        """
        Clean and normalize text content.

        Args:
            text: Input text to clean

        Returns:
            Dictionary with 'text' key containing cleaned text
        """
```

## Usage Examples

### Basic Document Conversion

```python
from haystack.components.converters import PyPDFToDocument
from pathlib import Path

# Initialize PDF converter
converter = PyPDFToDocument()

# Convert PDF files
pdf_files = ["document1.pdf", "document2.pdf"]
result = converter.run(sources=pdf_files)

documents = result["documents"]
for doc in documents:
    print(f"Content: {doc.content[:100]}...")
    print(f"Metadata: {doc.meta}")
    print()
```

### Multi-Format Processing Pipeline

```python
from haystack import Pipeline
from haystack.components.converters import MultiFileConverter, PyPDFToDocument, HTMLToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter

# Set up converters for different file types
file_converters = {
    ".pdf": PyPDFToDocument(),
    ".html": HTMLToDocument(),
    ".txt": TextFileToDocument()
}

# Create pipeline
pipeline = Pipeline()
pipeline.add_component("converter", MultiFileConverter(file_converters=file_converters))
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))

# Connect components
pipeline.connect("converter.documents", "splitter.documents")

# Process mixed file types
mixed_files = ["report.pdf", "webpage.html", "notes.txt"]
result = pipeline.run({"converter": {"sources": mixed_files}})

split_documents = result["splitter"]["documents"]
print(f"Processed {len(split_documents)} document chunks")
```

### Advanced Text Preprocessing

```python
from haystack import Pipeline
from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentCleaner, RecursiveDocumentSplitter

# Create preprocessing pipeline
pipeline = Pipeline()
pipeline.add_component("converter", HTMLToDocument())
pipeline.add_component("cleaner", DocumentCleaner(
    remove_extra_whitespaces=True,
    remove_repeated_substrings=True,
    remove_regex=r'\[.*?\]'  # Remove content in brackets
))
pipeline.add_component("splitter", RecursiveDocumentSplitter(
    chunk_size=300,
    chunk_overlap=50,
    respect_sentence_boundary=True
))

# Connect components
pipeline.connect("converter.documents", "cleaner.documents")
pipeline.connect("cleaner.documents", "splitter.documents")

# Process HTML content
html_files = ["article.html", "blog_post.html"]
result = pipeline.run({"converter": {"sources": html_files}})

processed_docs = result["splitter"]["documents"]
for doc in processed_docs[:3]:  # Show first 3 chunks
    print(f"Chunk: {doc.content}")
    print(f"Length: {len(doc.content)}")
    print("---")
```

### CSV Data Processing

```python
from haystack.components.converters import CSVToDocument

# Process CSV with custom parameters
csv_converter = CSVToDocument(
    delimiter=";",
    encoding="utf-8"
)

# Convert CSV files
result = csv_converter.run(sources=["data.csv"])
documents = result["documents"]

# Each row becomes a document
for doc in documents[:3]:
    print(f"Row data: {doc.content}")
    print(f"Metadata: {doc.meta}")
    print()
```

## Types

```python { .api }
from typing import Union, List, Dict, Any, Optional, Literal
from pathlib import Path
from haystack import Document
from haystack.dataclasses import ByteStream
from haystack.utils import Secret

class Span:
    start: int
    end: int
```