Haystack: an LLM framework for building customizable, production-ready LLM applications.

This module converts various file formats to Haystack Document objects and preprocesses text for optimal retrieval. It supports PDF, HTML, Office documents, images, and common text preprocessing operations.

PDF converters: extract text and content from PDF files using different parsing backends.
class PyPDFToDocument:
    """Convert PDF files to Haystack Document objects using the pypdf backend."""

    def __init__(
        self,
        converter_name: str = "PyPDFToDocument",
        extract_images: bool = False,
    ) -> None:
        """
        Initialize the PyPDF document converter.

        Args:
            converter_name: Name identifier for the converter.
            extract_images: Whether to extract images from PDFs.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert PDF files to Document objects.

        Args:
            sources: List of file paths, Path objects, or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class PDFMinerToDocument:
    """Convert PDF files to Haystack Document objects using the PDFMiner backend."""

    def __init__(
        self,
        extract_images: bool = False,
        laparams: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Initialize the PDFMiner document converter.

        Args:
            extract_images: Whether to extract images from PDFs.
            laparams: LAParams configuration for PDFMiner.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert PDF files using the PDFMiner backend.

        Args:
            sources: List of file paths, Path objects, or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Extract content from Microsoft Office documents and other office formats.
class DOCXToDocument:
    """Convert Microsoft Word (DOCX) files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the DOCX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert DOCX files to Document objects.

        Args:
            sources: List of DOCX file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class PPTXToDocument:
    """Convert PowerPoint (PPTX) files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the PPTX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert PowerPoint files to Document objects.

        Args:
            sources: List of PPTX file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class XLSXToDocument:
    """Convert Excel (XLSX) files to Haystack Document objects."""

    def __init__(
        self,
        table_format: Literal["csv", "table"] = "csv",
    ) -> None:
        """
        Initialize the XLSX document converter.

        Args:
            table_format: Format for table conversion ('csv' or 'table').
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert Excel files to Document objects.

        Args:
            sources: List of XLSX file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class MSGToDocument:
    """Convert Outlook message (MSG) files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the MSG (Outlook message) document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert MSG files to Document objects.

        Args:
            sources: List of MSG file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Extract and convert web content and markup formats.
class HTMLToDocument:
    """Convert HTML content to Haystack Document objects."""

    def __init__(
        self,
        extractor_type: Literal["trafilatura", "default"] = "trafilatura",
        extraction_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Initialize the HTML document converter.

        Args:
            extractor_type: HTML extraction backend to use.
            extraction_kwargs: Additional extraction parameters.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert HTML files to Document objects.

        Args:
            sources: List of HTML file paths, URLs, or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class MarkdownToDocument:
    """Convert Markdown files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the Markdown document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert Markdown files to Document objects.

        Args:
            sources: List of Markdown file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Handle plain text files and structured data formats.
class TextFileToDocument:
    """Convert plain text files to Haystack Document objects."""

    def __init__(
        self,
        encoding: str = "utf-8",
    ) -> None:
        """
        Initialize the text file converter.

        Args:
            encoding: Character encoding for text files.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert text files to Document objects.

        Args:
            sources: List of text file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class CSVToDocument:
    """Convert CSV files to Haystack Document objects."""

    def __init__(
        self,
        delimiter: str = ",",
        quotechar: str = '"',
        encoding: str = "utf-8",
    ) -> None:
        """
        Initialize the CSV document converter.

        Args:
            delimiter: CSV field delimiter.
            quotechar: CSV quote character.
            encoding: Character encoding for CSV files.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert CSV files to Document objects.

        Args:
            sources: List of CSV file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class JSONConverter:
    """Convert JSON files to Haystack Document objects using jq-style extraction."""

    def __init__(
        self,
        jq_schema: str = ".",
        content_key: Optional[str] = None,
        extra_meta_fields: Optional[List[str]] = None,
    ) -> None:
        """
        Initialize the JSON converter.

        Args:
            jq_schema: JQ query string for data extraction.
            content_key: JSON key containing document content.
            extra_meta_fields: Additional fields to extract as metadata.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert JSON files to Document objects.

        Args:
            sources: List of JSON file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Handle multiple file formats with automatic format detection.
class MultiFileConverter:
    """Dispatch files to format-specific converters based on file extension."""

    def __init__(
        self,
        # Annotation fixed: the default is None, so the type must be Optional.
        file_converters: Optional[Dict[str, Any]] = None,
        fallback_converter: Optional[Any] = None,
    ) -> None:
        """
        Initialize the multi-format file converter.

        Args:
            file_converters: Dictionary mapping file extensions to converter instances.
            fallback_converter: Default converter for unrecognized file types.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert files using appropriate converters based on file type.

        Args:
            sources: List of file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Extract text from images and scanned documents using OCR.
class AzureOCRDocumentConverter:
    """Convert images and scanned documents to Document objects via Azure OCR."""

    def __init__(
        self,
        endpoint: str,
        api_key: Secret,
        model_id: str = "prebuilt-read",
    ) -> None:
        """
        Initialize the Azure OCR document converter.

        Args:
            endpoint: Azure Form Recognizer endpoint.
            api_key: Azure Form Recognizer API key.
            model_id: OCR model to use.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert images and scanned documents using Azure OCR.

        Args:
            sources: List of image file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing OCR-extracted text.
        """
class TikaDocumentConverter:
    """Convert many file formats to Document objects using an Apache Tika server."""

    def __init__(
        self,
        tika_url: str = "http://localhost:9998/tika",
    ) -> None:
        """
        Initialize the Apache Tika document converter.

        Args:
            tika_url: URL of the Tika server.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert various file formats using Apache Tika.

        Args:
            sources: List of file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Split documents into smaller chunks and clean text for better retrieval performance.
class DocumentSplitter:
    """Split documents into smaller chunks by word, sentence, passage, or page."""

    def __init__(
        self,
        split_by: Literal["word", "sentence", "passage", "page"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
    ) -> None:
        """
        Initialize the document splitter.

        Args:
            split_by: Unit to split by.
            split_length: Length of each split.
            split_overlap: Overlap between consecutive splits.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents into smaller chunks.

        Args:
            documents: List of documents to split.

        Returns:
            Dictionary with a 'documents' key containing the split documents.
        """
class RecursiveDocumentSplitter:
    """Split documents by trying a sequence of separators recursively."""

    def __init__(
        self,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        respect_sentence_boundary: bool = False,
        language: str = "en",
    ) -> None:
        """
        Initialize the recursive document splitter.

        Args:
            chunk_size: Target size for each chunk.
            chunk_overlap: Overlap between chunks.
            separators: List of separators to try in order.
            keep_separator: Whether to keep separators in chunks.
            respect_sentence_boundary: Whether to respect sentence boundaries.
            language: Language for sentence boundary detection.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents recursively using multiple separators.

        Args:
            documents: List of documents to split.

        Returns:
            Dictionary with a 'documents' key containing the split documents.
        """
class HierarchicalDocumentSplitter:
    """Split documents into a hierarchy of chunks at multiple granularity levels."""

    def __init__(
        self,
        # Annotation fixed: the default is None, so the type must be Optional.
        chunk_sizes: Optional[List[int]] = None,
        chunk_overlap: int = 0,
        separators: Optional[Dict[int, List[str]]] = None,
    ) -> None:
        """
        Initialize the hierarchical document splitter.

        Args:
            chunk_sizes: List of chunk sizes for different hierarchy levels.
            chunk_overlap: Overlap between chunks.
            separators: Separators for each hierarchy level.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents hierarchically at multiple levels.

        Args:
            documents: List of documents to split.

        Returns:
            Dictionary with a 'documents' key containing the split documents.
        """
class DocumentCleaner:
    """Clean and normalize the content of Document objects."""

    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
    ) -> None:
        """
        Initialize the document cleaner.

        Args:
            remove_extra_whitespaces: Remove extra whitespace characters.
            remove_repeated_substrings: Remove repeated substrings.
            remove_substrings: Specific substrings to remove.
            remove_regex: Regex pattern for content removal.
            unicode_normalization: Unicode normalization form.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Clean and normalize document content.

        Args:
            documents: List of documents to clean.

        Returns:
            Dictionary with a 'documents' key containing the cleaned documents.
        """
class TextCleaner:
    """Clean and normalize a plain text string (string-level analog of DocumentCleaner)."""

    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
    ) -> None:
        """
        Initialize the text cleaner.

        Takes the same parameters as DocumentCleaner but operates on raw strings.

        Args:
            remove_extra_whitespaces: Remove extra whitespace characters.
            remove_repeated_substrings: Remove repeated substrings.
            remove_substrings: Specific substrings to remove.
            remove_regex: Regex pattern for content removal.
            unicode_normalization: Unicode normalization form.
        """

    def run(self, text: str) -> Dict[str, str]:
        """
        Clean and normalize text content.

        Args:
            text: Input text to clean.

        Returns:
            Dictionary with a 'text' key containing the cleaned text.
        """


from haystack.components.converters import PyPDFToDocument
from pathlib import Path

# Example: convert PDF files with PyPDFToDocument.
converter = PyPDFToDocument()

# Convert PDF files.
pdf_files = ["document1.pdf", "document2.pdf"]
result = converter.run(sources=pdf_files)
documents = result["documents"]

for doc in documents:
    print(f"Content: {doc.content[:100]}...")
    print(f"Metadata: {doc.meta}")
    print()

from haystack import Pipeline
from haystack.components.converters import MultiFileConverter, PyPDFToDocument, HTMLToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter

# Example: route mixed file types through a conversion + splitting pipeline.
# Set up converters for different file types.
file_converters = {
    ".pdf": PyPDFToDocument(),
    ".html": HTMLToDocument(),
    ".txt": TextFileToDocument(),
}

# Create pipeline and connect components.
pipeline = Pipeline()
pipeline.add_component("converter", MultiFileConverter(file_converters=file_converters))
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
pipeline.connect("converter.documents", "splitter.documents")

# Process mixed file types.
mixed_files = ["report.pdf", "webpage.html", "notes.txt"]
result = pipeline.run({"converter": {"sources": mixed_files}})
split_documents = result["splitter"]["documents"]
print(f"Processed {len(split_documents)} document chunks")

from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentCleaner, RecursiveDocumentSplitter

# Example: clean and recursively split HTML content.
pipeline = Pipeline()
pipeline.add_component("converter", HTMLToDocument())
pipeline.add_component("cleaner", DocumentCleaner(
    remove_extra_whitespaces=True,
    remove_repeated_substrings=True,
    remove_regex=r'\[.*?\]',  # Remove content in brackets
))
pipeline.add_component("splitter", RecursiveDocumentSplitter(
    chunk_size=300,
    chunk_overlap=50,
    respect_sentence_boundary=True,
))

# Connect components.
pipeline.connect("converter.documents", "cleaner.documents")
pipeline.connect("cleaner.documents", "splitter.documents")

# Process HTML content.
html_files = ["article.html", "blog_post.html"]
result = pipeline.run({"converter": {"sources": html_files}})
processed_docs = result["splitter"]["documents"]
for doc in processed_docs[:3]:  # Show first 3 chunks
    print(f"Chunk: {doc.content}")
    print(f"Length: {len(doc.content)}")
    print("---")

from haystack.components.converters import CSVToDocument
# Example: process CSV with custom parameters.
csv_converter = CSVToDocument(
    delimiter=";",
    encoding="utf-8",
)

# Convert CSV files.
result = csv_converter.run(sources=["data.csv"])
documents = result["documents"]

# Each row becomes a document.
for doc in documents[:3]:
    print(f"Row data: {doc.content}")
    print(f"Metadata: {doc.meta}")
    print()

from typing import Union, List, Dict, Any, Optional, Literal
from pathlib import Path
from haystack import Document
from haystack.dataclasses import ByteStream
from haystack.utils import Secret
class Span:
    """A span of text delimited by start and end character offsets."""

    start: int  # start offset
    end: int    # end offset (presumably exclusive — confirm against usage)


# Install with Tessl CLI
# npx tessl i tessl/pypi-haystack-ai