"""
LLM framework to build customizable, production-ready LLM applications with pipelines connecting models, vector DBs, and data processors.

Document converters and preprocessors for handling PDF, DOCX, HTML, images, and other file formats with text extraction and cleaning.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional

from haystack.nodes import DocxToTextConverter, PDFToTextConverter, PreProcessor
from haystack.nodes.file_converter.base import BaseConverter
from haystack.schema import Document
class BaseConverter:
    """Abstract base for converters that turn files into Haystack Documents."""

    def convert(self, file_path: Path, meta: Optional[Dict[str, Any]] = None,
                encoding: Optional[str] = None, **kwargs) -> List[Document]:
        """
        Convert a file to Document objects.

        Args:
            file_path: Path to the file to convert.
            meta: Additional metadata attached to the resulting documents.
            encoding: Text encoding used when reading the file.

        Returns:
            List of Document objects with the extracted content.

        Raises:
            NotImplementedError: Always; format-specific subclasses (PDF,
                DOCX, ...) provide the actual extraction logic.
        """
        # This is an interface stub: the original had a stray import fused onto
        # the docstring close (a syntax error) and no body at all.
        raise NotImplementedError("Subclasses must implement convert()")
class PDFToTextConverter(BaseConverter):
    """Converter that extracts plain text content from PDF files."""

    def __init__(self, remove_numeric_tables: bool = False,
                 valid_languages: Optional[List[str]] = None):
        """
        Initialize the PDF to text converter.

        Args:
            remove_numeric_tables: Remove tables whose content is mostly numeric.
            valid_languages: Languages accepted by language detection;
                None disables the language check.
        """
        # Original stub ended with an import fused onto the docstring close
        # (a syntax error) and never stored its configuration.
        super().__init__()
        self.remove_numeric_tables = remove_numeric_tables
        self.valid_languages = valid_languages
class DocxToTextConverter(BaseConverter):
    """Converter that extracts plain text content from DOCX files."""

    def __init__(self, remove_numeric_tables: bool = False,
                 valid_languages: Optional[List[str]] = None):
        """
        Initialize the DOCX to text converter.

        Args:
            remove_numeric_tables: Remove tables whose content is mostly numeric.
            valid_languages: Languages accepted by language detection;
                None disables the language check.
        """
        # Original stub ended with an import fused onto the docstring close
        # (a syntax error) and never stored its configuration.
        super().__init__()
        self.remove_numeric_tables = remove_numeric_tables
        self.valid_languages = valid_languages
from haystack.nodes.base import BaseComponent
class PreProcessor(BaseComponent):
    """Cleans documents and splits them into passages before indexing/querying."""

    def __init__(self, clean_empty_lines: bool = True,
                 clean_whitespace: bool = True,
                 clean_header_footer: bool = False,
                 split_by: str = "word",
                 split_length: int = 1000,
                 split_overlap: int = 0,
                 split_respect_sentence_boundary: bool = True,
                 language: str = "en"):
        """
        Initialize the document preprocessor.

        Args:
            clean_empty_lines: Remove empty lines.
            clean_whitespace: Normalize whitespace.
            clean_header_footer: Remove repeated headers/footers.
            split_by: Splitting unit ("word", "sentence", "page").
            split_length: Length of each split, in `split_by` units.
            split_overlap: Overlap between consecutive splits.
            split_respect_sentence_boundary: Avoid splitting mid-sentence.
            language: Language used for sentence splitting.
        """
        # Original stub never stored its configuration; keep all settings so
        # process() (and subclasses) can read them.
        super().__init__()
        self.clean_empty_lines = clean_empty_lines
        self.clean_whitespace = clean_whitespace
        self.clean_header_footer = clean_header_footer
        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_respect_sentence_boundary = split_respect_sentence_boundary
        self.language = language

    def process(self, documents: List[Document]) -> List[Document]:
        """
        Process and clean documents.

        Args:
            documents: List of documents to process.

        Returns:
            List of processed Document objects.

        Raises:
            NotImplementedError: Always; this is an interface stub. The
                original had prose ("Install with Tessl CLI") fused onto the
                docstring close, which was a syntax error.
        """
        raise NotImplementedError("process() is an interface stub")
# Install with Tessl CLI: npx tessl i tessl/pypi-farm-haystack