Haystack: an LLM framework for building customizable, production-ready LLM applications.

This module converts various file formats to Haystack Document objects and preprocesses text for optimal retrieval. It supports PDF, HTML, Office documents, images, and common text preprocessing operations.

PDF converters: extract text and content from PDF files using different parsing backends.
class PyPDFToDocument:
    """Convert PDF files to Haystack Document objects using the pypdf backend."""

    def __init__(
        self,
        converter_name: str = "PyPDFToDocument",
        extract_images: bool = False,
    ) -> None:
        """
        Initialize the PyPDF document converter.

        Args:
            converter_name: Name identifier for the converter.
            extract_images: Whether to extract images from PDFs.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert PDF files to Document objects.

        Args:
            sources: List of file paths, Path objects, or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class PDFMinerToDocument:
    """Convert PDF files to Haystack Document objects using the PDFMiner backend."""

    def __init__(
        self,
        extract_images: bool = False,
        laparams: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Initialize the PDFMiner document converter.

        Args:
            extract_images: Whether to extract images from PDFs.
            laparams: LAParams configuration for PDFMiner.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert PDF files using the PDFMiner backend.

        Args:
            sources: List of file paths, Path objects, or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Extract content from Microsoft Office documents and other office formats.
class DOCXToDocument:
    """Convert Microsoft Word (DOCX) files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the DOCX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert DOCX files to Document objects.

        Args:
            sources: List of DOCX file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class PPTXToDocument:
    """Convert PowerPoint (PPTX) files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the PPTX document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert PowerPoint files to Document objects.

        Args:
            sources: List of PPTX file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class XLSXToDocument:
    """Convert Excel (XLSX) files to Haystack Document objects."""

    def __init__(
        self,
        table_format: Literal["csv", "table"] = "csv",
    ) -> None:
        """
        Initialize the XLSX document converter.

        Args:
            table_format: Format for table conversion ('csv' or 'table').
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert Excel files to Document objects.

        Args:
            sources: List of XLSX file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class MSGToDocument:
    """Convert Outlook message (MSG) files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the MSG (Outlook message) document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert MSG files to Document objects.

        Args:
            sources: List of MSG file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Extract and convert web content and markup formats.
class HTMLToDocument:
    """Convert HTML content to Haystack Document objects."""

    def __init__(
        self,
        extractor_type: Literal["trafilatura", "default"] = "trafilatura",
        extraction_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Initialize the HTML document converter.

        Args:
            extractor_type: HTML extraction backend to use.
            extraction_kwargs: Additional extraction parameters.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert HTML files to Document objects.

        Args:
            sources: List of HTML file paths, URLs, or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class MarkdownToDocument:
    """Convert Markdown files to Haystack Document objects."""

    def __init__(self) -> None:
        """Initialize the Markdown document converter."""

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert Markdown files to Document objects.

        Args:
            sources: List of Markdown file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Handle plain text files and structured data formats.
class TextFileToDocument:
    """Convert plain text files to Haystack Document objects."""

    def __init__(
        self,
        encoding: str = "utf-8",
    ) -> None:
        """
        Initialize the text file converter.

        Args:
            encoding: Character encoding for text files.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert text files to Document objects.

        Args:
            sources: List of text file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class CSVToDocument:
    """Convert CSV files to Haystack Document objects."""

    def __init__(
        self,
        delimiter: str = ",",
        quotechar: str = '"',
        encoding: str = "utf-8",
    ) -> None:
        """
        Initialize the CSV document converter.

        Args:
            delimiter: CSV field delimiter.
            quotechar: CSV quote character.
            encoding: Character encoding for CSV files.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert CSV files to Document objects.

        Args:
            sources: List of CSV file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """
class JSONConverter:
    """Convert JSON files to Haystack Document objects using jq-style extraction."""

    def __init__(
        self,
        jq_schema: str = ".",
        content_key: Optional[str] = None,
        extra_meta_fields: Optional[List[str]] = None,
    ) -> None:
        """
        Initialize the JSON converter.

        Args:
            jq_schema: JQ query string for data extraction.
            content_key: JSON key containing document content.
            extra_meta_fields: Additional fields to extract as metadata.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert JSON files to Document objects.

        Args:
            sources: List of JSON file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Handle multiple file formats with automatic format detection.
class MultiFileConverter:
    """Dispatch files to format-specific converters based on file extension."""

    def __init__(
        self,
        # Annotation fixed: the default is None, so the type must be Optional.
        file_converters: Optional[Dict[str, Any]] = None,
        fallback_converter: Optional[Any] = None,
    ) -> None:
        """
        Initialize the multi-format file converter.

        Args:
            file_converters: Dictionary mapping file extensions to converter instances.
            fallback_converter: Default converter for unrecognized file types.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert files using appropriate converters based on file type.

        Args:
            sources: List of file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Extract text from images and scanned documents using OCR.
class AzureOCRDocumentConverter:
    """Convert images and scanned documents to Document objects via Azure OCR."""

    def __init__(
        self,
        endpoint: str,
        api_key: Secret,
        model_id: str = "prebuilt-read",
    ) -> None:
        """
        Initialize the Azure OCR document converter.

        Args:
            endpoint: Azure Form Recognizer endpoint.
            api_key: Azure Form Recognizer API key.
            model_id: OCR model to use.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert images and scanned documents using Azure OCR.

        Args:
            sources: List of image file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing OCR-extracted text.
        """
class TikaDocumentConverter:
    """Convert many file formats to Document objects using an Apache Tika server."""

    def __init__(
        self,
        tika_url: str = "http://localhost:9998/tika",
    ) -> None:
        """
        Initialize the Apache Tika document converter.

        Args:
            tika_url: URL of the Tika server.
        """

    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Convert various file formats using Apache Tika.

        Args:
            sources: List of file paths or ByteStream objects.
            meta: Optional metadata to attach to the documents.

        Returns:
            Dictionary with a 'documents' key containing the converted documents.
        """


# Split documents into smaller chunks and clean text for better retrieval performance.
class DocumentSplitter:
    """Split documents into smaller chunks by word, sentence, passage, or page."""

    def __init__(
        self,
        split_by: Literal["word", "sentence", "passage", "page"] = "word",
        split_length: int = 200,
        split_overlap: int = 0,
    ) -> None:
        """
        Initialize the document splitter.

        Args:
            split_by: Unit to split by.
            split_length: Length of each split.
            split_overlap: Overlap between consecutive splits.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents into smaller chunks.

        Args:
            documents: List of documents to split.

        Returns:
            Dictionary with a 'documents' key containing the split documents.
        """
class RecursiveDocumentSplitter:
    """Split documents by trying a sequence of separators recursively."""

    def __init__(
        self,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        respect_sentence_boundary: bool = False,
        language: str = "en",
    ) -> None:
        """
        Initialize the recursive document splitter.

        Args:
            chunk_size: Target size for each chunk.
            chunk_overlap: Overlap between chunks.
            separators: List of separators to try in order.
            keep_separator: Whether to keep separators in chunks.
            respect_sentence_boundary: Whether to respect sentence boundaries.
            language: Language for sentence boundary detection.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents recursively using multiple separators.

        Args:
            documents: List of documents to split.

        Returns:
            Dictionary with a 'documents' key containing the split documents.
        """
class HierarchicalDocumentSplitter:
    """Split documents into a hierarchy of chunks at multiple granularity levels."""

    def __init__(
        self,
        # Annotation fixed: the default is None, so the type must be Optional.
        chunk_sizes: Optional[List[int]] = None,
        chunk_overlap: int = 0,
        separators: Optional[Dict[int, List[str]]] = None,
    ) -> None:
        """
        Initialize the hierarchical document splitter.

        Args:
            chunk_sizes: List of chunk sizes for different hierarchy levels.
            chunk_overlap: Overlap between chunks.
            separators: Separators for each hierarchy level.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Split documents hierarchically at multiple levels.

        Args:
            documents: List of documents to split.

        Returns:
            Dictionary with a 'documents' key containing the split documents.
        """
class DocumentCleaner:
    """Clean and normalize the content of Document objects."""

    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
    ) -> None:
        """
        Initialize the document cleaner.

        Args:
            remove_extra_whitespaces: Remove extra whitespace characters.
            remove_repeated_substrings: Remove repeated substrings.
            remove_substrings: Specific substrings to remove.
            remove_regex: Regex pattern for content removal.
            unicode_normalization: Unicode normalization form.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Clean and normalize document content.

        Args:
            documents: List of documents to clean.

        Returns:
            Dictionary with a 'documents' key containing the cleaned documents.
        """
class TextCleaner:
    """Clean and normalize a plain text string (string-level analog of DocumentCleaner)."""

    def __init__(
        self,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        remove_substrings: Optional[List[str]] = None,
        remove_regex: Optional[str] = None,
        unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
    ) -> None:
        """
        Initialize the text cleaner.

        Takes the same parameters as DocumentCleaner but operates on raw strings.

        Args:
            remove_extra_whitespaces: Remove extra whitespace characters.
            remove_repeated_substrings: Remove repeated substrings.
            remove_substrings: Specific substrings to remove.
            remove_regex: Regex pattern for content removal.
            unicode_normalization: Unicode normalization form.
        """

    def run(self, text: str) -> Dict[str, str]:
        """
        Clean and normalize text content.

        Args:
            text: Input text to clean.

        Returns:
            Dictionary with a 'text' key containing the cleaned text.
        """


from haystack.components.converters import PyPDFToDocument
from pathlib import Path

# Example: convert PDF files with PyPDFToDocument.
converter = PyPDFToDocument()

# Convert PDF files.
pdf_files = ["document1.pdf", "document2.pdf"]
result = converter.run(sources=pdf_files)
documents = result["documents"]

for doc in documents:
    print(f"Content: {doc.content[:100]}...")
    print(f"Metadata: {doc.meta}")
    print()

from haystack import Pipeline
from haystack.components.converters import MultiFileConverter, PyPDFToDocument, HTMLToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter

# Example: route mixed file types through a conversion + splitting pipeline.
# Set up converters for different file types.
file_converters = {
    ".pdf": PyPDFToDocument(),
    ".html": HTMLToDocument(),
    ".txt": TextFileToDocument(),
}

# Create pipeline and connect components.
pipeline = Pipeline()
pipeline.add_component("converter", MultiFileConverter(file_converters=file_converters))
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=3))
pipeline.connect("converter.documents", "splitter.documents")

# Process mixed file types.
mixed_files = ["report.pdf", "webpage.html", "notes.txt"]
result = pipeline.run({"converter": {"sources": mixed_files}})
split_documents = result["splitter"]["documents"]
print(f"Processed {len(split_documents)} document chunks")

from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentCleaner, RecursiveDocumentSplitter

# Example: clean and recursively split HTML content.
pipeline = Pipeline()
pipeline.add_component("converter", HTMLToDocument())
pipeline.add_component("cleaner", DocumentCleaner(
    remove_extra_whitespaces=True,
    remove_repeated_substrings=True,
    remove_regex=r'\[.*?\]',  # Remove content in brackets
))
pipeline.add_component("splitter", RecursiveDocumentSplitter(
    chunk_size=300,
    chunk_overlap=50,
    respect_sentence_boundary=True,
))

# Connect components.
pipeline.connect("converter.documents", "cleaner.documents")
pipeline.connect("cleaner.documents", "splitter.documents")

# Process HTML content.
html_files = ["article.html", "blog_post.html"]
result = pipeline.run({"converter": {"sources": html_files}})
processed_docs = result["splitter"]["documents"]
for doc in processed_docs[:3]:  # Show first 3 chunks
    print(f"Chunk: {doc.content}")
    print(f"Length: {len(doc.content)}")
    print("---")

from haystack.components.converters import CSVToDocument
# Example: process CSV with custom parameters.
csv_converter = CSVToDocument(
    delimiter=";",
    encoding="utf-8",
)

# Convert CSV files.
result = csv_converter.run(sources=["data.csv"])
documents = result["documents"]

# Each row becomes a document.
for doc in documents[:3]:
    print(f"Row data: {doc.content}")
    print(f"Metadata: {doc.meta}")
    print()

from typing import Union, List, Dict, Any, Optional, Literal
from pathlib import Path
from haystack import Document
from haystack.dataclasses import ByteStream
from haystack.utils import Secret
class Span:
    """A span of text delimited by start and end character offsets."""

    start: int  # start offset
    end: int    # end offset (presumably exclusive — confirm against usage)


# Install with Tessl CLI
# npx tessl i tessl/pypi-haystack-ai