"""
LLM framework to build customizable, production-ready LLM applications with pipelines connecting models, vector DBs, and data processors.

Document converters and preprocessors for handling PDF, DOCX, HTML, images, and other file formats with text extraction and cleaning.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional

from haystack.nodes import DocxToTextConverter, PDFToTextConverter, PreProcessor
from haystack.nodes.file_converter.base import BaseConverter
from haystack.schema import Document
class BaseConverter:
    """Abstract base for converters that turn files into Haystack Documents."""

    def convert(self, file_path: Path, meta: Optional[Dict[str, Any]] = None,
                encoding: Optional[str] = None, **kwargs) -> List[Document]:
        """
        Convert a file to Document objects.

        Args:
            file_path: Path to the file to convert.
            meta: Additional metadata attached to the resulting documents.
            encoding: Text encoding used when reading the file.

        Returns:
            List of Document objects with the extracted content.

        Raises:
            NotImplementedError: Always; format-specific subclasses (PDF,
                DOCX, ...) provide the actual extraction logic.
        """
        # This is an interface stub: the original had a stray import fused onto
        # the docstring close (a syntax error) and no body at all.
        raise NotImplementedError("Subclasses must implement convert()")
class PDFToTextConverter(BaseConverter):
    """Converter that extracts plain text content from PDF files."""

    def __init__(self, remove_numeric_tables: bool = False,
                 valid_languages: Optional[List[str]] = None):
        """
        Initialize the PDF to text converter.

        Args:
            remove_numeric_tables: Remove tables whose content is mostly numeric.
            valid_languages: Languages accepted by language detection;
                None disables the language check.
        """
        # Original stub ended with an import fused onto the docstring close
        # (a syntax error) and never stored its configuration.
        super().__init__()
        self.remove_numeric_tables = remove_numeric_tables
        self.valid_languages = valid_languages
class DocxToTextConverter(BaseConverter):
    """Converter that extracts plain text content from DOCX files."""

    def __init__(self, remove_numeric_tables: bool = False,
                 valid_languages: Optional[List[str]] = None):
        """
        Initialize the DOCX to text converter.

        Args:
            remove_numeric_tables: Remove tables whose content is mostly numeric.
            valid_languages: Languages accepted by language detection;
                None disables the language check.
        """
        # Original stub ended with an import fused onto the docstring close
        # (a syntax error) and never stored its configuration.
        super().__init__()
        self.remove_numeric_tables = remove_numeric_tables
        self.valid_languages = valid_languages
from haystack.nodes.base import BaseComponent
class PreProcessor(BaseComponent):
    """Cleans documents and splits them into passages before indexing/querying."""

    def __init__(self, clean_empty_lines: bool = True,
                 clean_whitespace: bool = True,
                 clean_header_footer: bool = False,
                 split_by: str = "word",
                 split_length: int = 1000,
                 split_overlap: int = 0,
                 split_respect_sentence_boundary: bool = True,
                 language: str = "en"):
        """
        Initialize the document preprocessor.

        Args:
            clean_empty_lines: Remove empty lines.
            clean_whitespace: Normalize whitespace.
            clean_header_footer: Remove repeated headers/footers.
            split_by: Splitting unit ("word", "sentence", "page").
            split_length: Length of each split, in `split_by` units.
            split_overlap: Overlap between consecutive splits.
            split_respect_sentence_boundary: Avoid splitting mid-sentence.
            language: Language used for sentence splitting.
        """
        # Original stub never stored its configuration; keep all settings so
        # process() (and subclasses) can read them.
        super().__init__()
        self.clean_empty_lines = clean_empty_lines
        self.clean_whitespace = clean_whitespace
        self.clean_header_footer = clean_header_footer
        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_respect_sentence_boundary = split_respect_sentence_boundary
        self.language = language

    def process(self, documents: List[Document]) -> List[Document]:
        """
        Process and clean documents.

        Args:
            documents: List of documents to process.

        Returns:
            List of processed Document objects.

        Raises:
            NotImplementedError: Always; this is an interface stub. The
                original had prose ("Install with Tessl CLI") fused onto the
                docstring close, which was a syntax error.
        """
        raise NotImplementedError("process() is an interface stub")
# Install with Tessl CLI: npx tessl i tessl/pypi-farm-haystack