Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
—
Advanced text extraction capabilities with layout-aware algorithms, word detection, text search, character-level analysis, and comprehensive text processing options.
Primary text extraction method that preserves document layout and formatting using sophisticated algorithms.
def extract_text(x_tolerance=3, y_tolerance=3, layout=False,
x_density=7.25, y_density=13, **kwargs):
"""
Extract text using layout-aware algorithm.
Parameters:
- x_tolerance: int or float - Horizontal tolerance for grouping characters
- y_tolerance: int or float - Vertical tolerance for grouping characters
- layout: bool - Preserve layout with whitespace and positioning
- x_density: float - Horizontal character density for layout
- y_density: float - Vertical character density for layout
- **kwargs: Additional text processing options
Returns:
str: Extracted text with layout preservation
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Basic text extraction
text = page.extract_text()
print(text)
# Layout-preserving extraction
formatted_text = page.extract_text(layout=True)
print(formatted_text)
# Fine-tuned character grouping
precise_text = page.extract_text(x_tolerance=1, y_tolerance=1)
print(precise_text)
# Custom density for layout reconstruction
spaced_text = page.extract_text(layout=True, x_density=10, y_density=15)
print(spaced_text)

Streamlined text extraction without complex layout analysis for performance-critical applications.
def extract_text_simple(**kwargs):
"""
Extract text using simple algorithm.
Parameters:
- **kwargs: Text processing options
Returns:
str: Extracted text without layout preservation
"""Extract words as objects with detailed position and formatting information.
def extract_words(x_tolerance=3, y_tolerance=3, keep_blank_chars=False,
use_text_flow=False, horizontal_ltr=True, vertical_ttb=True,
extra_attrs=None, split_at_punctuation=False, **kwargs):
"""
Extract words as objects with position data.
Parameters:
- x_tolerance: int or float - Horizontal tolerance for word boundaries
- y_tolerance: int or float - Vertical tolerance for word boundaries
- keep_blank_chars: bool - Include blank character objects
- use_text_flow: bool - Use text flow direction for word detection
- horizontal_ltr: bool - Left-to-right reading order for horizontal text
- vertical_ttb: bool - Top-to-bottom reading order for vertical text
- extra_attrs: List[str] - Additional attributes to include in word objects
- split_at_punctuation: bool - Split words at punctuation marks
- **kwargs: Additional word processing options
Returns:
List[Dict[str, Any]]: List of word objects with position and formatting
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Extract words with position data
words = page.extract_words()
for word in words:
print(f"'{word['text']}' at ({word['x0']}, {word['top']})")
# Extract words with custom tolerances
tight_words = page.extract_words(x_tolerance=1, y_tolerance=1)
# Include font information
detailed_words = page.extract_words(extra_attrs=['fontname', 'size'])
for word in detailed_words:
print(f"'{word['text']}' - Font: {word.get('fontname', 'Unknown')} Size: {word.get('size', 'Unknown')}")

Extract text organized by lines with character-level details and line-level formatting.
def extract_text_lines(strip=True, return_chars=True, **kwargs):
"""
Extract text lines with character details.
Parameters:
- strip: bool - Strip whitespace from line text
- return_chars: bool - Include character objects in line data
- **kwargs: Additional line processing options
Returns:
List[Dict[str, Any]]: List of line objects with text and character data
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Extract text lines
lines = page.extract_text_lines()
for line in lines:
print(f"Line: '{line['text']}' at y={line['top']}")
print(f" Contains {len(line.get('chars', []))} characters")
# Extract lines without character details
simple_lines = page.extract_text_lines(return_chars=False)
for line in simple_lines:
print(line['text'])

Advanced text search with regex support, case sensitivity options, and detailed match information.
def search(pattern, regex=True, case=True, main_group=0,
return_chars=True, return_groups=True, **kwargs):
"""
Search for text patterns with regex support.
Parameters:
- pattern: str - Search pattern (literal text or regex)
- regex: bool - Treat pattern as regular expression
- case: bool - Case-sensitive search
- main_group: int - Primary regex group for match extraction
- return_chars: bool - Include character objects in matches
- return_groups: bool - Include regex group information
- **kwargs: Additional search options
Returns:
List[Dict[str, Any]]: List of match objects with position and text data
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Simple text search
matches = page.search("invoice")
for match in matches:
print(f"Found '{match['text']}' at ({match['x0']}, {match['top']})")
# Regex search with groups
email_matches = page.search(r'(\w+)@(\w+\.\w+)', regex=True)
for match in email_matches:
print(f"Email: {match['text']}")
print(f"Groups: {match.get('groups', [])}")
# Case-insensitive search
ci_matches = page.search("TOTAL", case=False)
# Search with character details
detailed_matches = page.search("amount", return_chars=True)
for match in detailed_matches:
chars = match.get('chars', [])
print(f"Match uses {len(chars)} characters")Low-level character processing and deduplication functions.
def dedupe_chars(tolerance=1, use_text_flow=False, **kwargs):
"""
Remove duplicate characters.
Parameters:
- tolerance: int or float - Distance tolerance for duplicate detection
- use_text_flow: bool - Consider text flow in deduplication
- **kwargs: Additional deduplication options
Returns:
Page: New page object with deduplicated characters
"""Standalone text processing functions available in the utils module.
# From pdfplumber.utils
def extract_text(chars, **kwargs):
"""Extract text from character objects."""
def extract_text_simple(chars, **kwargs):
"""Simple text extraction from characters."""
def extract_words(chars, **kwargs):
"""Extract words from character objects."""
def dedupe_chars(chars, tolerance=1, **kwargs):
"""Remove duplicate characters from list."""
def chars_to_textmap(chars, **kwargs):
"""Convert characters to TextMap object."""
def collate_line(chars, **kwargs):
"""Collate characters into text line."""Text Processing Constants:
# Default tolerance values
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

Advanced text mapping object for character-level text analysis.
class TextMap:
"""Character-level text mapping with position data."""
def __init__(self, chars, **kwargs):
"""Initialize TextMap from character objects."""
def as_list(self):
"""Convert to list representation."""
def as_string(self):
"""Convert to string representation."""Usage Examples:
from pdfplumber.utils import chars_to_textmap
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Create TextMap from page characters
textmap = chars_to_textmap(page.chars)
# Convert to different representations
text_list = textmap.as_list()
text_string = textmap.as_string()
print(f"TextMap contains {len(text_list)} text elements")
print(f"Combined text: {text_string}")Install with Tessl CLI
npx tessl i tessl/pypi-pdfplumber