Detailed PDF analysis and extraction library with comprehensive table detection and visual debugging capabilities.
—
Advanced text extraction capabilities with layout-aware algorithms, word detection, text search, character-level analysis, and comprehensive text processing options.
Primary text extraction method that preserves document layout and formatting using sophisticated algorithms.
def extract_text(x_tolerance=3, y_tolerance=3, layout=False,
x_density=7.25, y_density=13, **kwargs):
"""
Extract text using layout-aware algorithm.
Parameters:
- x_tolerance: int or float - Horizontal tolerance for grouping characters
- y_tolerance: int or float - Vertical tolerance for grouping characters
- layout: bool - Preserve layout with whitespace and positioning
- x_density: float - Horizontal character density for layout
- y_density: float - Vertical character density for layout
- **kwargs: Additional text processing options
Returns:
str: Extracted text with layout preservation
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Basic text extraction
text = page.extract_text()
print(text)
# Layout-preserving extraction
formatted_text = page.extract_text(layout=True)
print(formatted_text)
# Fine-tuned character grouping
precise_text = page.extract_text(x_tolerance=1, y_tolerance=1)
print(precise_text)
# Custom density for layout reconstruction
spaced_text = page.extract_text(layout=True, x_density=10, y_density=15)
print(spaced_text)

Streamlined text extraction without complex layout analysis for performance-critical applications.
def extract_text_simple(**kwargs):
"""
Extract text using simple algorithm.
Parameters:
- **kwargs: Text processing options
Returns:
str: Extracted text without layout preservation
"""Extract words as objects with detailed position and formatting information.
def extract_words(x_tolerance=3, y_tolerance=3, keep_blank_chars=False,
use_text_flow=False, horizontal_ltr=True, vertical_ttb=True,
extra_attrs=None, split_at_punctuation=False, **kwargs):
"""
Extract words as objects with position data.
Parameters:
- x_tolerance: int or float - Horizontal tolerance for word boundaries
- y_tolerance: int or float - Vertical tolerance for word boundaries
- keep_blank_chars: bool - Include blank character objects
- use_text_flow: bool - Use text flow direction for word detection
- horizontal_ltr: bool - Left-to-right reading order for horizontal text
- vertical_ttb: bool - Top-to-bottom reading order for vertical text
- extra_attrs: List[str] - Additional attributes to include in word objects
- split_at_punctuation: bool - Split words at punctuation marks
- **kwargs: Additional word processing options
Returns:
List[Dict[str, Any]]: List of word objects with position and formatting
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Extract words with position data
words = page.extract_words()
for word in words:
print(f"'{word['text']}' at ({word['x0']}, {word['top']})")
# Extract words with custom tolerances
tight_words = page.extract_words(x_tolerance=1, y_tolerance=1)
# Include font information
detailed_words = page.extract_words(extra_attrs=['fontname', 'size'])
for word in detailed_words:
print(f"'{word['text']}' - Font: {word.get('fontname', 'Unknown')} Size: {word.get('size', 'Unknown')}")

Extract text organized by lines with character-level details and line-level formatting.
def extract_text_lines(strip=True, return_chars=True, **kwargs):
"""
Extract text lines with character details.
Parameters:
- strip: bool - Strip whitespace from line text
- return_chars: bool - Include character objects in line data
- **kwargs: Additional line processing options
Returns:
List[Dict[str, Any]]: List of line objects with text and character data
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Extract text lines
lines = page.extract_text_lines()
for line in lines:
print(f"Line: '{line['text']}' at y={line['top']}")
print(f" Contains {len(line.get('chars', []))} characters")
# Extract lines without character details
simple_lines = page.extract_text_lines(return_chars=False)
for line in simple_lines:
print(line['text'])

Advanced text search with regex support, case sensitivity options, and detailed match information.
def search(pattern, regex=True, case=True, main_group=0,
return_chars=True, return_groups=True, **kwargs):
"""
Search for text patterns with regex support.
Parameters:
- pattern: str - Search pattern (literal text or regex)
- regex: bool - Treat pattern as regular expression
- case: bool - Case-sensitive search
- main_group: int - Primary regex group for match extraction
- return_chars: bool - Include character objects in matches
- return_groups: bool - Include regex group information
- **kwargs: Additional search options
Returns:
List[Dict[str, Any]]: List of match objects with position and text data
"""Usage Examples:
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Simple text search
matches = page.search("invoice")
for match in matches:
print(f"Found '{match['text']}' at ({match['x0']}, {match['top']})")
# Regex search with groups
email_matches = page.search(r'(\w+)@(\w+\.\w+)', regex=True)
for match in email_matches:
print(f"Email: {match['text']}")
print(f"Groups: {match.get('groups', [])}")
# Case-insensitive search
ci_matches = page.search("TOTAL", case=False)
# Search with character details
detailed_matches = page.search("amount", return_chars=True)
for match in detailed_matches:
chars = match.get('chars', [])
print(f"Match uses {len(chars)} characters")Low-level character processing and deduplication functions.
def dedupe_chars(tolerance=1, use_text_flow=False, **kwargs):
"""
Remove duplicate characters.
Parameters:
- tolerance: int or float - Distance tolerance for duplicate detection
- use_text_flow: bool - Consider text flow in deduplication
- **kwargs: Additional deduplication options
Returns:
Page: New page object with deduplicated characters
"""Standalone text processing functions available in the utils module.
# From pdfplumber.utils
def extract_text(chars, **kwargs):
"""Extract text from character objects."""
def extract_text_simple(chars, **kwargs):
"""Simple text extraction from characters."""
def extract_words(chars, **kwargs):
"""Extract words from character objects."""
def dedupe_chars(chars, tolerance=1, **kwargs):
"""Remove duplicate characters from list."""
def chars_to_textmap(chars, **kwargs):
"""Convert characters to TextMap object."""
def collate_line(chars, **kwargs):
"""Collate characters into text line."""Text Processing Constants:
# Default tolerance values
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3
DEFAULT_X_DENSITY = 7.25
DEFAULT_Y_DENSITY = 13

Advanced text mapping object for character-level text analysis.
class TextMap:
"""Character-level text mapping with position data."""
def __init__(self, chars, **kwargs):
"""Initialize TextMap from character objects."""
def as_list(self):
"""Convert to list representation."""
def as_string(self):
"""Convert to string representation."""Usage Examples:
from pdfplumber.utils import chars_to_textmap
with pdfplumber.open("document.pdf") as pdf:
page = pdf.pages[0]
# Create TextMap from page characters
textmap = chars_to_textmap(page.chars)
# Convert to different representations
text_list = textmap.as_list()
text_string = textmap.as_string()
print(f"TextMap contains {len(text_list)} text elements")
print(f"Combined text: {text_string}")Install with Tessl CLI
npx tessl i tessl/pypi-pdfplumber