CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/maven-org-apache-pdfbox--pdfbox

The Apache PDFBox library is an open source Java tool for working with PDF documents.

Pending
Overview
Eval results
Files

docs/text-operations.md

Text Operations

Comprehensive text extraction capabilities for PDF documents, including whole-document text extraction, area-based extraction, and advanced text formatting options.

Text Extraction

Extract text from entire documents or specific page ranges.

// Constructor in org.apache.pdfbox.text.PDFTextStripper (signature summary only, no bodies shown).
// Declared here without a throws clause.
// NOTE(review): PDFBox 2.x declared this constructor with `throws IOException` — confirm the targeted version.
public PDFTextStripper();

// Main extraction method: takes a loaded PDDocument and returns the extracted text as a String.
// Declares IOException, so callers must handle or propagate it.
public String getText(PDDocument document) throws IOException;

// Page range configuration: getter/setter pairs for the first and last page covered by getText.
// NOTE(review): page numbers are presumably 1-based with an inclusive end page — confirm against the Javadoc.
public void setStartPage(int startPage);
public int getStartPage();
public void setEndPage(int endPage);
public int getEndPage();

Text Formatting Options

Control text extraction formatting and layout preservation.

// Formatting configuration methods in PDFTextStripper.
// Each option is exposed as a setter/getter pair; call the setters before getText.

// Boolean switch for position-based ordering of the extracted text.
// NOTE(review): presumably sorts by on-page position instead of content-stream order — confirm.
public void setSortByPosition(boolean sortByPosition);
public boolean getSortByPosition();

// String used as the separator between lines of output.
public void setLineSeparator(String separator);
public String getLineSeparator();

// String used as the separator between words of output.
public void setWordSeparator(String separator);
public String getWordSeparator();

// Boolean switch for additional output formatting.
// NOTE(review): the exact extra formatting applied is not shown here — see the PDFTextStripper Javadoc.
public void setAddMoreFormatting(boolean addMoreFormatting);
public boolean getAddMoreFormatting();

// Boolean switch for suppressing duplicate text that overlaps other text.
public void setSuppressDuplicateOverlappingText(boolean suppress);
public boolean getSuppressDuplicateOverlappingText();

// Float tolerance related to spacing.
// NOTE(review): presumably tunes where word spaces are inserted, based on space width — confirm.
public void setSpacingTolerance(float spacingTolerance);
public float getSpacingTolerance();

// Float tolerance related to average character width.
// NOTE(review): presumably a second heuristic for word-space insertion — confirm.
public void setAverageCharTolerance(float averageCharTolerance);
public float getAverageCharTolerance();

Area-Based Text Extraction

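Extract text from specific rectangular regions of PDF pages. Region rectangles are matched against the text coordinates reported by TextPosition.getX()/getY(), which use a top-left origin with y increasing downward, not the bottom-left origin of PDF user space.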

// Constructor in org.apache.pdfbox.text.PDFTextStripperByArea (signature summary only, no bodies shown).
// Declares IOException, so construction must be wrapped or propagated.
public PDFTextStripperByArea() throws IOException;

// Region management: regions are identified by a caller-chosen name and described by a Rectangle2D.
// NOTE(review): rectangle coordinates presumably use a top-left origin (y grows downward) — confirm.
public void addRegion(String regionName, Rectangle2D rect);
public void removeRegion(String regionName);
// Returns the region names as a List<String>.
public List<String> getRegions();

// Text extraction from regions: call extractRegions for a single PDPage first,
// then read each region's text by name with getTextForRegion.
public void extractRegions(PDPage page) throws IOException;
public String getTextForRegion(String regionName);

Advanced Text Extraction

Customize text extraction behavior through method overrides.

// Protected methods in PDFTextStripper for customization.
// Subclasses override these hooks; every hook except processTextPosition declares IOException.

// Receives a single TextPosition.
// NOTE(review): the base implementation presumably collects positions for later output,
// so overrides should normally call super.processTextPosition(text) — confirm.
protected void processTextPosition(TextPosition text);
// Writes a String to the output.
protected void writeString(String text) throws IOException;
// Writes the characters of one TextPosition to the output.
protected void writeCharacters(TextPosition text) throws IOException;
// Write the line / word separator (see setLineSeparator / setWordSeparator).
protected void writeLineSeparator() throws IOException;
protected void writeWordSeparator() throws IOException;
// Page lifecycle hooks, each receiving the PDPage concerned.
// NOTE(review): presumably invoked before and after each page is processed — confirm.
protected void startPage(PDPage page) throws IOException;
protected void endPage(PDPage page) throws IOException;

Text Position Information

Access detailed text positioning and formatting information.

// Methods in org.apache.pdfbox.text.TextPosition
// Read-only accessors describing one positioned piece of text.

// The text content as a String.
public String getUnicode();
// Position of the text.
// NOTE(review): presumably measured from the top-left of the page with y growing downward — confirm.
public float getX();
public float getY();
// Dimensions of the text.
public float getWidth();
public float getHeight();
// Width of a space character for this text.
public float getWidthOfSpace();
// Font size and font (PDFont) associated with the text.
public float getFontSize();
public PDFont getFont();
// The text matrix as a Matrix.
public Matrix getTextMatrix();
// Direction (float) and rotation (int) of the text.
// NOTE(review): units are presumably degrees — confirm.
public float getDir();
public int getRotation();

Usage Examples

Basic Text Extraction

// Basic extraction: load a PDF, extract all text, then extract a page range.
// try-with-resources closes the document even when extraction throws,
// which a trailing document.close() call would not do.
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    PDFTextStripper stripper = new PDFTextStripper();

    // Extract all text
    String text = stripper.getText(document);
    System.out.println(text);

    // Extract text from specific pages
    stripper.setStartPage(2);
    stripper.setEndPage(4);
    String pageRangeText = stripper.getText(document);
}

Area-Based Text Extraction

// Area-based extraction: register named rectangles, run the stripper on one page,
// then read the text captured for each region.
// try-with-resources closes the document even when extraction throws.
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    PDPage page = document.getPage(0);

    PDFTextStripperByArea stripper = new PDFTextStripperByArea();

    // Define regions to extract text from.
    // Regions are matched against TextPosition.getX()/getY(), which are measured from the
    // TOP-left corner with y increasing downward, so the header band needs a small y value.
    Rectangle2D headerRegion = new Rectangle2D.Float(50, 50, 500, 50);
    Rectangle2D contentRegion = new Rectangle2D.Float(50, 100, 500, 600);

    stripper.addRegion("header", headerRegion);
    stripper.addRegion("content", contentRegion);

    // Extract text from regions
    stripper.extractRegions(page);

    String headerText = stripper.getTextForRegion("header");
    String contentText = stripper.getTextForRegion("content");

    System.out.println("Header: " + headerText);
    System.out.println("Content: " + contentText);
}

Custom Text Processing

/**
 * Example stripper that records a per-glyph trace while still producing the
 * regular (upper-cased) text output of {@link PDFTextStripper}.
 */
public class CustomTextStripper extends PDFTextStripper {
    /** Side channel filled in by {@link #processTextPosition(TextPosition)}. */
    private final StringBuilder customOutput = new StringBuilder();

    /**
     * Creates the stripper with the default settings of the base class.
     *
     * @throws IOException kept in the signature for callers that already handle it
     */
    public CustomTextStripper() throws IOException {
        super();
    }

    /**
     * Records each glyph in the custom trace, tagging glyphs whose font size exceeds 12,
     * and then hands the glyph to the base class.
     *
     * @param text the glyph currently being processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        // Custom processing logic
        if (text.getFontSize() > 12) {
            customOutput.append("[LARGE] ");
        }
        customOutput.append(text.getUnicode());

        // The base implementation collects the glyphs that getText() later writes out.
        // Skipping it leaves getText() without any text and writeString() is never invoked.
        super.processTextPosition(text);
    }

    /**
     * Writes the given text upper-cased.
     *
     * @param text the text about to be written to the output
     * @throws IOException if the underlying writer fails
     */
    @Override
    protected void writeString(String text) throws IOException {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. the Turkish dotted/dotless i).
        super.writeString(text.toUpperCase(java.util.Locale.ROOT));
    }

    /**
     * Returns the trace collected so far.
     *
     * @return the accumulated custom output
     */
    public String getCustomOutput() {
        return customOutput.toString();
    }
}

// Usage: run the custom stripper, then read both the regular output and the custom trace.
// try-with-resources closes the document even when extraction throws.
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    CustomTextStripper stripper = new CustomTextStripper();
    String result = stripper.getText(document);
    String customResult = stripper.getCustomOutput();
}

Text Formatting Control

// Formatting control: configure the stripper before calling getText.
// try-with-resources closes the document even when extraction throws.
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    PDFTextStripper stripper = new PDFTextStripper();

    // Configure text extraction options
    stripper.setSortByPosition(true);
    stripper.setLineSeparator(System.lineSeparator());
    stripper.setWordSeparator(" ");
    stripper.setAddMoreFormatting(true);
    stripper.setSuppressDuplicateOverlappingText(true);

    // Fine-tune spacing tolerances
    stripper.setSpacingTolerance(0.5f);
    stripper.setAverageCharTolerance(0.3f);

    String formattedText = stripper.getText(document);
}

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-pdfbox--pdfbox

docs

content-stream-processing.md

cos-operations.md

document-operations.md

index.md

interactive-forms.md

multi-pdf-operations.md

rendering-graphics.md

security-encryption.md

text-operations.md

tile.json