The Apache PDFBox library is an open source Java tool for working with PDF documents.
—
Comprehensive text extraction capabilities for PDF documents, including whole-document text extraction, area-based extraction, and advanced text formatting options.
Extract text from entire documents or specific page ranges.
// Constructor in org.apache.pdfbox.text.PDFTextStripper
public PDFTextStripper();
// Main extraction method
public String getText(PDDocument document) throws IOException;
// Page range configuration
public void setStartPage(int startPage);
public int getStartPage();
public void setEndPage(int endPage);
public int getEndPage();

Control text extraction formatting and layout preservation.
// Formatting configuration methods in PDFTextStripper
public void setSortByPosition(boolean sortByPosition);
public boolean getSortByPosition();
public void setLineSeparator(String separator);
public String getLineSeparator();
public void setWordSeparator(String separator);
public String getWordSeparator();
public void setAddMoreFormatting(boolean addMoreFormatting);
public boolean getAddMoreFormatting();
public void setSuppressDuplicateOverlappingText(boolean suppress);
public boolean getSuppressDuplicateOverlappingText();
public void setSpacingTolerance(float spacingTolerance);
public float getSpacingTolerance();
public void setAverageCharTolerance(float averageCharTolerance);
public float getAverageCharTolerance();

Extract text from specific rectangular regions of PDF pages.
// Constructor in org.apache.pdfbox.text.PDFTextStripperByArea
public PDFTextStripperByArea() throws IOException;
// Region management
public void addRegion(String regionName, Rectangle2D rect);
public void removeRegion(String regionName);
public List<String> getRegions();
// Text extraction from regions
public void extractRegions(PDPage page) throws IOException;
public String getTextForRegion(String regionName);

Customize text extraction behavior through method overrides.
// Protected methods in PDFTextStripper for customization
protected void processTextPosition(TextPosition text);
protected void writeString(String text) throws IOException;
protected void writeCharacters(TextPosition text) throws IOException;
protected void writeLineSeparator() throws IOException;
protected void writeWordSeparator() throws IOException;
protected void startPage(PDPage page) throws IOException;
protected void endPage(PDPage page) throws IOException;

Access detailed text positioning and formatting information.
// Methods in org.apache.pdfbox.text.TextPosition
public String getUnicode();
public float getX();
public float getY();
public float getWidth();
public float getHeight();
public float getWidthOfSpace();
public float getFontSize();
public PDFont getFont();
public Matrix getTextMatrix();
public float getDir();
public int getRotation();

// Example: extract the text of a whole document, then of a page range.
// try-with-resources closes the document even when extraction throws.
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    PDFTextStripper stripper = new PDFTextStripper();
    // Extract all text
    String text = stripper.getText(document);
    System.out.println(text);
    // Extract text from specific pages
    stripper.setStartPage(2);
    stripper.setEndPage(4);
    String pageRangeText = stripper.getText(document);
}

PDDocument document = Loader.loadPDF(new File("document.pdf"));
PDPage page = document.getPage(0);
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
// Define regions to extract text from.
// Regions are matched against TextPosition.getX()/getY(), whose origin is the
// top-left corner of the page with y growing downward (not PDF user space),
// so a header band needs a small y value. The numbers are illustrative.
Rectangle2D headerRegion = new Rectangle2D.Float(50, 40, 500, 50);
Rectangle2D contentRegion = new Rectangle2D.Float(50, 100, 500, 600);
stripper.addRegion("header", headerRegion);
stripper.addRegion("content", contentRegion);
// Extract text from regions
stripper.extractRegions(page);
String headerText = stripper.getTextForRegion("header");
String contentText = stripper.getTextForRegion("content");
System.out.println("Header: " + headerText);
System.out.println("Content: " + contentText);
document.close();

/**
 * Example PDFTextStripper subclass that records every glyph it sees in a
 * side buffer (prefixing glyphs whose font size exceeds 12 with "[LARGE] ")
 * and upper-cases the text written to the regular extraction output.
 */
public class CustomTextStripper extends PDFTextStripper {
    /** Side buffer filled while glyphs are processed; read via getCustomOutput(). */
    private final StringBuilder customOutput = new StringBuilder();

    /**
     * Creates the stripper with the base-class defaults.
     *
     * @throws IOException kept in the signature so existing callers that catch it still compile
     */
    public CustomTextStripper() throws IOException {
        super();
    }

    /**
     * Records the glyph in the side buffer, then forwards it to the base class.
     *
     * @param text the glyph being processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        // Custom processing logic
        if (text.getFontSize() > 12) {
            customOutput.append("[LARGE] ");
        }
        customOutput.append(text.getUnicode());
        // The base implementation collects the glyphs that getText() later writes out.
        // Without this call getText() yields no text and writeString() is never invoked.
        super.processTextPosition(text);
    }

    /**
     * Writes the given text to the extraction output in upper case.
     *
     * @param text the text about to be written
     * @throws IOException if the underlying writer fails
     */
    @Override
    protected void writeString(String text) throws IOException {
        // Locale.ROOT keeps the casing independent of the JVM default locale.
        super.writeString(text.toUpperCase(java.util.Locale.ROOT));
    }

    /**
     * @return everything recorded in the side buffer so far
     */
    public String getCustomOutput() {
        return customOutput.toString();
    }
}
// Usage
// try-with-resources closes the document even when extraction throws.
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    CustomTextStripper stripper = new CustomTextStripper();
    String result = stripper.getText(document);
    String customResult = stripper.getCustomOutput();
}

PDDocument document = Loader.loadPDF(new File("document.pdf"));
PDFTextStripper textStripper = new PDFTextStripper();

// Spacing tolerances
textStripper.setAverageCharTolerance(0.3f);
textStripper.setSpacingTolerance(0.5f);

// Separators used between words and between lines
textStripper.setWordSeparator(" ");
textStripper.setLineSeparator(System.lineSeparator());

// Layout and formatting switches
textStripper.setSuppressDuplicateOverlappingText(true);
textStripper.setAddMoreFormatting(true);
textStripper.setSortByPosition(true);

// Run the extraction with the configuration above
String formattedText = textStripper.getText(document);
document.close();

Install with Tessl CLI:
npx tessl i tessl/maven-org-apache-pdfbox--pdfbox