The Apache PDFBox library is an open source Java tool for working with PDF documents.
npx @tessl/cli install tessl/maven-org-apache-pdfbox--pdfbox@3.0.0

Apache PDFBox is a comprehensive Java library for programmatic manipulation of PDF documents. It provides capabilities for creating new PDFs, parsing existing documents, extracting content, rendering pages to images, and handling advanced features like forms, encryption, and digital signatures.
pom.xml:
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.5</version>
</dependency>

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import java.io.File;
import java.io.IOException;
// Load an existing PDF
PDDocument document = Loader.loadPDF(new File("example.pdf"));
// Create a new PDF; try-with-resources closes it even if writing or saving fails
try (PDDocument newDocument = new PDDocument()) {
    PDPage page = new PDPage(PDRectangle.A4);
    newDocument.addPage(page);
    // Add text to a page; the content stream must be closed before the document is saved
    try (PDPageContentStream contentStream = new PDPageContentStream(newDocument, page)) {
        contentStream.beginText();
        // PDFBox 3.x has no static PDType1Font.HELVETICA constant;
        // standard 14 fonts are created from Standard14Fonts.FontName instead
        contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12);
        contentStream.newLineAtOffset(100, 700);
        contentStream.showText("Hello, PDFBox!");
        contentStream.endText();
    }
    // Save; the enclosing try-with-resources closes newDocument afterwards
    newDocument.save("output.pdf");
}
document.close();

PDFBox is structured into several architectural layers:
Core document loading, creation, saving, and manipulation functionality. Essential for all PDF operations.
// Document loading
public static PDDocument loadPDF(File file) throws IOException;
public static PDDocument loadPDF(InputStream input) throws IOException;
public static PDDocument loadPDF(File file, String password) throws IOException;
// Document creation and manipulation
public void addPage(PDPage page);
public void removePage(int pageIndex);
public int getNumberOfPages();
public void save(File file) throws IOException;
public void close() throws IOException;

Comprehensive text extraction capabilities with support for area-based extraction, text positioning, and formatting control.
public String getText(PDDocument document) throws IOException;
public void setStartPage(int startPage);
public void setEndPage(int endPage);
public void addRegion(String regionName, Rectangle2D rect);
public String getTextForRegion(String regionName);

Convert PDF pages to images with control over resolution, color spaces, and rendering quality.
public BufferedImage renderImage(int pageIndex) throws IOException;
public BufferedImage renderImageWithDPI(int pageIndex, float dpi) throws IOException;
public BufferedImage renderImage(int pageIndex, float scale, ImageType imageType) throws IOException;

Utilities for merging, splitting, and overlaying multiple PDF documents with flexible configuration options.
public void mergeDocuments(MemoryUsageSetting memUsageSetting) throws IOException;
public List<PDDocument> split() throws IOException;
public void overlay(Map<Integer, String> overlayGuide) throws IOException;

Handle PDF forms including text fields, checkboxes, radio buttons, and form submission with full AcroForm support.
public PDAcroForm getAcroForm();
public List<PDField> getFields();
public void setValue(String value) throws IOException;
public String getValue();
public void flatten() throws IOException;

PDF encryption, decryption, access permissions, and digital signatures for document security.
public void encrypt(AccessPermission ap, StandardProtectionPolicy spp) throws IOException;
public boolean isEncrypted();
public void addSignature(PDSignature signature) throws IOException;
public List<PDSignature> getSignatureDictionaries();

Low-level content stream parsing and generation for advanced PDF content manipulation and custom rendering.
public void processPage(PDPage page) throws IOException;
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException;
public void processStream(PDContentStream contentStream, PDPage page, PDResources resources) throws IOException;

Direct manipulation of PDF objects using the Carousel Object System for advanced use cases and custom PDF structure handling.
public COSBase getItem(COSName key);
public void setItem(COSName key, COSBase value);
public void add(COSBase object);
public COSBase get(int index);