Tessl Tile for maven/org.apache.pdfbox/pdfbox@3.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

content-stream-processing.md cos-operations.md document-operations.md index.md interactive-forms.md multi-pdf-operations.md rendering-graphics.md security-encryption.md text-operations.md

docs/text-operations.md

0
# Text Operations
1

2
Comprehensive text extraction capabilities for PDF documents, including whole-document text extraction, area-based extraction, and advanced text formatting options.
3

4
## Text Extraction
5

6
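Extract text from entire documents or specific page ranges. Page numbers are 1-based, and the end page is inclusive (start page 2 and end page 4 extracts pages 2, 3, and 4).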
7

8
```java { .api }
9
// Constructor in org.apache.pdfbox.text.PDFTextStripper
10
public PDFTextStripper();
11

12
// Main extraction method
13
public String getText(PDDocument document) throws IOException;
14

15
// Page range configuration
16
public void setStartPage(int startPage);
17
public int getStartPage();
18
public void setEndPage(int endPage);
19
public int getEndPage();
20
```
21

22
## Text Formatting Options
23

24
Control text extraction formatting and layout preservation.
25

26
```java { .api }
27
// Formatting configuration methods in PDFTextStripper
28
public void setSortByPosition(boolean sortByPosition);
29
public boolean getSortByPosition();
30

31
public void setLineSeparator(String separator);
32
public String getLineSeparator();
33

34
public void setWordSeparator(String separator);
35
public String getWordSeparator();
36

37
public void setAddMoreFormatting(boolean addMoreFormatting);
38
public boolean getAddMoreFormatting();
39

40
public void setSuppressDuplicateOverlappingText(boolean suppress);
41
public boolean getSuppressDuplicateOverlappingText();
42

43
public void setSpacingTolerance(float spacingTolerance);
44
public float getSpacingTolerance();
45

46
public void setAverageCharTolerance(float averageCharTolerance);
47
public float getAverageCharTolerance();
48
```
49

50
## Area-Based Text Extraction
51

52
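Extract text from specific rectangular regions of PDF pages. Region rectangles are matched against the text positions reported by the stripper, which use a top-left page origin with y increasing downward (not the bottom-left origin of PDF user space), so a region near the top of the page has a small y value.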
53

54
```java { .api }
55
// Constructor in org.apache.pdfbox.text.PDFTextStripperByArea
56
public PDFTextStripperByArea() throws IOException;
57

58
// Region management
59
public void addRegion(String regionName, Rectangle2D rect);
60
public void removeRegion(String regionName);
61
public List<String> getRegions();
62

63
// Text extraction from regions
64
public void extractRegions(PDPage page) throws IOException;
65
public String getTextForRegion(String regionName);
66
```
67

68
## Advanced Text Extraction
69

70
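Customize text extraction behavior through method overrides. An override of `processTextPosition` should call `super.processTextPosition(text)` if the standard `getText` output is still required; otherwise the base class collects no characters and the `write*` hooks receive no text.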
71

72
```java { .api }
73
// Protected methods in PDFTextStripper for customization
74
protected void processTextPosition(TextPosition text);
75
protected void writeString(String text) throws IOException;
76
protected void writeCharacters(TextPosition text) throws IOException;
77
protected void writeLineSeparator() throws IOException;
78
protected void writeWordSeparator() throws IOException;
79
protected void startPage(PDPage page) throws IOException;
80
protected void endPage(PDPage page) throws IOException;
81
```
82

83
## Text Position Information
84

85
Access detailed text positioning and formatting information.
86

87
```java { .api }
88
// Methods in org.apache.pdfbox.text.TextPosition
89
public String getUnicode();
90
public float getX();
91
public float getY();
92
public float getWidth();
93
public float getHeight();
94
public float getWidthOfSpace();
95
public float getFontSize();
96
public PDFont getFont();
97
public Matrix getTextMatrix();
98
public float getDir();
99
public int getRotation();
100
```
101

102
## Usage Examples
103

104
### Basic Text Extraction
105

106
```java
107
PDDocument document = Loader.loadPDF(new File("document.pdf"));
108
PDFTextStripper stripper = new PDFTextStripper();
109

110
// Extract all text
111
String text = stripper.getText(document);
112
System.out.println(text);
113

114
// Extract text from specific pages
115
stripper.setStartPage(2);
116
stripper.setEndPage(4);
117
String pageRangeText = stripper.getText(document);
118

119
document.close();
120
```
121

122
### Area-Based Text Extraction
123

124
```java
125
PDDocument document = Loader.loadPDF(new File("document.pdf"));
126
PDPage page = document.getPage(0);
127

128
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
129

130
// Define regions to extract text from
131
Rectangle2D headerRegion = new Rectangle2D.Float(50, 750, 500, 50);
132
Rectangle2D contentRegion = new Rectangle2D.Float(50, 100, 500, 600);
133

134
stripper.addRegion("header", headerRegion);
135
stripper.addRegion("content", contentRegion);
136

137
// Extract text from regions
138
stripper.extractRegions(page);
139

140
String headerText = stripper.getTextForRegion("header");
141
String contentText = stripper.getTextForRegion("content");
142

143
System.out.println("Header: " + headerText);
144
System.out.println("Content: " + contentText);
145

146
document.close();
147
```
148

149
### Custom Text Processing
150

151
```java
152
public class CustomTextStripper extends PDFTextStripper {
153
    private StringBuilder customOutput = new StringBuilder();
154
    
155
    public CustomTextStripper() throws IOException {
156
        super();
157
    }
158
    
159
    @Override
160
    protected void processTextPosition(TextPosition text) {
161
        // Custom processing logic
162
        if (text.getFontSize() > 12) {
163
            customOutput.append("[LARGE] ");
164
        }
165
        customOutput.append(text.getUnicode());
166
    }
167
    
168
    @Override
169
    protected void writeString(String text) throws IOException {
170
        // Custom string writing logic
171
        super.writeString(text.toUpperCase());
172
    }
173
    
174
    public String getCustomOutput() {
175
        return customOutput.toString();
176
    }
177
}
178

179
// Usage
180
PDDocument document = Loader.loadPDF(new File("document.pdf"));
181
CustomTextStripper stripper = new CustomTextStripper();
182
String result = stripper.getText(document);
183
String customResult = stripper.getCustomOutput();
184
document.close();
185
```
186

187
### Text Formatting Control
188

189
```java
190
PDDocument document = Loader.loadPDF(new File("document.pdf"));
191
PDFTextStripper stripper = new PDFTextStripper();
192

193
// Configure text extraction options
194
stripper.setSortByPosition(true);
195
stripper.setLineSeparator(System.lineSeparator());
196
stripper.setWordSeparator(" ");
197
stripper.setAddMoreFormatting(true);
198
stripper.setSuppressDuplicateOverlappingText(true);
199

200
// Fine-tune spacing tolerances
201
stripper.setSpacingTolerance(0.5f);
202
stripper.setAverageCharTolerance(0.3f);
203

204
String formattedText = stripper.getText(document);
205
document.close();
206
```

Version

Tile

Files

docs/text-operations.md

docs/text-operations.md