0
# Text Operations
1
2
Comprehensive text extraction capabilities for PDF documents, including whole-document text extraction, area-based extraction, and advanced text formatting options.
3
4
## Text Extraction
5
6
Extract text from entire documents or specific page ranges. Page numbers are 1-based, and the end page is inclusive.
7
8
```java { .api }
9
// Constructor in org.apache.pdfbox.text.PDFTextStripper
10
public PDFTextStripper();
11
12
// Main extraction method
13
public String getText(PDDocument document) throws IOException;
14
15
// Page range configuration
16
public void setStartPage(int startPage);
17
public int getStartPage();
18
public void setEndPage(int endPage);
19
public int getEndPage();
20
```
21
22
## Text Formatting Options
23
24
Control text extraction formatting and layout preservation.
25
26
```java { .api }
27
// Formatting configuration methods in PDFTextStripper
28
public void setSortByPosition(boolean sortByPosition);
29
public boolean getSortByPosition();
30
31
public void setLineSeparator(String separator);
32
public String getLineSeparator();
33
34
public void setWordSeparator(String separator);
35
public String getWordSeparator();
36
37
public void setAddMoreFormatting(boolean addMoreFormatting);
38
public boolean getAddMoreFormatting();
39
40
public void setSuppressDuplicateOverlappingText(boolean suppress);
41
public boolean getSuppressDuplicateOverlappingText();
42
43
public void setSpacingTolerance(float spacingTolerance);
44
public float getSpacingTolerance();
45
46
public void setAverageCharTolerance(float averageCharTolerance);
47
public float getAverageCharTolerance();
48
```
49
50
## Area-Based Text Extraction
51
52
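Extract text from specific rectangular regions of PDF pages. Region rectangles are given in points with the origin at the top-left corner of the page and y increasing downward.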
53
54
```java { .api }
55
// Constructor in org.apache.pdfbox.text.PDFTextStripperByArea
56
public PDFTextStripperByArea() throws IOException;
57
58
// Region management
59
public void addRegion(String regionName, Rectangle2D rect);
60
public void removeRegion(String regionName);
61
public List<String> getRegions();
62
63
// Text extraction from regions
64
public void extractRegions(PDPage page) throws IOException;
65
public String getTextForRegion(String regionName);
66
```
67
68
## Advanced Text Extraction
69
70
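Customize text extraction behavior through method overrides. An override that should keep the default extraction output must still call the corresponding `super` method.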
71
72
```java { .api }
73
// Protected methods in PDFTextStripper for customization
74
protected void processTextPosition(TextPosition text);
75
protected void writeString(String text) throws IOException;
76
protected void writeCharacters(TextPosition text) throws IOException;
77
protected void writeLineSeparator() throws IOException;
78
protected void writeWordSeparator() throws IOException;
79
protected void startPage(PDPage page) throws IOException;
80
protected void endPage(PDPage page) throws IOException;
81
```
82
83
## Text Position Information
84
85
Access detailed text positioning and formatting information.
86
87
```java { .api }
88
// Methods in org.apache.pdfbox.text.TextPosition
89
public String getUnicode();
90
public float getX();
91
public float getY();
92
public float getWidth();
93
public float getHeight();
94
public float getWidthOfSpace();
95
public float getFontSize();
96
public PDFont getFont();
97
public Matrix getTextMatrix();
98
public float getDir();
99
public int getRotation();
100
```
101
102
## Usage Examples
103
104
### Basic Text Extraction
105
106
```java
107
// try-with-resources closes the document even when extraction throws
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    PDFTextStripper stripper = new PDFTextStripper();

    // Extract all text
    String text = stripper.getText(document);
    System.out.println(text);

    // Extract text from specific pages (1-based; the end page is inclusive)
    stripper.setStartPage(2);
    stripper.setEndPage(4);
    String pageRangeText = stripper.getText(document);
}
120
```
121
122
### Area-Based Text Extraction
123
124
```java
125
// try-with-resources closes the document even when extraction throws
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    PDPage page = document.getPage(0);

    PDFTextStripperByArea stripper = new PDFTextStripperByArea();

    // Define regions to extract text from.
    // Regions are matched against TextPosition.getX()/getY(), which use a
    // top-left origin with y increasing downward (units: points). The header
    // therefore starts at y = 0, not near the page height.
    Rectangle2D headerRegion = new Rectangle2D.Float(50, 0, 500, 50);
    Rectangle2D contentRegion = new Rectangle2D.Float(50, 100, 500, 600);

    stripper.addRegion("header", headerRegion);
    stripper.addRegion("content", contentRegion);

    // Extract text from regions
    stripper.extractRegions(page);

    String headerText = stripper.getTextForRegion("header");
    String contentText = stripper.getTextForRegion("content");

    System.out.println("Header: " + headerText);
    System.out.println("Content: " + contentText);
}
147
```
148
149
### Custom Text Processing
150
151
```java
152
public class CustomTextStripper extends PDFTextStripper {
153
private StringBuilder customOutput = new StringBuilder();
154
155
public CustomTextStripper() throws IOException {
156
super();
157
}
158
159
@Override
160
protected void processTextPosition(TextPosition text) {
161
// Custom processing logic
162
if (text.getFontSize() > 12) {
163
customOutput.append("[LARGE] ");
164
}
165
customOutput.append(text.getUnicode());
166
}
167
168
@Override
169
protected void writeString(String text) throws IOException {
170
// Custom string writing logic
171
super.writeString(text.toUpperCase());
172
}
173
174
public String getCustomOutput() {
175
return customOutput.toString();
176
}
177
}
178
179
// Usage
180
// try-with-resources closes the document even when extraction throws
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    CustomTextStripper stripper = new CustomTextStripper();
    String result = stripper.getText(document);
    String customResult = stripper.getCustomOutput();
}
185
```
186
187
### Text Formatting Control
188
189
```java
190
// try-with-resources closes the document even when extraction throws
try (PDDocument document = Loader.loadPDF(new File("document.pdf"))) {
    PDFTextStripper stripper = new PDFTextStripper();

    // Configure text extraction options
    stripper.setSortByPosition(true);
    stripper.setLineSeparator(System.lineSeparator());
    stripper.setWordSeparator(" ");
    stripper.setAddMoreFormatting(true);
    stripper.setSuppressDuplicateOverlappingText(true);

    // Fine-tune spacing tolerances
    stripper.setSpacingTolerance(0.5f);
    stripper.setAverageCharTolerance(0.3f);

    String formattedText = stripper.getText(document);
}
206
```