or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

content-stream-processing.md · cos-operations.md · document-operations.md · index.md · interactive-forms.md · multi-pdf-operations.md · rendering-graphics.md · security-encryption.md · text-operations.md

docs/text-operations.md

0

# Text Operations

1

2

Comprehensive text extraction capabilities for PDF documents, including whole-document text extraction, area-based extraction, and advanced text formatting options.

3

4

## Text Extraction

5

6

Extract text from entire documents or specific page ranges.

7

8

```java { .api }

9

// Constructor in org.apache.pdfbox.text.PDFTextStripper

10

public PDFTextStripper();

11

12

// Main extraction method

13

public String getText(PDDocument document) throws IOException;

14

15

// Page range configuration

16

public void setStartPage(int startPage);

17

public int getStartPage();

18

public void setEndPage(int endPage);

19

public int getEndPage();

20

```

21

22

## Text Formatting Options

23

24

Control text extraction formatting and layout preservation.

25

26

```java { .api }

27

// Formatting configuration methods in PDFTextStripper

28

public void setSortByPosition(boolean sortByPosition);

29

public boolean getSortByPosition();

30

31

public void setLineSeparator(String separator);

32

public String getLineSeparator();

33

34

public void setWordSeparator(String separator);

35

public String getWordSeparator();

36

37

public void setAddMoreFormatting(boolean addMoreFormatting);

38

public boolean getAddMoreFormatting();

39

40

public void setSuppressDuplicateOverlappingText(boolean suppress);

41

public boolean getSuppressDuplicateOverlappingText();

42

43

public void setSpacingTolerance(float spacingTolerance);

44

public float getSpacingTolerance();

45

46

public void setAverageCharTolerance(float averageCharTolerance);

47

public float getAverageCharTolerance();

48

```

49

50

## Area-Based Text Extraction

51

52

Extract text from specific rectangular regions of PDF pages.

53

54

```java { .api }

55

// Constructor in org.apache.pdfbox.text.PDFTextStripperByArea

56

public PDFTextStripperByArea() throws IOException;

57

58

// Region management

59

public void addRegion(String regionName, Rectangle2D rect);

60

public void removeRegion(String regionName);

61

public List<String> getRegions();

62

63

// Text extraction from regions

64

public void extractRegions(PDPage page) throws IOException;

65

public String getTextForRegion(String regionName);

66

```

67

68

## Advanced Text Extraction

69

70

Customize text extraction behavior through method overrides.

71

72

```java { .api }

73

// Protected methods in PDFTextStripper for customization

74

protected void processTextPosition(TextPosition text);

75

protected void writeString(String text) throws IOException;

76

protected void writeCharacters(TextPosition text) throws IOException;

77

protected void writeLineSeparator() throws IOException;

78

protected void writeWordSeparator() throws IOException;

79

protected void startPage(PDPage page) throws IOException;

80

protected void endPage(PDPage page) throws IOException;

81

```

82

83

## Text Position Information

84

85

Access detailed text positioning and formatting information.

86

87

```java { .api }

88

// Methods in org.apache.pdfbox.text.TextPosition

89

public String getUnicode();

90

public float getX();

91

public float getY();

92

public float getWidth();

93

public float getHeight();

94

public float getWidthOfSpace();

95

public float getFontSize();

96

public PDFont getFont();

97

public Matrix getTextMatrix();

98

public float getDir();

99

public int getRotation();

100

```

101

102

## Usage Examples

103

104

### Basic Text Extraction

105

106

```java

107

PDDocument document = Loader.loadPDF(new File("document.pdf"));

108

PDFTextStripper stripper = new PDFTextStripper();

109

110

// Extract all text

111

String text = stripper.getText(document);

112

System.out.println(text);

113

114

// Extract text from pages 2 through 4 (page numbers are 1-based; the end page is inclusive)

115

stripper.setStartPage(2);

116

stripper.setEndPage(4);

117

String pageRangeText = stripper.getText(document);

118

119

document.close();

120

```

121

122

### Area-Based Text Extraction

123

124

```java

125

PDDocument document = Loader.loadPDF(new File("document.pdf"));

126

PDPage page = document.getPage(0);

127

128

PDFTextStripperByArea stripper = new PDFTextStripperByArea();

129

130

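// Define regions to extract text from (x, y, width, height). Regions are matched against TextPosition coordinates, whose origin is the top-left corner of the page with y increasing downward, so choose y values accordingly.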

131

Rectangle2D headerRegion = new Rectangle2D.Float(50, 750, 500, 50);

132

Rectangle2D contentRegion = new Rectangle2D.Float(50, 100, 500, 600);

133

134

stripper.addRegion("header", headerRegion);

135

stripper.addRegion("content", contentRegion);

136

137

// Extract text from regions

138

stripper.extractRegions(page);

139

140

String headerText = stripper.getTextForRegion("header");

141

String contentText = stripper.getTextForRegion("content");

142

143

System.out.println("Header: " + headerText);

144

System.out.println("Content: " + contentText);

145

146

document.close();

147

```

148

149

### Custom Text Processing

150

151

```java

152

public class CustomTextStripper extends PDFTextStripper {

153

private StringBuilder customOutput = new StringBuilder();

154

155

public CustomTextStripper() throws IOException {

156

super();

157

}

158

159

@Override

160

protected void processTextPosition(TextPosition text) {

161

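// Custom processing logic. NOTE: super.processTextPosition(text) is not called here, so the base stripper does not collect this text; call it as well if getText() and writeString() should still receive the page text.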

162

if (text.getFontSize() > 12) {

163

customOutput.append("[LARGE] ");

164

}

165

customOutput.append(text.getUnicode());

166

}

167

168

@Override

169

protected void writeString(String text) throws IOException {

170

// Custom string writing logic

171

super.writeString(text.toUpperCase());

172

}

173

174

public String getCustomOutput() {

175

return customOutput.toString();

176

}

177

}

178

179

// Usage

180

PDDocument document = Loader.loadPDF(new File("document.pdf"));

181

CustomTextStripper stripper = new CustomTextStripper();

182

String result = stripper.getText(document);

183

String customResult = stripper.getCustomOutput();

184

document.close();

185

```

186

187

### Text Formatting Control

188

189

```java

190

PDDocument document = Loader.loadPDF(new File("document.pdf"));

191

PDFTextStripper stripper = new PDFTextStripper();

192

193

// Configure text extraction options

194

stripper.setSortByPosition(true);

195

stripper.setLineSeparator(System.lineSeparator());

196

stripper.setWordSeparator(" ");

197

stripper.setAddMoreFormatting(true);

198

stripper.setSuppressDuplicateOverlappingText(true);

199

200

// Fine-tune spacing tolerances

201

stripper.setSpacingTolerance(0.5f);

202

stripper.setAverageCharTolerance(0.3f);

203

204

String formattedText = stripper.getText(document);

205

document.close();

206

```