# NLTK Integration

PyStow provides seamless integration with NLTK (Natural Language Toolkit) for managing linguistic data resources. This integration ensures that NLTK data is downloaded and stored in standardized locations that PyStow can manage.

## NLTK Data Management

### NLTK Resource Download

```python { .api }
def ensure_nltk(resource: str = "stopwords") -> tuple[Path, bool]:
    """Ensure NLTK data is downloaded in a standard way.

    Args:
        resource: Name of the resource to download, e.g., stopwords

    Returns:
        A pair of the NLTK cache directory and a boolean that says if download was successful

    Note:
        This function also appends the standard PyStow location for NLTK data to the
        nltk.data.path list so any downstream users of NLTK will know how to find it
        automatically.
    """
```

## Usage Examples

### Basic NLTK Data Download

```python
import pystow
import nltk

# Download NLTK stopwords data
nltk_path, success = pystow.ensure_nltk("stopwords")

if success:
    print(f"NLTK data stored at: {nltk_path}")

    # Use NLTK with the downloaded data
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    print(f"Loaded {len(stop_words)} English stopwords")
```

### Downloading Multiple NLTK Resources

```python
import pystow
import nltk

# Download various NLTK resources
nltk_resources = [
    "stopwords",
    "punkt",
    "wordnet",
    "averaged_perceptron_tagger",
    "vader_lexicon"
]

downloaded_resources = {}
for resource in nltk_resources:
    path, success = pystow.ensure_nltk(resource)
    downloaded_resources[resource] = {"path": path, "success": success}

    if success:
        print(f"✓ Downloaded {resource}")
    else:
        print(f"✗ Failed to download {resource}")

# Use the downloaded resources
if downloaded_resources["punkt"]["success"]:
    from nltk.tokenize import sent_tokenize, word_tokenize

    text = "Hello world. This is a test sentence."
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    print(f"Sentences: {sentences}")
    print(f"Words: {words}")
```

### Text Processing Pipeline with NLTK

```python
import pystow
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def setup_nltk_resources():
    """Set up required NLTK resources"""
    resources = ["stopwords", "punkt", "wordnet", "omw-1.4"]

    for resource in resources:
        path, success = pystow.ensure_nltk(resource)
        if not success:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}")

    print("All NLTK resources downloaded successfully")

def preprocess_text(text):
    """Preprocess text using NLTK"""
    # Ensure NLTK resources are available
    setup_nltk_resources()

    # Tokenize
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return lemmatized_tokens

# Use the preprocessing pipeline
text = "The quick brown foxes are jumping over the lazy dogs."
processed_tokens = preprocess_text(text)
print(f"Processed tokens: {processed_tokens}")
```

### NLTK Data Management for Applications

```python
import pystow
import nltk

class NLTKManager:
    """Manage NLTK data downloads for an application"""

    def __init__(self, app_name="nlp_app"):
        self.app_name = app_name
        self.required_resources = []

    def add_resource(self, resource_name):
        """Add a required NLTK resource"""
        self.required_resources.append(resource_name)

    def setup_resources(self):
        """Download all required NLTK resources"""
        results = {}

        for resource in self.required_resources:
            print(f"Downloading NLTK resource: {resource}")
            path, success = pystow.ensure_nltk(resource)
            results[resource] = {
                "path": path,
                "success": success
            }

            if success:
                print(f"✓ {resource} downloaded to {path}")
            else:
                print(f"✗ Failed to download {resource}")

        return results

    def verify_resources(self):
        """Verify that all required resources are available"""
        missing = []

        for resource in self.required_resources:
            try:
                nltk.data.find(f"{resource}")
            except LookupError:
                missing.append(resource)

        if missing:
            print(f"Missing NLTK resources: {missing}")
            return False

        print("All NLTK resources are available")
        return True

# Usage
nltk_manager = NLTKManager("sentiment_analyzer")
nltk_manager.add_resource("vader_lexicon")
nltk_manager.add_resource("punkt")
nltk_manager.add_resource("stopwords")

# Setup resources
download_results = nltk_manager.setup_resources()

# Verify setup
if nltk_manager.verify_resources():
    # Proceed with NLTK operations
    from nltk.sentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    text = "PyStow makes managing NLTK data so much easier!"

    scores = analyzer.polarity_scores(text)
    print(f"Sentiment scores: {scores}")
```

### Error Handling and Fallbacks

```python
import pystow
import nltk

def safe_nltk_download(resource, max_retries=3):
    """Safely download NLTK resource with retries"""

    for attempt in range(max_retries):
        try:
            path, success = pystow.ensure_nltk(resource)

            if success:
                print(f"Successfully downloaded {resource} on attempt {attempt + 1}")
                return path, True
            else:
                print(f"Download failed for {resource} on attempt {attempt + 1}")

        except Exception as e:
            print(f"Error downloading {resource} on attempt {attempt + 1}: {e}")

        if attempt < max_retries - 1:
            print(f"Retrying download for {resource}...")

    print(f"Failed to download {resource} after {max_retries} attempts")
    return None, False

def setup_nltk_with_fallback():
    """Set up NLTK with fallback options"""

    # Try to download preferred resources
    preferred_resources = ["stopwords", "punkt", "wordnet"]
    fallback_resources = ["stopwords"]  # Minimal set

    downloaded = []
    failed = []

    for resource in preferred_resources:
        path, success = safe_nltk_download(resource)
        if success:
            downloaded.append(resource)
        else:
            failed.append(resource)

    # If critical resources failed, try fallback
    if not downloaded:
        print("No resources downloaded, trying fallback...")
        for resource in fallback_resources:
            path, success = safe_nltk_download(resource)
            if success:
                downloaded.append(resource)

    return downloaded, failed

# Use fallback setup
downloaded, failed = setup_nltk_with_fallback()
print(f"Downloaded: {downloaded}")
print(f"Failed: {failed}")

# Proceed with available resources
if "stopwords" in downloaded:
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    print(f"Using {len(stop_words)} stopwords")
```

### Custom NLTK Data Locations

```python
import pystow
import nltk
import os

def setup_custom_nltk_location():
    """Set up NLTK with custom PyStow location"""

    # Download NLTK data to PyStow managed location
    nltk_path, success = pystow.ensure_nltk("stopwords")

    if success:
        # The NLTK path is automatically added to nltk.data.path
        print(f"NLTK data path: {nltk_path}")
        print(f"NLTK search paths: {nltk.data.path}")

    # You can also manually configure additional paths
    custom_module = pystow.module("custom_nltk")
    custom_path = custom_module.join("data")

    if custom_path not in nltk.data.path:
        nltk.data.path.append(str(custom_path))
        print(f"Added custom NLTK path: {custom_path}")

# Setup custom locations
setup_custom_nltk_location()

# Verify NLTK can find its data
try:
    from nltk.corpus import stopwords
    words = stopwords.words('english')
    print(f"Successfully loaded {len(words)} stopwords")
except LookupError as e:
    print(f"NLTK data not found: {e}")
```

## Integration Benefits

### Standardized Data Management

- **Consistent Locations**: NLTK data is stored in PyStow-managed directories
- **Cross-Platform**: Works consistently across different operating systems
- **Version Control**: PyStow's versioning system can be applied to NLTK data

### Simplified Deployment

- **Reproducible Environments**: NLTK data management is consistent across deployments
- **Containerization**: Easy to package NLTK data with applications
- **CI/CD Integration**: Reliable NLTK data setup in automated pipelines

### Configuration Integration

- **Environment Variables**: Use PyStow's configuration system for NLTK settings
- **Application Settings**: Integrate NLTK data management with app configuration

```python
import pystow

# Configure NLTK data location via PyStow config
nltk_data_path = pystow.get_config(
    "nltk", "data_path",
    default=None
)

if nltk_data_path:
    import nltk
    nltk.data.path.insert(0, nltk_data_path)

# Download with configuration
resource = pystow.get_config("nltk", "default_resource", default="stopwords")
path, success = pystow.ensure_nltk(resource)
```