# NLTK Integration

PyStow provides seamless integration with NLTK (Natural Language Toolkit) for managing linguistic data resources. This integration ensures that NLTK data is downloaded and stored in standardized locations that PyStow can manage.

## NLTK Data Management

### NLTK Resource Download

```python { .api }
def ensure_nltk(resource: str = "stopwords") -> tuple[Path, bool]:
    """Ensure NLTK data is downloaded in a standard way.

    Args:
        resource: Name of the resource to download, e.g., stopwords

    Returns:
        A pair of the NLTK cache directory and a boolean that says if download was successful

    Note:
        This function also appends the standard PyStow location for NLTK data to the
        nltk.data.path list so any downstream users of NLTK will know how to find it
        automatically.
    """
```

## Usage Examples

### Basic NLTK Data Download

```python
import pystow
import nltk

# Download NLTK stopwords data
nltk_path, success = pystow.ensure_nltk("stopwords")

if success:
    print(f"NLTK data stored at: {nltk_path}")

    # Use NLTK with the downloaded data
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    print(f"Loaded {len(stop_words)} English stopwords")
```

### Downloading Multiple NLTK Resources

```python
import pystow
import nltk

# Download various NLTK resources
nltk_resources = [
    "stopwords",
    "punkt",
    "wordnet",
    "averaged_perceptron_tagger",
    "vader_lexicon"
]

downloaded_resources = {}
for resource in nltk_resources:
    path, success = pystow.ensure_nltk(resource)
    downloaded_resources[resource] = {"path": path, "success": success}

    if success:
        print(f"✓ Downloaded {resource}")
    else:
        print(f"✗ Failed to download {resource}")

# Use the downloaded resources
if downloaded_resources["punkt"]["success"]:
    from nltk.tokenize import sent_tokenize, word_tokenize

    text = "Hello world. This is a test sentence."
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    print(f"Sentences: {sentences}")
    print(f"Words: {words}")
```

### Text Processing Pipeline with NLTK

```python
import pystow
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def setup_nltk_resources():
    """Set up required NLTK resources"""
    resources = ["stopwords", "punkt", "wordnet", "omw-1.4"]

    for resource in resources:
        path, success = pystow.ensure_nltk(resource)
        if not success:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}")

    print("All NLTK resources downloaded successfully")

def preprocess_text(text):
    """Preprocess text using NLTK"""
    # Ensure NLTK resources are available
    setup_nltk_resources()

    # Tokenize
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return lemmatized_tokens

# Use the preprocessing pipeline
text = "The quick brown foxes are jumping over the lazy dogs."
processed_tokens = preprocess_text(text)
print(f"Processed tokens: {processed_tokens}")
```

### NLTK Data Management for Applications

```python
import pystow
import nltk

class NLTKManager:
    """Manage NLTK data downloads for an application"""

    def __init__(self, app_name="nlp_app"):
        self.app_name = app_name
        self.required_resources = []

    def add_resource(self, resource_name):
        """Add a required NLTK resource"""
        self.required_resources.append(resource_name)

    def setup_resources(self):
        """Download all required NLTK resources"""
        results = {}

        for resource in self.required_resources:
            print(f"Downloading NLTK resource: {resource}")
            path, success = pystow.ensure_nltk(resource)
            results[resource] = {
                "path": path,
                "success": success
            }

            if success:
                print(f"✓ {resource} downloaded to {path}")
            else:
                print(f"✗ Failed to download {resource}")

        return results

    def verify_resources(self):
        """Verify that all required resources are available"""
        missing = []

        for resource in self.required_resources:
            try:
                nltk.data.find(f"{resource}")
            except LookupError:
                missing.append(resource)

        if missing:
            print(f"Missing NLTK resources: {missing}")
            return False

        print("All NLTK resources are available")
        return True

# Usage
nltk_manager = NLTKManager("sentiment_analyzer")
nltk_manager.add_resource("vader_lexicon")
nltk_manager.add_resource("punkt")
nltk_manager.add_resource("stopwords")

# Setup resources
download_results = nltk_manager.setup_resources()

# Verify setup
if nltk_manager.verify_resources():
    # Proceed with NLTK operations
    from nltk.sentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    text = "PyStow makes managing NLTK data so much easier!"

    scores = analyzer.polarity_scores(text)
    print(f"Sentiment scores: {scores}")
```

### Error Handling and Fallbacks

```python
import pystow
import nltk

def safe_nltk_download(resource, max_retries=3):
    """Safely download NLTK resource with retries"""

    for attempt in range(max_retries):
        try:
            path, success = pystow.ensure_nltk(resource)

            if success:
                print(f"Successfully downloaded {resource} on attempt {attempt + 1}")
                return path, True
            else:
                print(f"Download failed for {resource} on attempt {attempt + 1}")

        except Exception as e:
            print(f"Error downloading {resource} on attempt {attempt + 1}: {e}")

        if attempt < max_retries - 1:
            print(f"Retrying download for {resource}...")

    print(f"Failed to download {resource} after {max_retries} attempts")
    return None, False

def setup_nltk_with_fallback():
    """Set up NLTK with fallback options"""

    # Try to download preferred resources
    preferred_resources = ["stopwords", "punkt", "wordnet"]
    fallback_resources = ["stopwords"]  # Minimal set

    downloaded = []
    failed = []

    for resource in preferred_resources:
        path, success = safe_nltk_download(resource)
        if success:
            downloaded.append(resource)
        else:
            failed.append(resource)

    # If critical resources failed, try fallback
    if not downloaded:
        print("No resources downloaded, trying fallback...")
        for resource in fallback_resources:
            path, success = safe_nltk_download(resource)
            if success:
                downloaded.append(resource)

    return downloaded, failed

# Use fallback setup
downloaded, failed = setup_nltk_with_fallback()
print(f"Downloaded: {downloaded}")
print(f"Failed: {failed}")

# Proceed with available resources
if "stopwords" in downloaded:
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    print(f"Using {len(stop_words)} stopwords")
```

### Custom NLTK Data Locations

```python
import pystow
import nltk
import os

def setup_custom_nltk_location():
    """Set up NLTK with custom PyStow location"""

    # Download NLTK data to PyStow managed location
    nltk_path, success = pystow.ensure_nltk("stopwords")

    if success:
        # The NLTK path is automatically added to nltk.data.path
        print(f"NLTK data path: {nltk_path}")
        print(f"NLTK search paths: {nltk.data.path}")

    # You can also manually configure additional paths
    custom_module = pystow.module("custom_nltk")
    custom_path = custom_module.join("data")

    if custom_path not in nltk.data.path:
        nltk.data.path.append(str(custom_path))
        print(f"Added custom NLTK path: {custom_path}")

# Setup custom locations
setup_custom_nltk_location()

# Verify NLTK can find its data
try:
    from nltk.corpus import stopwords
    words = stopwords.words('english')
    print(f"Successfully loaded {len(words)} stopwords")
except LookupError as e:
    print(f"NLTK data not found: {e}")
```

## Integration Benefits

### Standardized Data Management

- **Consistent Locations**: NLTK data is stored in PyStow-managed directories
- **Cross-Platform**: Works consistently across different operating systems
- **Version Control**: PyStow's versioning system can be applied to NLTK data

### Simplified Deployment

- **Reproducible Environments**: NLTK data management is consistent across deployments
- **Containerization**: Easy to package NLTK data with applications
- **CI/CD Integration**: Reliable NLTK data setup in automated pipelines

### Configuration Integration

- **Environment Variables**: Use PyStow's configuration system for NLTK settings
- **Application Settings**: Integrate NLTK data management with app configuration

```python
import pystow

# Configure NLTK data location via PyStow config
nltk_data_path = pystow.get_config(
    "nltk", "data_path",
    default=None
)

if nltk_data_path:
    import nltk
    nltk.data.path.insert(0, nltk_data_path)

# Download with configuration
resource = pystow.get_config("nltk", "default_resource", default="stopwords")
path, success = pystow.ensure_nltk(resource)
```