docs/nltk-integration.md

# NLTK Integration

PyStow provides seamless integration with NLTK (Natural Language Toolkit) for managing linguistic data resources. This integration ensures that NLTK data is downloaded and stored in standardized locations that PyStow can manage.

## NLTK Data Management

### NLTK Resource Download

```python { .api }
def ensure_nltk(resource: str = "stopwords") -> tuple[Path, bool]:
    """Ensure NLTK data is downloaded in a standard way.

    Args:
        resource: Name of the resource to download, e.g., stopwords

    Returns:
        A pair of the NLTK cache directory and a boolean that says if download was successful

    Note:
        This function also appends the standard PyStow location for NLTK data to the
        nltk.data.path list so any downstream users of NLTK will know how to find it
        automatically.
    """
```

## Usage Examples

### Basic NLTK Data Download

```python
import pystow
import nltk

# Download NLTK stopwords data
nltk_path, success = pystow.ensure_nltk("stopwords")

if success:
    print(f"NLTK data stored at: {nltk_path}")

    # Use NLTK with the downloaded data
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    print(f"Loaded {len(stop_words)} English stopwords")
```

### Downloading Multiple NLTK Resources

```python
import pystow
import nltk

# Download various NLTK resources
nltk_resources = [
    "stopwords",
    "punkt",
    "wordnet",
    "averaged_perceptron_tagger",
    "vader_lexicon"
]

downloaded_resources = {}
for resource in nltk_resources:
    path, success = pystow.ensure_nltk(resource)
    downloaded_resources[resource] = {"path": path, "success": success}

    if success:
        print(f"✓ Downloaded {resource}")
    else:
        print(f"✗ Failed to download {resource}")

# Use the downloaded resources
if downloaded_resources["punkt"]["success"]:
    from nltk.tokenize import sent_tokenize, word_tokenize

    text = "Hello world. This is a test sentence."
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    print(f"Sentences: {sentences}")
    print(f"Words: {words}")
```

### Text Processing Pipeline with NLTK

```python
import pystow
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def setup_nltk_resources():
    """Setup required NLTK resources"""
    resources = ["stopwords", "punkt", "wordnet", "omw-1.4"]

    for resource in resources:
        path, success = pystow.ensure_nltk(resource)
        if not success:
            raise RuntimeError(f"Failed to download NLTK resource: {resource}")

    print("All NLTK resources downloaded successfully")

def preprocess_text(text):
    """Preprocess text using NLTK"""
    # Ensure NLTK resources are available
    setup_nltk_resources()

    # Tokenize
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return lemmatized_tokens

# Use the preprocessing pipeline
text = "The quick brown foxes are jumping over the lazy dogs."
processed_tokens = preprocess_text(text)
print(f"Processed tokens: {processed_tokens}")
```

### NLTK Data Management for Applications

```python
import pystow
import nltk

class NLTKManager:
    """Manage NLTK data downloads for an application"""

    def __init__(self, app_name="nlp_app"):
        self.app_name = app_name
        self.required_resources = []

    def add_resource(self, resource_name):
        """Add a required NLTK resource"""
        self.required_resources.append(resource_name)

    def setup_resources(self):
        """Download all required NLTK resources"""
        results = {}

        for resource in self.required_resources:
            print(f"Downloading NLTK resource: {resource}")
            path, success = pystow.ensure_nltk(resource)
            results[resource] = {
                "path": path,
                "success": success
            }

            if success:
                print(f"✓ {resource} downloaded to {path}")
            else:
                print(f"✗ Failed to download {resource}")

        return results

    def verify_resources(self):
        """Verify that all required resources are available"""
        missing = []

        for resource in self.required_resources:
            try:
                nltk.data.find(f"{resource}")
            except LookupError:
                missing.append(resource)

        if missing:
            print(f"Missing NLTK resources: {missing}")
            return False

        print("All NLTK resources are available")
        return True

# Usage
nltk_manager = NLTKManager("sentiment_analyzer")
nltk_manager.add_resource("vader_lexicon")
nltk_manager.add_resource("punkt")
nltk_manager.add_resource("stopwords")

# Setup resources
download_results = nltk_manager.setup_resources()

# Verify setup
if nltk_manager.verify_resources():
    # Proceed with NLTK operations
    from nltk.sentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    text = "PyStow makes managing NLTK data so much easier!"

    scores = analyzer.polarity_scores(text)
    print(f"Sentiment scores: {scores}")
```

### Error Handling and Fallbacks

```python
import pystow
import nltk

def safe_nltk_download(resource, max_retries=3):
    """Safely download NLTK resource with retries"""

    for attempt in range(max_retries):
        try:
            path, success = pystow.ensure_nltk(resource)

            if success:
                print(f"Successfully downloaded {resource} on attempt {attempt + 1}")
                return path, True
            else:
                print(f"Download failed for {resource} on attempt {attempt + 1}")

        except Exception as e:
            print(f"Error downloading {resource} on attempt {attempt + 1}: {e}")

        if attempt < max_retries - 1:
            print(f"Retrying download for {resource}...")

    print(f"Failed to download {resource} after {max_retries} attempts")
    return None, False

def setup_nltk_with_fallback():
    """Setup NLTK with fallback options"""

    # Try to download preferred resources
    preferred_resources = ["stopwords", "punkt", "wordnet"]
    fallback_resources = ["stopwords"]  # Minimal set

    downloaded = []
    failed = []

    for resource in preferred_resources:
        path, success = safe_nltk_download(resource)
        if success:
            downloaded.append(resource)
        else:
            failed.append(resource)

    # If critical resources failed, try fallback
    if not downloaded:
        print("No resources downloaded, trying fallback...")
        for resource in fallback_resources:
            path, success = safe_nltk_download(resource)
            if success:
                downloaded.append(resource)

    return downloaded, failed

# Use fallback setup
downloaded, failed = setup_nltk_with_fallback()
print(f"Downloaded: {downloaded}")
print(f"Failed: {failed}")

# Proceed with available resources
if "stopwords" in downloaded:
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    print(f"Using {len(stop_words)} stopwords")
```

### Custom NLTK Data Locations

```python
import pystow
import nltk
import os

def setup_custom_nltk_location():
    """Setup NLTK with custom PyStow location"""

    # Download NLTK data to PyStow managed location
    nltk_path, success = pystow.ensure_nltk("stopwords")

    if success:
        # The NLTK path is automatically added to nltk.data.path
        print(f"NLTK data path: {nltk_path}")
        print(f"NLTK search paths: {nltk.data.path}")

    # You can also manually configure additional paths
    custom_module = pystow.module("custom_nltk")
    custom_path = custom_module.join("data")

    if custom_path not in nltk.data.path:
        nltk.data.path.append(str(custom_path))
        print(f"Added custom NLTK path: {custom_path}")

# Setup custom locations
setup_custom_nltk_location()

# Verify NLTK can find its data
try:
    from nltk.corpus import stopwords
    words = stopwords.words('english')
    print(f"Successfully loaded {len(words)} stopwords")
except LookupError as e:
    print(f"NLTK data not found: {e}")
```

## Integration Benefits

### Standardized Data Management

- **Consistent Locations**: NLTK data is stored in PyStow-managed directories
- **Cross-Platform**: Works consistently across different operating systems
- **Version Control**: PyStow's versioning system can be applied to NLTK data

### Simplified Deployment

- **Reproducible Environments**: NLTK data management is consistent across deployments
- **Containerization**: Easy to package NLTK data with applications
- **CI/CD Integration**: Reliable NLTK data setup in automated pipelines

### Configuration Integration

- **Environment Variables**: Use PyStow's configuration system for NLTK settings
- **Application Settings**: Integrate NLTK data management with app configuration

```python
import pystow

# Configure NLTK data location via PyStow config
nltk_data_path = pystow.get_config(
    "nltk", "data_path",
    default=None
)

if nltk_data_path:
    import nltk
    nltk.data.path.insert(0, nltk_data_path)

# Download with configuration
resource = pystow.get_config("nltk", "default_resource", default="stopwords")
path, success = pystow.ensure_nltk(resource)
```