Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.
---
PyStow provides seamless integration with NLTK (Natural Language Toolkit) for managing linguistic data resources. This integration ensures that NLTK data is downloaded and stored in standardized locations that PyStow can manage.
def ensure_nltk(resource: str = "stopwords") -> tuple[Path, bool]:
    """Ensure NLTK data is downloaded in a standard way.

    Args:
        resource: Name of the resource to download, e.g., stopwords

    Returns:
        A pair of the NLTK cache directory and a boolean that says if
        the download was successful.

    Note:
        This function also appends the standard PyStow location for NLTK
        data to the ``nltk.data.path`` list so any downstream users of
        NLTK will know how to find it automatically.
    """


import pystow
import nltk

# Download NLTK stopwords data into the PyStow-managed cache directory.
nltk_path, success = pystow.ensure_nltk("stopwords")

if success:
    print(f"NLTK data stored at: {nltk_path}")

    # Use NLTK with the downloaded data
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words('english'))
    print(f"Loaded {len(stop_words)} English stopwords")

import pystow
import nltk

# Download various NLTK resources
nltk_resources = [
    "stopwords",
    "punkt",
    "wordnet",
    "averaged_perceptron_tagger",
    "vader_lexicon",
]

downloaded_resources = {}
for name in nltk_resources:
    resource_path, ok = pystow.ensure_nltk(name)
    downloaded_resources[name] = {"path": resource_path, "success": ok}
    if ok:
        print(f"✓ Downloaded {name}")
    else:
        print(f"✗ Failed to download {name}")

# Use the downloaded resources
if downloaded_resources["punkt"]["success"]:
    from nltk.tokenize import sent_tokenize, word_tokenize

    text = "Hello world. This is a test sentence."
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    print(f"Sentences: {sentences}")
    print(f"Words: {words}")

import pystow
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
def setup_nltk_resources():
    """Download every NLTK resource this pipeline depends on.

    Raises:
        RuntimeError: If any resource fails to download.
    """
    for name in ("stopwords", "punkt", "wordnet", "omw-1.4"):
        _path, ok = pystow.ensure_nltk(name)
        if not ok:
            raise RuntimeError(f"Failed to download NLTK resource: {name}")
    print("All NLTK resources downloaded successfully")
def preprocess_text(text):
    """Preprocess text using NLTK: tokenize, drop stopwords, lemmatize.

    Args:
        text: Raw input string.

    Returns:
        A list of lowercased, stopword-filtered, lemmatized tokens.
    """
    # Ensure NLTK resources are available.
    # NOTE(review): this re-checks/downloads resources on every call;
    # consider hoisting to application startup if called frequently.
    setup_nltk_resources()

    # Tokenize on the lowercased input.
    tokens = word_tokenize(text.lower())

    # Drop English stopwords.
    stop_words = set(stopwords.words('english'))
    kept = [token for token in tokens if token not in stop_words]

    # Lemmatize what remains.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in kept]
# Run the preprocessing pipeline on a sample sentence.
sample = "The quick brown foxes are jumping over the lazy dogs."
processed_tokens = preprocess_text(sample)
print(f"Processed tokens: {processed_tokens}")

import pystow
import nltk
class NLTKManager:
    """Manage NLTK data downloads for an application.

    Resources are registered by name, downloaded into the PyStow-managed
    location via :func:`pystow.ensure_nltk`, and verified through
    ``nltk.data.find``.
    """

    # Map common resource names to the category-qualified paths that
    # nltk.data.find() expects (e.g. "corpora/stopwords"); a bare name
    # raises LookupError even when the resource is installed.
    # NOTE(review): extend this map for additional resources as needed.
    _RESOURCE_CATEGORIES = {
        "stopwords": "corpora/stopwords",
        "wordnet": "corpora/wordnet",
        "omw-1.4": "corpora/omw-1.4",
        "punkt": "tokenizers/punkt",
        "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
        "vader_lexicon": "sentiment/vader_lexicon",
    }

    def __init__(self, app_name="nlp_app"):
        # Name of the consuming application (informational only).
        self.app_name = app_name
        # Resource names registered via add_resource(), in order.
        self.required_resources = []

    def add_resource(self, resource_name):
        """Add a required NLTK resource by name."""
        self.required_resources.append(resource_name)

    def setup_resources(self):
        """Download all required NLTK resources.

        Returns:
            Mapping from resource name to a dict with ``path`` and
            ``success`` keys describing the download outcome.
        """
        results = {}
        for resource in self.required_resources:
            print(f"Downloading NLTK resource: {resource}")
            path, success = pystow.ensure_nltk(resource)
            results[resource] = {
                "path": path,
                "success": success
            }
            if success:
                print(f"✓ {resource} downloaded to {path}")
            else:
                print(f"✗ Failed to download {resource}")
        return results

    def verify_resources(self):
        """Verify that all required resources are available.

        Returns:
            True if every registered resource can be located by NLTK,
            False otherwise.
        """
        missing = []
        for resource in self.required_resources:
            # BUG FIX: nltk.data.find() requires a category prefix such as
            # "corpora/stopwords"; passing the bare resource name always
            # raised LookupError, so verification reported everything missing.
            query = self._RESOURCE_CATEGORIES.get(resource, resource)
            try:
                nltk.data.find(query)
            except LookupError:
                missing.append(resource)
        if missing:
            print(f"Missing NLTK resources: {missing}")
            return False
        print("All NLTK resources are available")
        return True
# Usage
nltk_manager = NLTKManager("sentiment_analyzer")
for resource_name in ("vader_lexicon", "punkt", "stopwords"):
    nltk_manager.add_resource(resource_name)

# Setup resources
download_results = nltk_manager.setup_resources()

# Verify setup
if nltk_manager.verify_resources():
    # Proceed with NLTK operations
    from nltk.sentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    text = "PyStow makes managing NLTK data so much easier!"
    scores = analyzer.polarity_scores(text)
    print(f"Sentiment scores: {scores}")

import pystow
import nltk
def safe_nltk_download(resource, max_retries=3):
    """Safely download an NLTK resource, retrying on failure.

    Args:
        resource: Name of the NLTK resource to download.
        max_retries: Maximum number of download attempts.

    Returns:
        A ``(path, True)`` pair on success, or ``(None, False)`` once
        every attempt has failed.
    """
    for attempt in range(max_retries):
        try:
            path, success = pystow.ensure_nltk(resource)
        except Exception as e:
            print(f"Error downloading {resource} on attempt {attempt + 1}: {e}")
        else:
            if success:
                print(f"Successfully downloaded {resource} on attempt {attempt + 1}")
                return path, True
            print(f"Download failed for {resource} on attempt {attempt + 1}")
        if attempt < max_retries - 1:
            print(f"Retrying download for {resource}...")
    print(f"Failed to download {resource} after {max_retries} attempts")
    return None, False
def setup_nltk_with_fallback():
    """Setup NLTK resources, falling back to a minimal set on failure.

    Returns:
        A ``(downloaded, failed)`` pair of resource-name lists.
    """
    # Preferred resources first; the fallback is the minimal set.
    preferred_resources = ["stopwords", "punkt", "wordnet"]
    fallback_resources = ["stopwords"]  # Minimal set

    downloaded, failed = [], []
    for name in preferred_resources:
        _path, ok = safe_nltk_download(name)
        (downloaded if ok else failed).append(name)

    # If nothing at all came down, retry with the fallback set.
    if not downloaded:
        print("No resources downloaded, trying fallback...")
        for name in fallback_resources:
            _path, ok = safe_nltk_download(name)
            if ok:
                downloaded.append(name)

    return downloaded, failed
# Use fallback setup
downloaded, failed = setup_nltk_with_fallback()
print(f"Downloaded: {downloaded}")
print(f"Failed: {failed}")

# Proceed with available resources
if "stopwords" in downloaded:
    from nltk.corpus import stopwords

    stop_words = stopwords.words('english')
    print(f"Using {len(stop_words)} stopwords")

import pystow
import nltk
import os
def setup_custom_nltk_location():
    """Setup NLTK with a custom PyStow-managed location.

    Downloads the stopwords resource to the PyStow location, then
    registers an additional PyStow module directory on ``nltk.data.path``.
    """
    # Download NLTK data to PyStow managed location
    nltk_path, success = pystow.ensure_nltk("stopwords")
    if success:
        # The NLTK path is automatically added to nltk.data.path
        print(f"NLTK data path: {nltk_path}")
        print(f"NLTK search paths: {nltk.data.path}")

    # You can also manually configure additional paths
    custom_module = pystow.module("custom_nltk")
    custom_path = custom_module.join("data")
    # BUG FIX: nltk.data.path holds strings while join() returns a Path
    # object, so the original `custom_path not in nltk.data.path` check
    # never matched and the path was appended repeatedly. Compare the
    # string form instead.
    if str(custom_path) not in nltk.data.path:
        nltk.data.path.append(str(custom_path))
        print(f"Added custom NLTK path: {custom_path}")
# Register the custom locations
setup_custom_nltk_location()

# Verify NLTK can find its data
try:
    from nltk.corpus import stopwords

    words = stopwords.words('english')
    print(f"Successfully loaded {len(words)} stopwords")
except LookupError as e:
    print(f"NLTK data not found: {e}")

import pystow
# Configure NLTK data location via PyStow config
nltk_data_path = pystow.get_config(
"nltk", "data_path",
default=None
)
if nltk_data_path:
import nltk
nltk.data.path.insert(0, nltk_data_path)
# Download with configuration
resource = pystow.get_config("nltk", "default_resource", default="stopwords")
path, success = pystow.ensure_nltk(resource)Install with Tessl CLI
npx tessl i tessl/pypi-pystow