CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pystow

Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.

Pending
Overview
Eval results
Files

nltk-integration.mddocs/

NLTK Integration

PyStow provides seamless integration with NLTK (Natural Language Toolkit) for managing linguistic data resources. This integration ensures that NLTK data is downloaded and stored in standardized locations that PyStow can manage.

NLTK Data Management

NLTK Resource Download

# NOTE(review): API-reference stub — signature and docstring only; the real
# implementation ships in the pystow package. `Path` is presumably
# pathlib.Path (not imported in this snippet) — confirm against the package.
def ensure_nltk(resource: str = "stopwords") -> tuple[Path, bool]:
    """Ensure NLTK data is downloaded in a standard way.

    Args:
        resource: Name of the resource to download, e.g., stopwords

    Returns:
        A pair of the NLTK cache directory and a boolean that says if download was successful

    Note:
        This function also appends the standard PyStow location for NLTK data to the
        nltk.data.path list so any downstream users of NLTK will know how to find it
        automatically.
    """

Usage Examples

Basic NLTK Data Download

import pystow
import nltk

# Fetch the stopwords corpus into the PyStow-managed NLTK directory.
nltk_path, success = pystow.ensure_nltk("stopwords")

if success:
    print(f"NLTK data stored at: {nltk_path}")

    # ensure_nltk registered the path with NLTK, so the corpus loads directly.
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words('english'))
    print(f"Loaded {len(stop_words)} English stopwords")

Downloading Multiple NLTK Resources

import pystow
import nltk

# Every corpus/model the examples below rely on.
nltk_resources = [
    "stopwords",
    "punkt",
    "wordnet",
    "averaged_perceptron_tagger",
    "vader_lexicon",
]

# Record the outcome of each download, keyed by resource name.
downloaded_resources = {}
for name in nltk_resources:
    location, ok = pystow.ensure_nltk(name)
    downloaded_resources[name] = {"path": location, "success": ok}
    if ok:
        print(f"✓ Downloaded {name}")
    else:
        print(f"✗ Failed to download {name}")

# Exercise the tokenizers once "punkt" is confirmed available.
if downloaded_resources["punkt"]["success"]:
    from nltk.tokenize import sent_tokenize, word_tokenize

    text = "Hello world. This is a test sentence."
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    print(f"Sentences: {sentences}")
    print(f"Words: {words}")

Text Processing Pipeline with NLTK

import pystow
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def setup_nltk_resources():
    """Download every NLTK resource the pipeline needs, failing fast."""
    # Each of these is required by a later preprocessing step.
    for resource in ("stopwords", "punkt", "wordnet", "omw-1.4"):
        _, ok = pystow.ensure_nltk(resource)
        if not ok:
            # Abort immediately: the pipeline cannot run with a missing corpus.
            raise RuntimeError(f"Failed to download NLTK resource: {resource}")

    print("All NLTK resources downloaded successfully")

def preprocess_text(text):
    """Lowercase, tokenize, strip English stopwords, and lemmatize *text*."""
    # Make sure the corpora used below are on disk before touching them.
    setup_nltk_resources()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Tokenize the lowercased input, drop stopwords, then lemmatize the rest.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    ]

# Run the preprocessing pipeline on a sample sentence.
sample = "The quick brown foxes are jumping over the lazy dogs."
processed_tokens = preprocess_text(sample)
print(f"Processed tokens: {processed_tokens}")

NLTK Data Management for Applications

import pystow
import nltk

class NLTKManager:
    """Track and download the NLTK resources an application depends on."""

    def __init__(self, app_name="nlp_app"):
        # Label for the application that owns these resources.
        self.app_name = app_name
        # Resource names registered via add_resource().
        self.required_resources = []

    def add_resource(self, resource_name):
        """Register *resource_name* as required by this application."""
        self.required_resources.append(resource_name)

    def setup_resources(self):
        """Download every registered resource; return per-resource results."""
        results = {}

        for name in self.required_resources:
            print(f"Downloading NLTK resource: {name}")
            location, ok = pystow.ensure_nltk(name)
            results[name] = {
                "path": location,
                "success": ok,
            }

            if ok:
                print(f"✓ {name} downloaded to {location}")
            else:
                print(f"✗ Failed to download {name}")

        return results

    def verify_resources(self):
        """Return True when every registered resource is locatable by NLTK."""
        missing = []

        for name in self.required_resources:
            try:
                nltk.data.find(f"{name}")
            except LookupError:
                missing.append(name)

        if missing:
            print(f"Missing NLTK resources: {missing}")
            return False

        print("All NLTK resources are available")
        return True

# Usage: register the resources a sentiment analyzer needs.
nltk_manager = NLTKManager("sentiment_analyzer")
for needed in ("vader_lexicon", "punkt", "stopwords"):
    nltk_manager.add_resource(needed)

# Download everything that was registered.
download_results = nltk_manager.setup_resources()

# Only proceed once NLTK can locate every resource.
if nltk_manager.verify_resources():
    from nltk.sentiment import SentimentIntensityAnalyzer

    analyzer = SentimentIntensityAnalyzer()
    text = "PyStow makes managing NLTK data so much easier!"

    scores = analyzer.polarity_scores(text)
    print(f"Sentiment scores: {scores}")

Error Handling and Fallbacks

import pystow
import nltk

def safe_nltk_download(resource, max_retries=3):
    """Download *resource* via PyStow, retrying up to *max_retries* times.

    Returns (path, True) on success, or (None, False) once retries run out.
    """
    # 1-based counter so the messages read naturally ("attempt 1", ...).
    for attempt_number in range(1, max_retries + 1):
        try:
            location, ok = pystow.ensure_nltk(resource)
        except Exception as err:  # broad on purpose: any failure triggers retry
            print(f"Error downloading {resource} on attempt {attempt_number}: {err}")
        else:
            if ok:
                print(f"Successfully downloaded {resource} on attempt {attempt_number}")
                return location, True
            print(f"Download failed for {resource} on attempt {attempt_number}")

        # Announce the retry for every attempt except the last.
        if attempt_number < max_retries:
            print(f"Retrying download for {resource}...")

    print(f"Failed to download {resource} after {max_retries} attempts")
    return None, False

def setup_nltk_with_fallback():
    """Download preferred NLTK resources, falling back to a minimal set.

    Returns a (downloaded, failed) pair of resource-name lists.
    """
    preferred_resources = ["stopwords", "punkt", "wordnet"]
    fallback_resources = ["stopwords"]  # Minimal set

    downloaded = []
    failed = []

    for name in preferred_resources:
        _, ok = safe_nltk_download(name)
        (downloaded if ok else failed).append(name)

    # Nothing came down at all — retry just the minimal fallback set.
    if not downloaded:
        print("No resources downloaded, trying fallback...")
        for name in fallback_resources:
            _, ok = safe_nltk_download(name)
            if ok:
                downloaded.append(name)

    return downloaded, failed

# Run the resilient setup and report the outcome.
downloaded, failed = setup_nltk_with_fallback()
print(f"Downloaded: {downloaded}")
print(f"Failed: {failed}")

# Only touch the stopwords corpus when its download actually succeeded.
if "stopwords" in downloaded:
    from nltk.corpus import stopwords

    stop_words = stopwords.words('english')
    print(f"Using {len(stop_words)} stopwords")

Custom NLTK Data Locations

import pystow
import nltk
import os

def setup_custom_nltk_location():
    """Download NLTK data into PyStow's directory and register an extra path.

    Fix: ``nltk.data.path`` holds plain strings, but the original guard
    compared a ``Path`` object against it (``custom_path not in
    nltk.data.path``), which never matched — so the custom path was
    re-appended as a duplicate on every call. The guard now compares
    ``str(custom_path)``.
    """
    # Download NLTK data to the PyStow-managed location; per the documented
    # contract of ensure_nltk, this also registers that location with NLTK.
    nltk_path, success = pystow.ensure_nltk("stopwords")

    if success:
        print(f"NLTK data path: {nltk_path}")
        print(f"NLTK search paths: {nltk.data.path}")

        # Manually register an additional PyStow-managed directory.
        custom_module = pystow.module("custom_nltk")
        custom_path = custom_module.join("data")

        # Compare as a string: nltk.data.path entries are str, so a Path
        # object would never be found in the list.
        if str(custom_path) not in nltk.data.path:
            nltk.data.path.append(str(custom_path))
            print(f"Added custom NLTK path: {custom_path}")

# Register the custom locations before touching any corpora.
setup_custom_nltk_location()

# Confirm NLTK can resolve its data from the registered paths.
try:
    from nltk.corpus import stopwords
    words = stopwords.words('english')
except LookupError as e:
    print(f"NLTK data not found: {e}")
else:
    print(f"Successfully loaded {len(words)} stopwords")

Integration Benefits

Standardized Data Management

  • Consistent Locations: NLTK data is stored in PyStow-managed directories
  • Cross-Platform: Works consistently across different operating systems
  • Version Control: PyStow's versioning system can be applied to NLTK data

Simplified Deployment

  • Reproducible Environments: NLTK data management is consistent across deployments
  • Containerization: Easy to package NLTK data with applications
  • CI/CD Integration: Reliable NLTK data setup in automated pipelines

Configuration Integration

  • Environment Variables: Use PyStow's configuration system for NLTK settings
  • Application Settings: Integrate NLTK data management with app configuration

For example:
import pystow

# Resolve an optional NLTK data directory from PyStow's config system.
nltk_data_path = pystow.get_config("nltk", "data_path", default=None)

if nltk_data_path:
    import nltk

    # Put the configured directory first so it wins over NLTK's defaults.
    nltk.data.path.insert(0, nltk_data_path)

# The resource to fetch is configurable too, defaulting to stopwords.
resource = pystow.get_config("nltk", "default_resource", default="stopwords")
path, success = pystow.ensure_nltk(resource)

Install with Tessl CLI

npx tessl i tessl/pypi-pystow

docs

archives.md

cloud-storage.md

configuration.md

data-formats.md

directory-management.md

file-operations.md

index.md

module-class.md

nltk-integration.md

web-scraping.md

tile.json