Industrial-strength Natural Language Processing (NLP) in Python
Access to 70+ language-specific models and processing pipelines, each tuned to a language's linguistic characteristics and writing system. spaCy provides pre-trained pipelines for download as well as blank language classes for custom training.

The following functions load pre-trained models and create blank Language objects for custom training.
def load(name: str, vocab: Vocab = None, disable: List[str] = None,
         exclude: List[str] = None, config: dict = None) -> Language:
    """
    Load a spaCy model by name or path.

    Args:
        name: Model name (e.g., 'en_core_web_sm') or path
        vocab: Optional vocabulary to use
        disable: Pipeline components to disable (loaded, but not run by default)
        exclude: Pipeline components to exclude entirely (not loaded)
        config: Config overrides

    Returns:
        Language object with loaded model
    """
def blank(name: str, vocab: Vocab = None, config: dict = None) -> Language:
    """
    Create a blank Language object for a given language.

    Args:
        name: Language code (e.g., 'en', 'de', 'zh')
        vocab: Optional vocabulary
        config: Optional config overrides

    Returns:
        Blank Language object without trained models
    """
def info(model: str = None, markdown: bool = False, silent: bool = False) -> None:
    """
    Display information about a model or the spaCy installation.

    Args:
        model: Model name to get info for
        markdown: Print in markdown format
        silent: Don't print to stdout
    """
class English(Language):
    """English language processing pipeline."""
    lang = "en"

class German(Language):
    """German language processing pipeline."""
    lang = "de"

class French(Language):
    """French language processing pipeline."""
    lang = "fr"

class Spanish(Language):
    """Spanish language processing pipeline."""
    lang = "es"

class Italian(Language):
    """Italian language processing pipeline."""
    lang = "it"

class Portuguese(Language):
    """Portuguese language processing pipeline."""
    lang = "pt"

class Russian(Language):
    """Russian language processing pipeline."""
    lang = "ru"

class Chinese(Language):
    """Chinese language processing pipeline with specialized tokenizer."""
    lang = "zh"

class Japanese(Language):
    """Japanese language processing pipeline with specialized tokenizer."""
    lang = "ja"

class Korean(Language):
    """Korean language processing pipeline."""
    lang = "ko"

class Arabic(Language):
    """Arabic language processing pipeline."""
    lang = "ar"

class Hindi(Language):
    """Hindi language processing pipeline."""
    lang = "hi"
Each language class has an associated Defaults class containing language-specific configuration.
class LanguageDefaults:
    """Language-specific configuration and defaults."""

    # Tokenizer configuration
    tokenizer_exceptions: dict
    prefixes: List[str]
    suffixes: List[str]
    infixes: List[str]
    token_match: Pattern
    url_match: Pattern

    # Stop words
    stop_words: Set[str]

    # Writing system info
    writing_system: dict

    # Lemmatizer and lookup tables
    lemma_rules: dict
    lemma_index: dict
    lemma_exc: dict

    # Morph rules
    morph_rules: dict

    # Tag map
    tag_map: dict

    # Syntax iterators (noun chunks, etc.)
    syntax_iterators: dict
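A short sketch of reading these defaults from concrete language classes (attribute availability can vary across spaCy versions, so treat this as illustrative):

from spacy.lang.en import English
from spacy.lang.ar import Arabic

# Stop words and writing-system metadata differ per language
print(len(English.Defaults.stop_words))
print(English.Defaults.writing_system)  # {'direction': 'ltr', 'has_case': True, 'has_letters': True}
print(Arabic.Defaults.writing_system)   # right-to-left: {'direction': 'rtl', ...}

# Tokenizer exceptions cover language-specific contractions
print("don't" in English.Defaults.tokenizer_exceptions)

spaCy provides pre-trained models in different sizes for many languages: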
# English models
"en_core_web_sm" # Small English model
"en_core_web_md" # Medium English model with vectors
"en_core_web_lg" # Large English model with large vectors
"en_core_web_trf" # Transformer-based English model
# German models
"de_core_news_sm" # Small German model
"de_core_news_md" # Medium German model
"de_core_news_lg" # Large German model
# French models
"fr_core_news_sm" # Small French model
"fr_core_news_md" # Medium French model
"fr_core_news_lg" # Large French model
# Spanish models
"es_core_news_sm" # Small Spanish model
"es_core_news_md" # Medium Spanish model
"es_core_news_lg" # Large Spanish model
# Chinese models
"zh_core_web_sm" # Small Chinese model
"zh_core_web_md" # Medium Chinese model
"zh_core_web_lg" # Large Chinese model
# And models for: pt, it, nl, ru, ja, ko, ca, da, el, lt, mk, nb, pl, ro, xx

import spacy
# Load pre-trained models
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")
nlp_fr = spacy.load("fr_core_news_sm")
# Load with specific configuration
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# Load with config overrides
config = {"nlp": {"batch_size": 1000}}
nlp = spacy.load("en_core_web_sm", config=config)
# Process text with different models
text = "Hello world"
doc_en = nlp_en(text)
doc_de = nlp_de("Hallo Welt")
doc_fr = nlp_fr("Bonjour le monde")

import spacy
# Create blank models for custom training
nlp_en = spacy.blank("en")
nlp_de = spacy.blank("de")
nlp_zh = spacy.blank("zh")
# Add components to blank model
nlp_en.add_pipe("tagger")
nlp_en.add_pipe("parser")
nlp_en.add_pipe("ner")
# Create with custom vocabulary
from spacy.vocab import Vocab
custom_vocab = Vocab()
nlp = spacy.blank("en", vocab=custom_vocab)
print(f"Language: {nlp.lang}")
print(f"Pipeline: {nlp.pipe_names}")import spacy
# Load multiple language models
models = {
    "en": spacy.load("en_core_web_sm"),
    "de": spacy.load("de_core_news_sm"),
    "fr": spacy.load("fr_core_news_sm"),
    "es": spacy.load("es_core_news_sm"),
}

# Process texts in different languages
texts = {
    "en": "Apple Inc. is an American technology company.",
    "de": "Apple Inc. ist ein amerikanisches Technologieunternehmen.",
    "fr": "Apple Inc. est une entreprise technologique américaine.",
    "es": "Apple Inc. es una empresa tecnológica estadounidense.",
}

for lang, text in texts.items():
    doc = models[lang](text)
    print(f"{lang.upper()}: {doc.text}")
    for ent in doc.ents:
        print(f" {ent.text} -> {ent.label_}")

import spacy
# Detect and process based on language
def process_multilingual(text, detected_lang="en"):
    """Process text with the appropriate language model."""
    language_models = {
        "en": "en_core_web_sm",
        "de": "de_core_news_sm",
        "fr": "fr_core_news_sm",
        "es": "es_core_news_sm",
    }
    if detected_lang in language_models:
        nlp = spacy.load(language_models[detected_lang])
        return nlp(text)
    else:
        # Fallback to English
        nlp = spacy.load("en_core_web_sm")
        return nlp(text)

# Process texts
english_doc = process_multilingual("Hello world", "en")
german_doc = process_multilingual("Hallo Welt", "de")
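In practice, detected_lang would come from a language-identification step. A hedged sketch using the third-party langdetect package (not part of spaCy; detect() is unreliable on very short strings):

from langdetect import detect  # pip install langdetect

text = "Apple Inc. ist ein amerikanisches Technologieunternehmen."
doc = process_multilingual(text, detected_lang=detect(text))
print(doc.lang_)  # "de" if langdetect identified the text as German

import spacy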
# Load models with different capabilities
nlp_en = spacy.load("en_core_web_sm")
nlp_zh = spacy.load("zh_core_web_sm") # Chinese with specialized tokenizer
nlp_ja = spacy.load("ja_core_news_sm") # Japanese with specialized tokenizer
# English processing
doc_en = nlp_en("Apple Inc. is buying a startup for $1 billion.")
print("English tokens:")
for token in doc_en:
    print(f" {token.text} ({token.pos_})")

# Chinese processing (no spaces between words)
doc_zh = nlp_zh("苹果公司正在收购一家初创公司")
print("\nChinese tokens:")
for token in doc_zh:
    print(f" {token.text} ({token.pos_})")

# Japanese processing (mixed scripts)
doc_ja = nlp_ja("アップル社はスタートアップを買収している")
print("\nJapanese tokens:")
for token in doc_ja:
    print(f" {token.text} ({token.pos_})")
from spacy.lang.en import English

# Extend an existing language class. The subclass must set `lang`
# to the custom code, and registering it with spacy.registry.languages
# lets spacy.blank() look it up by that code.
@spacy.registry.languages("custom_en")
class CustomEnglish(English):
    """Custom English class with additional features."""
    lang = "custom_en"

    def __init__(self, vocab=True, **kwargs):
        super().__init__(vocab, **kwargs)
        # Add custom initialization here

# Use custom language
nlp = spacy.blank("custom_en")

import spacy
# Load model and inspect metadata
nlp = spacy.load("en_core_web_sm")
# Model metadata
print("Model info:")
print(f" Language: {nlp.lang}")
print(f" Name: {nlp.meta['name']}")
print(f" Version: {nlp.meta['version']}")
print(f" Description: {nlp.meta['description']}")
print(f" Pipeline: {nlp.pipe_names}")
# Vocabulary info
print(f"\nVocabulary size: {len(nlp.vocab)}")
print(f"Vectors: {nlp.vocab.vectors.size}")
# Component info
for name, component in nlp.pipeline:
    print(f"Component '{name}': {type(component)}")

# Display full model info
spacy.info("en_core_web_sm")

import spacy
# Load model with specific components for performance
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) # Faster tokenization + tagging only
# Use smaller model for memory constraints
nlp_small = spacy.load("en_core_web_sm") # ~15MB
nlp_large = spacy.load("en_core_web_lg") # ~750MB
# Process with disabled components temporarily
nlp = spacy.load("en_core_web_sm")
texts = ["Text 1", "Text 2", "Text 3"]
with nlp.select_pipes(disable=["parser", "ner"]):
    # Faster processing without parsing and NER
    docs = list(nlp.pipe(texts))

# Batch processing for efficiency
docs = list(nlp.pipe(texts, batch_size=100))
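For large corpora on multi-core machines, nlp.pipe can also parallelize across processes; a sketch (n_process spawns worker processes, each with its own copy of the pipeline, so it only pays off for big batches):

import spacy

nlp = spacy.load("en_core_web_sm")
corpus = [f"Document number {i}" for i in range(10000)]  # hypothetical corpus

# Multiprocessing batch run
docs = list(nlp.pipe(corpus, batch_size=1000, n_process=2))

Install with Tessl CLI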
npx tessl i tessl/pypi-spacy