# Pipeline Components

Built-in pipeline components that perform linguistic analysis on documents. These components can be combined in customizable processing pipelines to add part-of-speech tags, dependency parsing, named entity recognition, text classification, and more.

## Capabilities

### Part-of-Speech Tagging

Statistical models that assign part-of-speech tags and morphological features to tokens based on context and linguistic patterns.

```python { .api }
class Tagger:
    """Part-of-speech tagger pipeline component."""

    name: str = "tagger"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the tagger."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the tagger to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict part-of-speech tags for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set part-of-speech annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""

    def add_label(self, label: str) -> int:
        """Add a label to the component."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the component to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Tagger':
        """Load the component from disk."""
```

### Dependency Parsing

Statistical parser that predicts syntactic dependencies between tokens, creating a dependency tree structure.

```python { .api }
class DependencyParser:
    """Dependency parser pipeline component."""

    name: str = "parser"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the parser."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the parser to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict dependency relations for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set dependency annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a dependency label."""

    # Serialization methods similar to Tagger
```

### Named Entity Recognition

Statistical model that identifies and classifies named entities (people, organizations, locations, etc.) in text.

```python { .api }
class EntityRecognizer:
    """Named entity recognition pipeline component."""

    name: str = "ner"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the NER component."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply NER to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict named entities for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set named entity annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity label."""

    # Serialization methods similar to Tagger
```

### Text Classification

Multi-label text classifier that assigns category scores to documents based on their content.

```python { .api }
class TextCategorizer:
    """Text classification pipeline component."""

    name: str = "textcat"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the text categorizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply text categorization to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict category scores for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set category annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a category label."""

    @property
    def labels(self) -> tuple:
        """Get category labels."""
```

### Entity Linking

Component that links named entities to entries in a knowledge base using entity embeddings and candidate ranking.

```python { .api }
class EntityLinker:
    """Entity linking pipeline component."""

    name: str = "entity_linker"

    def __init__(self, vocab: Vocab, **cfg) -> None:
        """Initialize the entity linker."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity linking to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict entity links for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set entity linking annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity type label."""

    def get_candidates(self, mention: Span) -> List:
        """Get knowledge base candidates for a mention."""
```

### Morphological Analysis

Component that analyzes word morphology and assigns detailed morphological features to tokens.

```python { .api }
class Morphologizer:
    """Morphological analysis pipeline component."""

    name: str = "morphologizer"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the morphologizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply morphological analysis to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict morphological features for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set morphological annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""
```

### Rule-Based Components

#### Entity Ruler

Rule-based component for pattern-based named entity recognition using token patterns or phrase matching.

```python { .api }
class EntityRuler:
    """Rule-based named entity recognition component."""

    name: str = "entity_ruler"

    def __init__(self, nlp: Language, patterns: List[dict] = None,
                 overwrite_ents: bool = False, **cfg) -> None:
        """Initialize the entity ruler."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity rules to a Doc object."""

    def add_patterns(self, patterns: List[dict]) -> None:
        """Add patterns to the entity ruler."""

    @property
    def patterns(self) -> List[dict]:
        """Get all patterns."""

    @property
    def labels(self) -> set:
        """Get entity labels."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save patterns to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'EntityRuler':
        """Load patterns from disk."""
```

#### Sentence Boundary Detection

Fast, rule-based sentence boundary detection for most languages.

```python { .api }
class Sentencizer:
    """Rule-based sentence boundary detection component."""

    name: str = "sentencizer"

    def __init__(self, punct_chars: Set[str] = None, **cfg) -> None:
        """Initialize the sentencizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply sentence boundary detection to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""
```

### Pipeline Management Functions

Functions for merging tokens based on linguistic analysis.

```python { .api }
def merge_entities(doc: Doc) -> Doc:
    """
    Merge named entity tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """

def merge_noun_chunks(doc: Doc) -> Doc:
    """
    Merge noun chunk tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """

def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
    """
    Merge subtokens into single tokens.

    Args:
        doc: The Doc object to modify
        label: Label for merged tokens

    Returns:
        The modified Doc object
    """
```

### Base Pipeline Component

Abstract base class for creating custom pipeline components.

```python { .api }
class Pipe:
    """Base class for pipeline components."""

    name: str

    def __call__(self, doc: Doc) -> Doc:
        """Apply the component to a Doc object."""
        raise NotImplementedError

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""
        for docs in util.minibatch(stream, size=batch_size):
            for doc in docs:
                yield self(doc)

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the component with training examples."""
        pass

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""
        pass
```

## Usage Examples

### Using Built-in Components

```python
import spacy

# Load model with multiple components
nlp = spacy.load("en_core_web_sm")
print("Pipeline components:", nlp.pipe_names)
# Output: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

# Process text through all components
doc = nlp("Apple Inc. is looking at buying U.K. startup for $1 billion")

# Access tagger results
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# Access parser results
for token in doc:
    print(f"{token.text} -> {token.head.text} ({token.dep_})")

# Access NER results
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")
```

### Pipeline Management

```python
import spacy

# Create blank language model
nlp = spacy.blank("en")

# Add components to pipeline
nlp.add_pipe("tagger")
nlp.add_pipe("parser")
nlp.add_pipe("ner")

# Add rule-based entity ruler before the statistical NER component
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([
    {"label": "COMPANY", "pattern": "Apple Inc."},
    {"label": "COMPANY", "pattern": "Microsoft Corp."}
])

# Process text
doc = nlp("Apple Inc. and Microsoft Corp. are tech companies")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")
```

### Disabling Components

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Disable specific components for faster processing
with nlp.disable_pipes("parser", "ner"):
    doc = nlp("This will only run tokenizer and tagger")

# Process multiple documents with disabled components
texts = ["Text one", "Text two", "Text three"]
with nlp.disable_pipes("parser"):
    docs = list(nlp.pipe(texts))
```

### Custom Pipeline Components

```python
import spacy
from spacy.language import Language
from spacy.pipeline import Pipe
from spacy.tokens import Doc, Token

# Register the custom extension attribute the component writes to
Token.set_extension("is_email", default=False)

class CustomComponent(Pipe):
    """Custom pipeline component example."""

    name = "custom_component"

    def __call__(self, doc):
        # Add custom processing logic
        for token in doc:
            if token.like_email:
                token._.is_email = True
        return doc

# Register and add to pipeline
@Language.factory("custom_component")
def create_custom_component(nlp, name):
    return CustomComponent()

nlp = spacy.blank("en")
nlp.add_pipe("custom_component")
```

### Text Classification

```python
import spacy

# Load a base model, then add a text classifier to its pipeline
nlp = spacy.load("en_core_web_sm")

# Add text categorizer
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# After training...
doc = nlp("This movie is great!")
print("Categories:", doc.cats)
# Output: {'POSITIVE': 0.9, 'NEGATIVE': 0.1}
```