# Pipeline Components

Built-in pipeline components that perform linguistic analysis on documents. These components can be combined in customizable processing pipelines to add part-of-speech tags, dependency parsing, named entity recognition, text classification, and more.

## Capabilities

### Part-of-Speech Tagging

Statistical models that assign part-of-speech tags and morphological features to tokens based on context and linguistic patterns.

```python { .api }
class Tagger:
    """Part-of-speech tagger pipeline component."""

    name: str = "tagger"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the tagger."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the tagger to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict part-of-speech tags for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set part-of-speech annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""

    def add_label(self, label: str) -> int:
        """Add a label to the component."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the component to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Tagger':
        """Load the component from disk."""
```

### Dependency Parsing

Statistical parser that predicts syntactic dependencies between tokens, creating a dependency tree structure.

```python { .api }
class DependencyParser:
    """Dependency parser pipeline component."""

    name: str = "parser"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the parser."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the parser to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict dependency relations for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set dependency annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a dependency label."""

    # Serialization methods similar to Tagger
```

### Named Entity Recognition

Statistical model that identifies and classifies named entities (people, organizations, locations, etc.) in text.

```python { .api }
class EntityRecognizer:
    """Named entity recognition pipeline component."""

    name: str = "ner"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the NER component."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply NER to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict named entities for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set named entity annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity label."""

    # Serialization methods similar to Tagger
```

### Text Classification

Multi-label text classifier that assigns category scores to documents based on their content.

```python { .api }
class TextCategorizer:
    """Text classification pipeline component."""

    name: str = "textcat"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the text categorizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply text categorization to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict category scores for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set category annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a category label."""

    @property
    def labels(self) -> tuple:
        """Get category labels."""
```

### Entity Linking

Component that links named entities to entries in a knowledge base using entity embeddings and candidate ranking.

```python { .api }
class EntityLinker:
    """Entity linking pipeline component."""

    name: str = "entity_linker"

    def __init__(self, vocab: Vocab, **cfg) -> None:
        """Initialize the entity linker."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity linking to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict entity links for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set entity linking annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity type label."""

    def get_candidates(self, mention: Span) -> List:
        """Get knowledge base candidates for a mention."""
```

### Morphological Analysis

Component that analyzes word morphology and assigns detailed morphological features to tokens.

```python { .api }
class Morphologizer:
    """Morphological analysis pipeline component."""

    name: str = "morphologizer"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the morphologizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply morphological analysis to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict morphological features for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set morphological annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""
```

### Rule-Based Components

#### Entity Ruler

Rule-based component for pattern-based named entity recognition using token patterns or phrase matching.

```python { .api }
class EntityRuler:
    """Rule-based named entity recognition component."""

    name: str = "entity_ruler"

    def __init__(self, nlp: Language, patterns: List[dict] = None,
                 overwrite_ents: bool = False, **cfg) -> None:
        """Initialize the entity ruler."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity rules to a Doc object."""

    def add_patterns(self, patterns: List[dict]) -> None:
        """Add patterns to the entity ruler."""

    @property
    def patterns(self) -> List[dict]:
        """Get all patterns."""

    @property
    def labels(self) -> set:
        """Get entity labels."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save patterns to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'EntityRuler':
        """Load patterns from disk."""
```

#### Sentence Boundary Detection

Fast, rule-based sentence boundary detection for most languages.

```python { .api }
class Sentencizer:
    """Rule-based sentence boundary detection component."""

    name: str = "sentencizer"

    def __init__(self, punct_chars: Set[str] = None, **cfg) -> None:
        """Initialize the sentencizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply sentence boundary detection to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""
```

### Pipeline Management Functions

Functions for merging tokens based on linguistic analysis.

```python { .api }
def merge_entities(doc: Doc) -> Doc:
    """
    Merge named entity tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """

def merge_noun_chunks(doc: Doc) -> Doc:
    """
    Merge noun chunk tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """

def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
    """
    Merge subtokens into single tokens.

    Args:
        doc: The Doc object to modify
        label: Label for merged tokens

    Returns:
        The modified Doc object
    """
```

### Base Pipeline Component

Abstract base class for creating custom pipeline components.

```python { .api }
class Pipe:
    """Base class for pipeline components."""

    name: str

    def __call__(self, doc: Doc) -> Doc:
        """Apply the component to a Doc object."""
        raise NotImplementedError

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""
        for docs in util.minibatch(stream, size=batch_size):
            for doc in docs:
                yield self(doc)

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the component with training examples."""
        pass

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""
        pass
```

## Usage Examples

### Using Built-in Components

```python
import spacy

# Load model with multiple components
nlp = spacy.load("en_core_web_sm")
print("Pipeline components:", nlp.pipe_names)
# Output: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

# Process text through all components
doc = nlp("Apple Inc. is looking at buying U.K. startup for $1 billion")

# Access tagger results
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# Access parser results
for token in doc:
    print(f"{token.text} -> {token.head.text} ({token.dep_})")

# Access NER results
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")
```

### Pipeline Management

```python
import spacy

# Create blank language model
nlp = spacy.blank("en")

# Add components to pipeline
nlp.add_pipe("tagger")
nlp.add_pipe("parser")
nlp.add_pipe("ner")

# Add rule-based entity ruler before the statistical NER component
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([
    {"label": "COMPANY", "pattern": "Apple Inc."},
    {"label": "COMPANY", "pattern": "Microsoft Corp."}
])

# Process text
doc = nlp("Apple Inc. and Microsoft Corp. are tech companies")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")
```

### Disabling Components

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Disable specific components for faster processing
with nlp.disable_pipes("parser", "ner"):
    doc = nlp("This will only run tokenizer and tagger")

# Process multiple documents with disabled components
texts = ["Text one", "Text two", "Text three"]
with nlp.disable_pipes("parser"):
    docs = list(nlp.pipe(texts))
```

### Custom Pipeline Components

```python
import spacy
from spacy.language import Language
from spacy.pipeline import Pipe
from spacy.tokens import Doc, Token

# Register the custom extension attribute the component writes to
Token.set_extension("is_email", default=False)

class CustomComponent(Pipe):
    """Custom pipeline component example."""

    name = "custom_component"

    def __call__(self, doc):
        # Add custom processing logic
        for token in doc:
            if token.like_email:
                token._.is_email = True
        return doc

# Register and add to pipeline
@Language.factory("custom_component")
def create_custom_component(nlp, name):
    return CustomComponent()

nlp = spacy.blank("en")
nlp.add_pipe("custom_component")
```

### Text Classification

```python
import spacy

# Load a base model, then add a text classifier to its pipeline
nlp = spacy.load("en_core_web_sm")

# Add text categorizer
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# After training...
doc = nlp("This movie is great!")
print("Categories:", doc.cats)
# Output: {'POSITIVE': 0.9, 'NEGATIVE': 0.1}
```