# Core Processing Objects

The fundamental objects for text processing in spaCy. These classes form the foundation of all NLP operations and provide access to linguistic annotations, document structure, and vocabulary management.

## Capabilities

### Language Pipeline

The main entry point for NLP processing. The Language class manages the processing pipeline and provides methods for processing single texts or batches efficiently.

```python { .api }
class Language:
    """Main NLP pipeline class that processes text through pipeline components."""

    vocab: Vocab
    pipeline: List[tuple]
    pipe_names: List[str]
    meta: dict

    def __call__(self, text: str) -> Doc:
        """Process a single text and return a Doc object."""

    def pipe(self, texts: Iterable[str],
             batch_size: int = 1000,
             disable: List[str] = None,
             component_cfg: dict = None,
             n_process: int = 1) -> Iterator[Doc]:
        """Process multiple texts efficiently."""

    def update(self, examples: List, sgd=None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples=None, **kwargs) -> Optimizer:
        """Initialize training and return optimizer."""

    def evaluate(self, examples: List, **kwargs) -> dict:
        """Evaluate the model on examples."""

    # Pipeline management
    def add_pipe(self, component, name: str = None,
                 before: str = None, after: str = None,
                 first: bool = False, last: bool = False) -> callable:
        """Add a component to the processing pipeline."""

    def remove_pipe(self, name: str) -> tuple:
        """Remove a component from the pipeline."""

    def get_pipe(self, name: str) -> callable:
        """Get a pipeline component by name."""

    def has_pipe(self, name: str) -> bool:
        """Check if pipeline has a component."""

    def disable_pipes(self, *names) -> ContextManager:
        """Temporarily disable pipeline components."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the model to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Language':
        """Load the model from disk."""

    def to_bytes(self, exclude: List[str] = None) -> bytes:
        """Serialize the model to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: List[str] = None) -> 'Language':
        """Load the model from bytes."""
```

### Document Container

The Doc class represents a document with token-level and document-level annotations. It provides access to the parsed text structure and linguistic analysis.

```python { .api }
class Doc:
    """Container for accessing linguistic annotations on a document."""

    text: str
    text_with_ws: str
    ents: tuple
    noun_chunks: Iterator
    sents: Iterator
    vector: numpy.ndarray
    lang_: str
    is_parsed: bool
    is_tagged: bool
    is_sentenced: bool

    def __init__(self, vocab: Vocab, words: List[str] = None,
                 spaces: List[bool] = None) -> None:
        """Create a Doc object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, Span]:
        """Get a token or span."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens."""

    def similarity(self, other: Union['Doc', 'Span', 'Token']) -> float:
        """Compute semantic similarity."""

    def char_span(self, start: int, end: int,
                  label: str = None, kb_id: str = None) -> Span:
        """Create a Span from character positions."""

    def count_by(self, attr: int, exclude: Set = None) -> dict:
        """Count tokens by attribute."""

    def to_json(self, underscore: List[str] = None) -> dict:
        """Export to JSON format."""

    def retokenize(self) -> ContextManager:
        """Context manager for merging/splitting tokens."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the doc to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Doc':
        """Load the doc from disk."""

    def to_bytes(self, exclude: List[str] = None) -> bytes:
        """Serialize the doc to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: List[str] = None) -> 'Doc':
        """Load the doc from bytes."""
```

### Token Annotations

Individual tokens with comprehensive linguistic annotations including morphology, syntax, and semantic properties.

```python { .api }
class Token:
    """Individual token with linguistic annotations."""

    # Text properties
    text: str
    text_with_ws: str
    whitespace_: str
    orth: int
    orth_: str

    # Linguistic annotations
    lemma: int
    lemma_: str
    pos: int
    pos_: str
    tag: int
    tag_: str
    dep: int
    dep_: str

    # Morphological features
    morph: MorphAnalysis

    # Named entity information
    ent_type: int
    ent_type_: str
    ent_iob: int
    ent_iob_: str
    ent_kb_id: int
    ent_kb_id_: str
    ent_id: int
    ent_id_: str

    # Syntactic relationships
    head: 'Token'
    children: Iterator['Token']
    ancestors: Iterator['Token']
    subtree: Iterator['Token']
    lefts: Iterator['Token']
    rights: Iterator['Token']
    n_lefts: int
    n_rights: int

    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool

    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    def similarity(self, other: Union['Token', 'Span', 'Doc']) -> float:
        """Compute semantic similarity."""

    def nbor(self, i: int = 1) -> 'Token':
        """Get neighboring token."""

    def is_ancestor(self, descendant: 'Token') -> bool:
        """Check if token is ancestor of another."""
```

### Span Objects

Spans represent slices of documents, typically used for named entities, noun chunks, or custom text segments.

```python { .api }
class Span:
    """Slice of a document with optional label and attributes."""

    text: str
    text_with_ws: str
    label: int
    label_: str
    kb_id: int
    kb_id_: str
    ent_id: int
    ent_id_: str

    start: int
    end: int
    start_char: int
    end_char: int

    vector: numpy.ndarray

    doc: Doc
    sent: 'Span'
    root: Token
    ents: tuple

    def __init__(self, doc: Doc, start: int, end: int,
                 label: int = 0, kb_id: int = 0) -> None:
        """Create a Span object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, 'Span']:
        """Get token or subspan."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens in span."""

    def similarity(self, other: Union['Span', 'Doc', 'Token']) -> float:
        """Compute semantic similarity."""

    def as_doc(self) -> Doc:
        """Create a new Doc object from the span."""

    def char_span(self, start: int, end: int,
                  label: str = None, kb_id: str = None) -> 'Span':
        """Create a subspan from character positions."""

    def conjuncts(self) -> List['Span']:
        """Get conjunct spans."""
```

### Vocabulary Management

The vocabulary stores all strings, word vectors, and lexical entries used by the language model.

```python { .api }
class Vocab:
    """Vocabulary store for strings, vectors, and lexical entries."""

    strings: StringStore
    vectors: Vectors
    lookups: Lookups
    writing_system: dict

    def __init__(self, lex_attr_getters: dict = None,
                 strings: StringStore = None,
                 lookups: Lookups = None,
                 oov_prob: float = -20.0) -> None:
        """Create a vocabulary."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Lexeme:
        """Get a lexeme."""

    def __iter__(self) -> Iterator[Lexeme]:
        """Iterate over lexemes."""

    def __len__(self) -> int:
        """Number of lexemes."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in vocabulary."""

    def add_flag(self, flag_getter: callable, flag_id: int = None) -> int:
        """Add a boolean flag attribute."""

    def get_vector(self, orth: Union[int, str]) -> numpy.ndarray:
        """Get word vector."""

    def set_vector(self, orth: Union[int, str], vector: numpy.ndarray) -> None:
        """Set word vector."""

    def has_vector(self, orth: Union[int, str]) -> bool:
        """Check if word has vector."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save vocabulary to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Vocab':
        """Load vocabulary from disk."""
```

### Lexeme Objects

Lexemes store word-type information in the vocabulary, independent of context.

```python { .api }
class Lexeme:
    """Word type stored in vocabulary."""

    # Text properties
    orth: int
    orth_: str
    text: str
    lower: int
    lower_: str
    norm: int
    norm_: str
    shape: int
    shape_: str
    prefix: int
    prefix_: str
    suffix: int
    suffix_: str

    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool

    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    # Probability and sentiment
    prob: float
    sentiment: float

    def similarity(self, other: Union['Lexeme', 'Token']) -> float:
        """Compute semantic similarity."""
```

### Document Collections

Efficient storage and serialization for multiple documents.

```python { .api }
class DocBin:
    """Efficient storage for multiple Doc objects."""

    def __init__(self, attrs: List[str] = None, store_user_data: bool = False) -> None:
        """Create a DocBin for storing multiple documents."""

    def __len__(self) -> int:
        """Number of documents in the collection."""

    def add(self, doc: Doc) -> None:
        """Add a Doc object to the collection."""

    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
        """Retrieve Doc objects from the collection."""

    def merge(self, other: 'DocBin') -> None:
        """Merge another DocBin into this one."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save the DocBin to disk."""

    def from_disk(self, path: str) -> 'DocBin':
        """Load the DocBin from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'DocBin':
        """Deserialize from bytes."""
```

### Document Modification

Tools for modifying document tokenization after initial processing.

```python { .api }
class Retokenizer:
    """Context manager for modifying document tokenization."""

    def merge(self, span: Span, attrs: dict = None) -> None:
        """
        Merge a span into a single token.

        Args:
            span: The span to merge
            attrs: Optional token attributes for merged token
        """

    def split(self, token: Token, orths: List[str],
              heads: List[tuple] = None, attrs: dict = None) -> None:
        """
        Split a token into multiple tokens.

        Args:
            token: The token to split
            orths: List of orthographic forms for new tokens
            heads: List of (head_index, dep_label) tuples
            attrs: Optional token attributes
        """
```

### Morphological Analysis

Container for morphological feature analysis.

```python { .api }
class MorphAnalysis:
    """Morphological analysis container."""

    def __init__(self, vocab: Vocab, features: dict = None) -> None:
        """Create morphological analysis."""

    def __str__(self) -> str:
        """String representation of morphological features."""

    def get(self, field: str) -> List[str]:
        """Get values for a morphological field."""

    def to_dict(self) -> dict:
        """Convert to dictionary format."""

    @classmethod
    def from_id(cls, vocab: Vocab, key: int) -> 'MorphAnalysis':
        """Create from vocabulary ID."""
```

### Lookup Tables

Management system for linguistic lookup tables and data.

```python { .api }
class Lookups:
    """Lookup table management system."""

    def __init__(self) -> None:
        """Create empty lookup tables."""

    def add_table(self, name: str, data: dict = None) -> dict:
        """Add a lookup table."""

    def get_table(self, name: str, default: dict = None) -> dict:
        """Get a lookup table by name."""

    def has_table(self, name: str) -> bool:
        """Check if table exists."""

    def remove_table(self, name: str) -> dict:
        """Remove and return a table."""

    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save lookup tables to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Lookups':
        """Load lookup tables from disk."""
```

### Lemmatization

System for reducing words to their lemmatized forms.

```python { .api }
class Lemmatizer:
    """Lemmatization component."""

    def __init__(self, lookups: Lookups = None, rules: dict = None) -> None:
        """Initialize lemmatizer."""

    def lookup(self, string: str, pos: str = None, morphs: dict = None) -> List[str]:
        """Look up lemma in tables."""

    def rule_lookup(self, string: str, pos: str) -> List[str]:
        """Apply lemmatization rules."""

    def lookup_table(self, string: str, table: str) -> List[str]:
        """Look up in specific table."""

    def is_base_form(self, univ_pos: str, morphs: dict = None) -> bool:
        """Check if token is in base form."""
```

### String Store

Efficient bidirectional mapping between strings and integer IDs.

```python { .api }
class StringStore:
    """Bidirectional map between strings and integer IDs."""

    def __init__(self, strings: Iterable[str] = None) -> None:
        """Create a string store."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Union[str, int]:
        """Get string by ID or ID by string."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in store."""

    def __iter__(self) -> Iterator[str]:
        """Iterate over strings."""

    def __len__(self) -> int:
        """Number of strings."""

    def add(self, string: str) -> int:
        """Add string and return ID."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save string store to disk."""

    def from_disk(self, path: str) -> 'StringStore':
        """Load string store from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'StringStore':
        """Deserialize from bytes."""
```

## Usage Examples

### Processing Documents

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Process single document
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Access document properties
print(f"Text: {doc.text}")
print(f"Number of tokens: {len(doc)}")
print(f"Number of sentences: {len(list(doc.sents))}")

# Iterate over tokens
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.lemma_})")

# Process multiple documents efficiently
texts = ["First document", "Second document", "Third document"]
for doc in nlp.pipe(texts):
    print(f"Processed: {doc.text}")
```

### Working with Spans

```python
# Create custom spans
doc = nlp("Apple is looking at buying U.K. startup")
company_span = doc[0:1]  # "Apple"
target_span = doc[4:7]  # "U.K. startup"

# Named entity spans
for ent in doc.ents:
    print(f"Entity: {ent.text} ({ent.label_})")
    print(f"Start: {ent.start}, End: {ent.end}")

# Create span from character positions
char_span = doc.char_span(0, 5, label="ORG")  # "Apple"
if char_span:
    print(f"Character span: {char_span.text}")
```

### Vocabulary Operations

```python
# Access vocabulary
vocab = nlp.vocab

# Get lexeme
apple_lexeme = vocab["apple"]
print(f"Is alpha: {apple_lexeme.is_alpha}")
print(f"Is stop word: {apple_lexeme.is_stop}")

# String store operations
string_id = vocab.strings.add("custom_token")
retrieved_string = vocab.strings[string_id]
print(f"String ID: {string_id}, Retrieved: {retrieved_string}")
```