# Core Processing Objects

The fundamental objects for text processing in spaCy. These classes form the foundation of all NLP operations and provide access to linguistic annotations, document structure, and vocabulary management.

## Capabilities

### Language Pipeline

The main entry point for NLP processing. The Language class manages the processing pipeline and provides methods for processing single texts or batches efficiently.

```python { .api }
class Language:
    """Main NLP pipeline class that processes text through pipeline components."""

    vocab: Vocab
    pipeline: List[tuple]
    pipe_names: List[str]
    meta: dict

    def __call__(self, text: str) -> Doc:
        """Process a single text and return a Doc object."""

    def pipe(self, texts: Iterable[str],
             batch_size: int = 1000,
             disable: List[str] = None,
             component_cfg: dict = None,
             n_process: int = 1) -> Iterator[Doc]:
        """Process multiple texts efficiently."""

    def update(self, examples: List, sgd=None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples=None, **kwargs) -> Optimizer:
        """Initialize training and return optimizer."""

    def evaluate(self, examples: List, **kwargs) -> dict:
        """Evaluate the model on examples."""

    # Pipeline management
    def add_pipe(self, component, name: str = None,
                 before: str = None, after: str = None,
                 first: bool = False, last: bool = False) -> callable:
        """Add a component to the processing pipeline."""

    def remove_pipe(self, name: str) -> tuple:
        """Remove a component from the pipeline."""

    def get_pipe(self, name: str) -> callable:
        """Get a pipeline component by name."""

    def has_pipe(self, name: str) -> bool:
        """Check if pipeline has a component."""

    def disable_pipes(self, *names) -> ContextManager:
        """Temporarily disable pipeline components."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the model to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Language':
        """Load the model from disk."""

    def to_bytes(self, exclude: List[str] = None) -> bytes:
        """Serialize the model to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: List[str] = None) -> 'Language':
        """Load the model from bytes."""
```

### Document Container

The Doc class represents a document with token-level and document-level annotations. It provides access to the parsed text structure and linguistic analysis.

```python { .api }
class Doc:
    """Container for accessing linguistic annotations on a document."""

    text: str
    text_with_ws: str
    ents: tuple
    noun_chunks: Iterator
    sents: Iterator
    vector: numpy.ndarray
    lang_: str
    is_parsed: bool
    is_tagged: bool
    is_sentenced: bool

    def __init__(self, vocab: Vocab, words: List[str] = None,
                 spaces: List[bool] = None) -> None:
        """Create a Doc object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, Span]:
        """Get a token or span."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens."""

    def similarity(self, other: Union['Doc', 'Span', 'Token']) -> float:
        """Compute semantic similarity."""

    def char_span(self, start: int, end: int,
                  label: str = None, kb_id: str = None) -> Span:
        """Create a Span from character positions."""

    def count_by(self, attr: int, exclude: Set = None) -> dict:
        """Count tokens by attribute."""

    def to_json(self, underscore: List[str] = None) -> dict:
        """Export to JSON format."""

    def retokenize(self) -> ContextManager:
        """Context manager for merging/splitting tokens."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the doc to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Doc':
        """Load the doc from disk."""

    def to_bytes(self, exclude: List[str] = None) -> bytes:
        """Serialize the doc to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: List[str] = None) -> 'Doc':
        """Load the doc from bytes."""
```

### Token Annotations

Individual tokens with comprehensive linguistic annotations including morphology, syntax, and semantic properties.

```python { .api }
class Token:
    """Individual token with linguistic annotations."""

    # Text properties
    text: str
    text_with_ws: str
    whitespace_: str
    orth: int
    orth_: str

    # Linguistic annotations
    lemma: int
    lemma_: str
    pos: int
    pos_: str
    tag: int
    tag_: str
    dep: int
    dep_: str

    # Morphological features
    morph: MorphAnalysis

    # Named entity information
    ent_type: int
    ent_type_: str
    ent_iob: int
    ent_iob_: str
    ent_kb_id: int
    ent_kb_id_: str
    ent_id: int
    ent_id_: str

    # Syntactic relationships
    head: 'Token'
    children: Iterator['Token']
    ancestors: Iterator['Token']
    subtree: Iterator['Token']
    lefts: Iterator['Token']
    rights: Iterator['Token']
    n_lefts: int
    n_rights: int

    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool

    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    def similarity(self, other: Union['Token', 'Span', 'Doc']) -> float:
        """Compute semantic similarity."""

    def nbor(self, i: int = 1) -> 'Token':
        """Get neighboring token."""

    def is_ancestor(self, descendant: 'Token') -> bool:
        """Check if token is ancestor of another."""
```

### Span Objects

Spans represent slices of documents, typically used for named entities, noun chunks, or custom text segments.

```python { .api }
class Span:
    """Slice of a document with optional label and attributes."""

    text: str
    text_with_ws: str
    label: int
    label_: str
    kb_id: int
    kb_id_: str
    ent_id: int
    ent_id_: str

    start: int
    end: int
    start_char: int
    end_char: int

    vector: numpy.ndarray

    doc: Doc
    sent: 'Span'
    root: Token
    ents: tuple

    def __init__(self, doc: Doc, start: int, end: int,
                 label: int = 0, kb_id: int = 0) -> None:
        """Create a Span object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, 'Span']:
        """Get token or subspan."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens in span."""

    def similarity(self, other: Union['Span', 'Doc', 'Token']) -> float:
        """Compute semantic similarity."""

    def as_doc(self) -> Doc:
        """Create a new Doc object from the span."""

    def char_span(self, start: int, end: int,
                  label: str = None, kb_id: str = None) -> 'Span':
        """Create a subspan from character positions."""

    def conjuncts(self) -> List['Span']:
        """Get conjunct spans."""
```

### Vocabulary Management

The vocabulary stores all strings, word vectors, and lexical entries used by the language model.

```python { .api }
class Vocab:
    """Vocabulary store for strings, vectors, and lexical entries."""

    strings: StringStore
    vectors: Vectors
    lookups: Lookups
    writing_system: dict

    def __init__(self, lex_attr_getters: dict = None,
                 strings: StringStore = None,
                 lookups: Lookups = None,
                 oov_prob: float = -20.0) -> None:
        """Create a vocabulary."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Lexeme:
        """Get a lexeme."""

    def __iter__(self) -> Iterator[Lexeme]:
        """Iterate over lexemes."""

    def __len__(self) -> int:
        """Number of lexemes."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in vocabulary."""

    def add_flag(self, flag_getter: callable, flag_id: int = None) -> int:
        """Add a boolean flag attribute."""

    def get_vector(self, orth: Union[int, str]) -> numpy.ndarray:
        """Get word vector."""

    def set_vector(self, orth: Union[int, str], vector: numpy.ndarray) -> None:
        """Set word vector."""

    def has_vector(self, orth: Union[int, str]) -> bool:
        """Check if word has vector."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save vocabulary to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Vocab':
        """Load vocabulary from disk."""
```

### Lexeme Objects

Lexemes store word-type information in the vocabulary, independent of context.

```python { .api }
class Lexeme:
    """Word type stored in vocabulary."""

    # Text properties
    orth: int
    orth_: str
    text: str
    lower: int
    lower_: str
    norm: int
    norm_: str
    shape: int
    shape_: str
    prefix: int
    prefix_: str
    suffix: int
    suffix_: str

    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool

    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    # Probability and sentiment
    prob: float
    sentiment: float

    def similarity(self, other: Union['Lexeme', 'Token']) -> float:
        """Compute semantic similarity."""
```

### Document Collections

Efficient storage and serialization for multiple documents.

```python { .api }
class DocBin:
    """Efficient storage for multiple Doc objects."""

    def __init__(self, attrs: List[str] = None, store_user_data: bool = False) -> None:
        """Create a DocBin for storing multiple documents."""

    def __len__(self) -> int:
        """Number of documents in the collection."""

    def add(self, doc: Doc) -> None:
        """Add a Doc object to the collection."""

    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
        """Retrieve Doc objects from the collection."""

    def merge(self, other: 'DocBin') -> None:
        """Merge another DocBin into this one."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save the DocBin to disk."""

    def from_disk(self, path: str) -> 'DocBin':
        """Load the DocBin from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'DocBin':
        """Deserialize from bytes."""
```

### Document Modification

Tools for modifying document tokenization after initial processing.

```python { .api }
class Retokenizer:
    """Context manager for modifying document tokenization."""

    def merge(self, span: Span, attrs: dict = None) -> None:
        """
        Merge a span into a single token.

        Args:
            span: The span to merge
            attrs: Optional token attributes for merged token
        """

    def split(self, token: Token, orths: List[str],
              heads: List[tuple] = None, attrs: dict = None) -> None:
        """
        Split a token into multiple tokens.

        Args:
            token: The token to split
            orths: List of orthographic forms for new tokens
            heads: List of (head_index, dep_label) tuples
            attrs: Optional token attributes
        """
```

### Morphological Analysis

Container for morphological feature analysis.

```python { .api }
class MorphAnalysis:
    """Morphological analysis container."""

    def __init__(self, vocab: Vocab, features: dict = None) -> None:
        """Create morphological analysis."""

    def __str__(self) -> str:
        """String representation of morphological features."""

    def get(self, field: str) -> List[str]:
        """Get values for a morphological field."""

    def to_dict(self) -> dict:
        """Convert to dictionary format."""

    @classmethod
    def from_id(cls, vocab: Vocab, key: int) -> 'MorphAnalysis':
        """Create from vocabulary ID."""
```

### Lookup Tables

Management system for linguistic lookup tables and data.

```python { .api }
class Lookups:
    """Lookup table management system."""

    def __init__(self) -> None:
        """Create empty lookup tables."""

    def add_table(self, name: str, data: dict = None) -> dict:
        """Add a lookup table."""

    def get_table(self, name: str, default: dict = None) -> dict:
        """Get a lookup table by name."""

    def has_table(self, name: str) -> bool:
        """Check if table exists."""

    def remove_table(self, name: str) -> dict:
        """Remove and return a table."""

    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save lookup tables to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Lookups':
        """Load lookup tables from disk."""
```

### Lemmatization

System for reducing words to their lemmatized forms.

```python { .api }
class Lemmatizer:
    """Lemmatization component."""

    def __init__(self, lookups: Lookups = None, rules: dict = None) -> None:
        """Initialize lemmatizer."""

    def lookup(self, string: str, pos: str = None, morphs: dict = None) -> List[str]:
        """Look up lemma in tables."""

    def rule_lookup(self, string: str, pos: str) -> List[str]:
        """Apply lemmatization rules."""

    def lookup_table(self, string: str, table: str) -> List[str]:
        """Look up in specific table."""

    def is_base_form(self, univ_pos: str, morphs: dict = None) -> bool:
        """Check if token is in base form."""
```

### String Store

Efficient bidirectional mapping between strings and integer IDs.

```python { .api }
class StringStore:
    """Bidirectional map between strings and integer IDs."""

    def __init__(self, strings: Iterable[str] = None) -> None:
        """Create a string store."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Union[str, int]:
        """Get string by ID or ID by string."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in store."""

    def __iter__(self) -> Iterator[str]:
        """Iterate over strings."""

    def __len__(self) -> int:
        """Number of strings."""

    def add(self, string: str) -> int:
        """Add string and return ID."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save string store to disk."""

    def from_disk(self, path: str) -> 'StringStore':
        """Load string store from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'StringStore':
        """Deserialize from bytes."""
```

## Usage Examples

### Processing Documents

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Process single document
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Access document properties
print(f"Text: {doc.text}")
print(f"Number of tokens: {len(doc)}")
print(f"Number of sentences: {len(list(doc.sents))}")

# Iterate over tokens
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.lemma_})")

# Process multiple documents efficiently
texts = ["First document", "Second document", "Third document"]
for doc in nlp.pipe(texts):
    print(f"Processed: {doc.text}")
```

### Working with Spans

```python
# Create custom spans
doc = nlp("Apple is looking at buying U.K. startup")
company_span = doc[0:1]  # "Apple"
target_span = doc[4:7]  # "U.K. startup"

# Named entity spans
for ent in doc.ents:
    print(f"Entity: {ent.text} ({ent.label_})")
    print(f"Start: {ent.start}, End: {ent.end}")

# Create span from character positions
char_span = doc.char_span(0, 5, label="ORG")  # "Apple"
if char_span:
    print(f"Character span: {char_span.text}")
```

### Vocabulary Operations

```python
# Access vocabulary
vocab = nlp.vocab

# Get lexeme
apple_lexeme = vocab["apple"]
print(f"Is alpha: {apple_lexeme.is_alpha}")
print(f"Is stop word: {apple_lexeme.is_stop}")

# String store operations
string_id = vocab.strings.add("custom_token")
retrieved_string = vocab.strings[string_id]
print(f"String ID: {string_id}, Retrieved: {retrieved_string}")
```