or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-objects.md · index.md · language-models.md · pattern-matching.md · pipeline-components.md · training.md · visualization.md

docs/core-objects.md

# Core Processing Objects

The fundamental objects for text processing in spaCy. These classes form the foundation of all NLP operations and provide access to linguistic annotations, document structure, and vocabulary management.

## Capabilities

### Language Pipeline

The main entry point for NLP processing. The Language class manages the processing pipeline and provides methods for processing single texts or batches efficiently.

```python { .api }
class Language:
    """Main NLP pipeline class that processes text through pipeline components."""

    vocab: Vocab
    pipeline: List[tuple]
    pipe_names: List[str]
    meta: dict

    def __call__(self, text: str) -> Doc:
        """Process a single text and return a Doc object."""

    def pipe(self, texts: Iterable[str],
             batch_size: int = 1000,
             disable: List[str] = None,
             component_cfg: dict = None,
             n_process: int = 1) -> Iterator[Doc]:
        """Process multiple texts efficiently."""

    def update(self, examples: List, sgd=None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples=None, **kwargs) -> Optimizer:
        """Initialize training and return optimizer."""

    def evaluate(self, examples: List, **kwargs) -> dict:
        """Evaluate the model on examples."""

    # Pipeline management
    def add_pipe(self, component, name: str = None,
                 before: str = None, after: str = None,
                 first: bool = False, last: bool = False) -> callable:
        """Add a component to the processing pipeline."""

    def remove_pipe(self, name: str) -> tuple:
        """Remove a component from the pipeline."""

    def get_pipe(self, name: str) -> callable:
        """Get a pipeline component by name."""

    def has_pipe(self, name: str) -> bool:
        """Check if pipeline has a component."""

    def disable_pipes(self, *names) -> ContextManager:
        """Temporarily disable pipeline components."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the model to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Language':
        """Load the model from disk."""

    def to_bytes(self, exclude: List[str] = None) -> bytes:
        """Serialize the model to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: List[str] = None) -> 'Language':
        """Load the model from bytes."""
```

### Document Container

The Doc class represents a document with token-level and document-level annotations. It provides access to the parsed text structure and linguistic analysis.

```python { .api }
class Doc:
    """Container for accessing linguistic annotations on a document."""

    text: str
    text_with_ws: str
    ents: tuple
    noun_chunks: Iterator
    sents: Iterator
    vector: numpy.ndarray
    lang_: str
    is_parsed: bool
    is_tagged: bool
    is_sentenced: bool

    def __init__(self, vocab: Vocab, words: List[str] = None,
                 spaces: List[bool] = None) -> None:
        """Create a Doc object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, Span]:
        """Get a token or span."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens."""

    def similarity(self, other: Union['Doc', 'Span', 'Token']) -> float:
        """Compute semantic similarity."""

    def char_span(self, start: int, end: int,
                  label: str = None, kb_id: str = None) -> Span:
        """Create a Span from character positions."""

    def count_by(self, attr: int, exclude: Set = None) -> dict:
        """Count tokens by attribute."""

    def to_json(self, underscore: List[str] = None) -> dict:
        """Export to JSON format."""

    def retokenize(self) -> ContextManager:
        """Context manager for merging/splitting tokens."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the doc to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Doc':
        """Load the doc from disk."""

    def to_bytes(self, exclude: List[str] = None) -> bytes:
        """Serialize the doc to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: List[str] = None) -> 'Doc':
        """Load the doc from bytes."""
```

### Token Annotations

Individual tokens with comprehensive linguistic annotations including morphology, syntax, and semantic properties.

```python { .api }
class Token:
    """Individual token with linguistic annotations."""

    # Text properties
    text: str
    text_with_ws: str
    whitespace_: str
    orth: int
    orth_: str

    # Linguistic annotations
    lemma: int
    lemma_: str
    pos: int
    pos_: str
    tag: int
    tag_: str
    dep: int
    dep_: str

    # Morphological features
    morph: MorphAnalysis

    # Named entity information
    ent_type: int
    ent_type_: str
    ent_iob: int
    ent_iob_: str
    ent_kb_id: int
    ent_kb_id_: str
    ent_id: int
    ent_id_: str

    # Syntactic relationships
    head: 'Token'
    children: Iterator['Token']
    ancestors: Iterator['Token']
    subtree: Iterator['Token']
    lefts: Iterator['Token']
    rights: Iterator['Token']
    n_lefts: int
    n_rights: int

    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool

    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    def similarity(self, other: Union['Token', 'Span', 'Doc']) -> float:
        """Compute semantic similarity."""

    def nbor(self, i: int = 1) -> 'Token':
        """Get neighboring token."""

    def is_ancestor(self, descendant: 'Token') -> bool:
        """Check if token is ancestor of another."""
```

### Span Objects

Spans represent slices of documents, typically used for named entities, noun chunks, or custom text segments.

```python { .api }
class Span:
    """Slice of a document with optional label and attributes."""

    text: str
    text_with_ws: str
    label: int
    label_: str
    kb_id: int
    kb_id_: str
    ent_id: int
    ent_id_: str

    start: int
    end: int
    start_char: int
    end_char: int

    vector: numpy.ndarray

    doc: Doc
    sent: 'Span'
    root: Token
    ents: tuple

    def __init__(self, doc: Doc, start: int, end: int,
                 label: int = 0, kb_id: int = 0) -> None:
        """Create a Span object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, 'Span']:
        """Get token or subspan."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens in span."""

    def similarity(self, other: Union['Span', 'Doc', 'Token']) -> float:
        """Compute semantic similarity."""

    def as_doc(self) -> Doc:
        """Create a new Doc object from the span."""

    def char_span(self, start: int, end: int,
                  label: str = None, kb_id: str = None) -> 'Span':
        """Create a subspan from character positions."""

    def conjuncts(self) -> List['Span']:
        """Get conjunct spans."""
```

### Vocabulary Management

The vocabulary stores all strings, word vectors, and lexical entries used by the language model.

```python { .api }
class Vocab:
    """Vocabulary store for strings, vectors, and lexical entries."""

    strings: StringStore
    vectors: Vectors
    lookups: Lookups
    writing_system: dict

    def __init__(self, lex_attr_getters: dict = None,
                 strings: StringStore = None,
                 lookups: Lookups = None,
                 oov_prob: float = -20.0) -> None:
        """Create a vocabulary."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Lexeme:
        """Get a lexeme."""

    def __iter__(self) -> Iterator[Lexeme]:
        """Iterate over lexemes."""

    def __len__(self) -> int:
        """Number of lexemes."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in vocabulary."""

    def add_flag(self, flag_getter: callable, flag_id: int = None) -> int:
        """Add a boolean flag attribute."""

    def get_vector(self, orth: Union[int, str]) -> numpy.ndarray:
        """Get word vector."""

    def set_vector(self, orth: Union[int, str], vector: numpy.ndarray) -> None:
        """Set word vector."""

    def has_vector(self, orth: Union[int, str]) -> bool:
        """Check if word has vector."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save vocabulary to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Vocab':
        """Load vocabulary from disk."""
```

### Lexeme Objects

Lexemes store word-type information in the vocabulary, independent of context.

```python { .api }
class Lexeme:
    """Word type stored in vocabulary."""

    # Text properties
    orth: int
    orth_: str
    text: str
    lower: int
    lower_: str
    norm: int
    norm_: str
    shape: int
    shape_: str
    prefix: int
    prefix_: str
    suffix: int
    suffix_: str

    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool

    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    # Probability and sentiment
    prob: float
    sentiment: float

    def similarity(self, other: Union['Lexeme', 'Token']) -> float:
        """Compute semantic similarity."""
```

### Document Collections

Efficient storage and serialization for multiple documents.

```python { .api }
class DocBin:
    """Efficient storage for multiple Doc objects."""

    def __init__(self, attrs: List[str] = None, store_user_data: bool = False) -> None:
        """Create a DocBin for storing multiple documents."""

    def __len__(self) -> int:
        """Number of documents in the collection."""

    def add(self, doc: Doc) -> None:
        """Add a Doc object to the collection."""

    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
        """Retrieve Doc objects from the collection."""

    def merge(self, other: 'DocBin') -> None:
        """Merge another DocBin into this one."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save the DocBin to disk."""

    def from_disk(self, path: str) -> 'DocBin':
        """Load the DocBin from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'DocBin':
        """Deserialize from bytes."""
```

### Document Modification

Tools for modifying document tokenization after initial processing.

```python { .api }
class Retokenizer:
    """Context manager for modifying document tokenization."""

    def merge(self, span: Span, attrs: dict = None) -> None:
        """
        Merge a span into a single token.

        Args:
            span: The span to merge
            attrs: Optional token attributes for merged token
        """

    def split(self, token: Token, orths: List[str],
              heads: List[tuple] = None, attrs: dict = None) -> None:
        """
        Split a token into multiple tokens.

        Args:
            token: The token to split
            orths: List of orthographic forms for new tokens
            heads: List of (head_index, dep_label) tuples
            attrs: Optional token attributes
        """
```

### Morphological Analysis

Container for morphological feature analysis.

```python { .api }
class MorphAnalysis:
    """Morphological analysis container."""

    def __init__(self, vocab: Vocab, features: dict = None) -> None:
        """Create morphological analysis."""

    def __str__(self) -> str:
        """String representation of morphological features."""

    def get(self, field: str) -> List[str]:
        """Get values for a morphological field."""

    def to_dict(self) -> dict:
        """Convert to dictionary format."""

    @classmethod
    def from_id(cls, vocab: Vocab, key: int) -> 'MorphAnalysis':
        """Create from vocabulary ID."""
```

### Lookup Tables

Management system for linguistic lookup tables and data.

```python { .api }
class Lookups:
    """Lookup table management system."""

    def __init__(self) -> None:
        """Create empty lookup tables."""

    def add_table(self, name: str, data: dict = None) -> dict:
        """Add a lookup table."""

    def get_table(self, name: str, default: dict = None) -> dict:
        """Get a lookup table by name."""

    def has_table(self, name: str) -> bool:
        """Check if table exists."""

    def remove_table(self, name: str) -> dict:
        """Remove and return a table."""

    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save lookup tables to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Lookups':
        """Load lookup tables from disk."""
```

### Lemmatization

System for reducing words to their lemmatized forms.

```python { .api }
class Lemmatizer:
    """Lemmatization component."""

    def __init__(self, lookups: Lookups = None, rules: dict = None) -> None:
        """Initialize lemmatizer."""

    def lookup(self, string: str, pos: str = None, morphs: dict = None) -> List[str]:
        """Look up lemma in tables."""

    def rule_lookup(self, string: str, pos: str) -> List[str]:
        """Apply lemmatization rules."""

    def lookup_table(self, string: str, table: str) -> List[str]:
        """Look up in specific table."""

    def is_base_form(self, univ_pos: str, morphs: dict = None) -> bool:
        """Check if token is in base form."""
```

### String Store

Efficient bidirectional mapping between strings and integer IDs.

```python { .api }
class StringStore:
    """Bidirectional map between strings and integer IDs."""

    def __init__(self, strings: Iterable[str] = None) -> None:
        """Create a string store."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Union[str, int]:
        """Get string by ID or ID by string."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in store."""

    def __iter__(self) -> Iterator[str]:
        """Iterate over strings."""

    def __len__(self) -> int:
        """Number of strings."""

    def add(self, string: str) -> int:
        """Add string and return ID."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save string store to disk."""

    def from_disk(self, path: str) -> 'StringStore':
        """Load string store from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'StringStore':
        """Deserialize from bytes."""
```

## Usage Examples

### Processing Documents

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Process single document
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Access document properties
print(f"Text: {doc.text}")
print(f"Number of tokens: {len(doc)}")
print(f"Number of sentences: {len(list(doc.sents))}")

# Iterate over tokens
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.lemma_})")

# Process multiple documents efficiently
texts = ["First document", "Second document", "Third document"]
for doc in nlp.pipe(texts):
    print(f"Processed: {doc.text}")
```

### Working with Spans

```python
# Create custom spans
doc = nlp("Apple is looking at buying U.K. startup")
company_span = doc[0:1]  # "Apple"
target_span = doc[4:7]   # "U.K. startup"

# Named entity spans
for ent in doc.ents:
    print(f"Entity: {ent.text} ({ent.label_})")
    print(f"Start: {ent.start}, End: {ent.end}")

# Create span from character positions
char_span = doc.char_span(0, 5, label="ORG")  # "Apple"
if char_span:
    print(f"Character span: {char_span.text}")
```

### Vocabulary Operations

```python
# Access vocabulary
vocab = nlp.vocab

# Get lexeme
apple_lexeme = vocab["apple"]
print(f"Is alpha: {apple_lexeme.is_alpha}")
print(f"Is stop word: {apple_lexeme.is_stop}")

# String store operations
string_id = vocab.strings.add("custom_token")
retrieved_string = vocab.strings[string_id]
print(f"String ID: {string_id}, Retrieved: {retrieved_string}")
```