or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-objects.md, index.md, language-models.md, pattern-matching.md, pipeline-components.md, training.md, visualization.md

docs/pipeline-components.md

# Pipeline Components

Built-in pipeline components that perform linguistic analysis on documents. These components can be combined in customizable processing pipelines to add part-of-speech tags, dependency parsing, named entity recognition, text classification, and more.

## Capabilities

### Part-of-Speech Tagging

Statistical models that assign part-of-speech tags and morphological features to tokens based on context and linguistic patterns.

```python { .api }
class Tagger:
    """Part-of-speech tagger pipeline component."""

    name: str = "tagger"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the tagger."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the tagger to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict part-of-speech tags for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set part-of-speech annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""

    def add_label(self, label: str) -> int:
        """Add a label to the component."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the component to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Tagger':
        """Load the component from disk."""
```

### Dependency Parsing

Statistical parser that predicts syntactic dependencies between tokens, creating a dependency tree structure.

```python { .api }
class DependencyParser:
    """Dependency parser pipeline component."""

    name: str = "parser"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the parser."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the parser to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict dependency relations for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set dependency annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a dependency label."""

    # Serialization methods similar to Tagger
```

### Named Entity Recognition

Statistical model that identifies and classifies named entities (people, organizations, locations, etc.) in text.

```python { .api }
class EntityRecognizer:
    """Named entity recognition pipeline component."""

    name: str = "ner"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the NER component."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply NER to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict named entities for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set named entity annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity label."""

    # Serialization methods similar to Tagger
```

### Text Classification

Multi-label text classifier that assigns category scores to documents based on their content.

```python { .api }
class TextCategorizer:
    """Text classification pipeline component."""

    name: str = "textcat"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the text categorizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply text categorization to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict category scores for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set category annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a category label."""

    @property
    def labels(self) -> tuple:
        """Get category labels."""
```

### Entity Linking

Component that links named entities to entries in a knowledge base using entity embeddings and candidate ranking.

```python { .api }
class EntityLinker:
    """Entity linking pipeline component."""

    name: str = "entity_linker"

    def __init__(self, vocab: Vocab, **cfg) -> None:
        """Initialize the entity linker."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity linking to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict entity links for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set entity linking annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity type label."""

    def get_candidates(self, mention: Span) -> List:
        """Get knowledge base candidates for a mention."""
```

### Morphological Analysis

Component that analyzes word morphology and assigns detailed morphological features to tokens.

```python { .api }
class Morphologizer:
    """Morphological analysis pipeline component."""

    name: str = "morphologizer"

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the morphologizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply morphological analysis to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict morphological features for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set morphological annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""
```

### Rule-Based Components

#### Entity Ruler

Rule-based component for pattern-based named entity recognition using token patterns or phrase matching.

```python { .api }
class EntityRuler:
    """Rule-based named entity recognition component."""

    name: str = "entity_ruler"

    def __init__(self, nlp: Language, patterns: List[dict] = None,
                 overwrite_ents: bool = False, **cfg) -> None:
        """Initialize the entity ruler."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity rules to a Doc object."""

    def add_patterns(self, patterns: List[dict]) -> None:
        """Add patterns to the entity ruler."""

    @property
    def patterns(self) -> List[dict]:
        """Get all patterns."""

    @property
    def labels(self) -> set:
        """Get entity labels."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save patterns to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'EntityRuler':
        """Load patterns from disk."""
```

#### Sentence Boundary Detection

Fast, rule-based sentence boundary detection for most languages.

```python { .api }
class Sentencizer:
    """Rule-based sentence boundary detection component."""

    name: str = "sentencizer"

    def __init__(self, punct_chars: Set[str] = None, **cfg) -> None:
        """Initialize the sentencizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply sentence boundary detection to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""
```

### Pipeline Management Functions

Functions for merging tokens based on linguistic analysis.

```python { .api }
def merge_entities(doc: Doc) -> Doc:
    """
    Merge named entity tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """

def merge_noun_chunks(doc: Doc) -> Doc:
    """
    Merge noun chunk tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """

def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
    """
    Merge subtokens into single tokens.

    Args:
        doc: The Doc object to modify
        label: Label for merged tokens

    Returns:
        The modified Doc object
    """
```

### Base Pipeline Component

Abstract base class for creating custom pipeline components.

```python { .api }
class Pipe:
    """Base class for pipeline components."""

    name: str

    def __call__(self, doc: Doc) -> Doc:
        """Apply the component to a Doc object."""
        raise NotImplementedError

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""
        for docs in util.minibatch(stream, size=batch_size):
            for doc in docs:
                yield self(doc)

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the component with training examples."""
        pass

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""
        pass
```

## Usage Examples

### Using Built-in Components

```python
import spacy

# Load model with multiple components
nlp = spacy.load("en_core_web_sm")
print("Pipeline components:", nlp.pipe_names)
# Output: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

# Process text through all components
doc = nlp("Apple Inc. is looking at buying U.K. startup for $1 billion")

# Access tagger results
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# Access parser results
for token in doc:
    print(f"{token.text} -> {token.head.text} ({token.dep_})")

# Access NER results
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")
```

### Pipeline Management

```python
import spacy
from spacy.pipeline import EntityRuler

# Create blank language model
nlp = spacy.blank("en")

# Add components to pipeline
nlp.add_pipe("tagger")
nlp.add_pipe("parser")
nlp.add_pipe("ner")

# Add custom rule-based component
ruler = EntityRuler(nlp, patterns=[
    {"label": "COMPANY", "pattern": "Apple Inc."},
    {"label": "COMPANY", "pattern": "Microsoft Corp."}
])
nlp.add_pipe(ruler, before="ner")

# Process text
doc = nlp("Apple Inc. and Microsoft Corp. are tech companies")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")
```

### Disabling Components

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Disable specific components for faster processing
with nlp.disable_pipes("parser", "ner"):
    doc = nlp("This will only run tokenizer and tagger")

# Process multiple documents with disabled components
texts = ["Text one", "Text two", "Text three"]
with nlp.disable_pipes("parser"):
    docs = list(nlp.pipe(texts))
```

### Custom Pipeline Components

```python
from spacy.pipeline import Pipe
from spacy.tokens import Doc

class CustomComponent(Pipe):
    """Custom pipeline component example."""

    name = "custom_component"

    def __call__(self, doc):
        # Add custom processing logic
        for token in doc:
            if token.like_email:
                token._.is_email = True
        return doc

# Register and add to pipeline
@spacy.component("custom_component")
def create_custom_component(nlp, name):
    return CustomComponent()

nlp = spacy.blank("en")
nlp.add_pipe("custom_component")
```

### Text Classification

```python
import spacy

# Load model with text classifier
nlp = spacy.load("en_core_web_sm")

# Add text categorizer
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# After training...
doc = nlp("This movie is great!")
print("Categories:", doc.cats)
# Output: {'POSITIVE': 0.9, 'NEGATIVE': 0.1}
```