or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

index.md, models-tokenizers.md, pipelines.md, processors.md, utilities.md

docs/models-tokenizers.md

0

# Models and Tokenizers

1

2

This module provides Auto classes for automatic model and tokenizer selection, as well as direct access to specific model implementations for fine-grained control over model loading and inference.

3

4

## Capabilities

5

6

### Auto Classes - Automatic Selection

7

8

Auto classes select the appropriate model or tokenizer implementation from the model configuration, which makes them the most convenient entry point for typical use cases.

9

10

#### AutoModel

11

12

Loads the appropriate model architecture automatically based on the model configuration.

13

14

```javascript { .api }

15

/**

16

* Instantiate a pretrained model automatically based on the model type

17

* @param pretrained_model_name_or_path - Model identifier or path

18

* @param options - Configuration options for model loading

19

* @returns Promise resolving to the appropriate model instance

20

*/

21

class AutoModel {

22

static async from_pretrained(

23

pretrained_model_name_or_path: string,

24

options?: ModelOptions

25

): Promise<PreTrainedModel>;

26

}

27

28

interface ModelOptions {

29

/** Use quantized version of the model (default: true) */

30

quantized?: boolean;

31

/** Callback to track model download progress */

32

progress_callback?: (progress: any) => void;

33

/** Custom model configuration */

34

config?: any;

35

/** Directory to cache downloaded models */

36

cache_dir?: string;

37

/** Only use local files, don't download from remote */

38

local_files_only?: boolean;

39

/** Model revision/branch to use (default: 'main') */

40

revision?: string;

41

/** Specific model file name to use */

42

model_file_name?: string;

43

}

44

```

45

46

**Usage Example:**

47

48

```javascript

49

import { AutoModel, AutoTokenizer } from "@xenova/transformers";

50

51

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");

52

const model = await AutoModel.from_pretrained("Xenova/bert-base-uncased");

53

54

const inputs = await tokenizer("I love transformers!");

55

const outputs = await model(inputs);

56

```

57

58

#### AutoConfig

59

60

Automatically loads model configuration from pretrained models.

61

62

```javascript { .api }

63

/**

64

* Load model configuration automatically

65

* @param pretrained_model_name_or_path - Model identifier or path

66

* @param options - Configuration options for loading

67

* @returns Promise resolving to model configuration

68

*/

69

class AutoConfig {

70

static async from_pretrained(

71

pretrained_model_name_or_path: string,

72

options?: ConfigOptions

73

): Promise<PretrainedConfig>;

74

}

75

76

interface ConfigOptions {

77

/** Directory to cache downloaded files */

78

cache_dir?: string;

79

/** Only use local files, don't download from remote */

80

local_files_only?: boolean;

81

/** Model revision/branch to use (default: 'main') */

82

revision?: string;

83

}

84

85

interface PretrainedConfig {

86

model_type: string;

87

architectures?: string[];

88

vocab_size?: number;

89

hidden_size?: number;

90

num_attention_heads?: number;

91

num_hidden_layers?: number;

92

max_position_embeddings?: number;

93

[key: string]: any;

94

}

95

```

96

97

**Usage Example:**

98

99

```javascript

100

import { AutoConfig } from "@xenova/transformers";

101

102

const config = await AutoConfig.from_pretrained("Xenova/bert-base-uncased");

103

console.log(config.model_type); // "bert"

104

console.log(config.vocab_size); // 30522

105

```

106

107

#### AutoTokenizer

108

109

Automatically selects and loads the appropriate tokenizer based on the tokenizer configuration.

110

111

```javascript { .api }

112

/**

113

* Instantiate a tokenizer automatically based on the tokenizer type

114

* @param pretrained_model_name_or_path - Model identifier or path

115

* @param options - Configuration options for tokenizer loading

116

* @returns Promise resolving to the appropriate tokenizer instance

117

*/

118

class AutoTokenizer {

119

static async from_pretrained(

120

pretrained_model_name_or_path: string,

121

options?: TokenizerOptions

122

): Promise<PreTrainedTokenizer>;

123

}

124

125

interface TokenizerOptions {

126

/** Use quantized version (default: true) */

127

quantized?: boolean;

128

/** Callback to track download progress */

129

progress_callback?: (progress: any) => void;

130

/** Custom configuration */

131

config?: any;

132

/** Directory to cache downloaded files */

133

cache_dir?: string;

134

/** Only use local files, don't download from remote */

135

local_files_only?: boolean;

136

/** Model revision/branch to use (default: 'main') */

137

revision?: string;

138

/** Whether to use legacy tokenizer behavior */

139

legacy?: boolean;

140

}

141

```

142

143

**Usage Example:**

144

145

```javascript

146

import { AutoTokenizer } from "@xenova/transformers";

147

148

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/t5-small");

149

const inputs = await tokenizer("translate English to German: Hello world");

150

const decoded = tokenizer.decode(inputs.input_ids[0]);

151

```

152

153

### Task-Specific Auto Model Classes

154

155

These classes automatically load models optimized for specific tasks:

156

157

#### Text Processing Models

158

159

```javascript { .api }

160

class AutoModelForSequenceClassification {

161

static async from_pretrained(

162

pretrained_model_name_or_path: string,

163

options?: ModelOptions

164

): Promise<PreTrainedModel>;

165

}

166

167

class AutoModelForTokenClassification {

168

static async from_pretrained(

169

pretrained_model_name_or_path: string,

170

options?: ModelOptions

171

): Promise<PreTrainedModel>;

172

}

173

174

class AutoModelForQuestionAnswering {

175

static async from_pretrained(

176

pretrained_model_name_or_path: string,

177

options?: ModelOptions

178

): Promise<PreTrainedModel>;

179

}

180

181

class AutoModelForMaskedLM {

182

static async from_pretrained(

183

pretrained_model_name_or_path: string,

184

options?: ModelOptions

185

): Promise<PreTrainedModel>;

186

}

187

188

class AutoModelForCausalLM {

189

static async from_pretrained(

190

pretrained_model_name_or_path: string,

191

options?: ModelOptions

192

): Promise<PreTrainedModel>;

193

}

194

```

195

196

#### Sequence-to-Sequence Models

197

198

```javascript { .api }

199

class AutoModelForSeq2SeqLM {

200

static async from_pretrained(

201

pretrained_model_name_or_path: string,

202

options?: ModelOptions

203

): Promise<PreTrainedModel>;

204

}

205

206

class AutoModelForVision2Seq {

207

static async from_pretrained(

208

pretrained_model_name_or_path: string,

209

options?: ModelOptions

210

): Promise<PreTrainedModel>;

211

}

212

```

213

214

#### Vision Models

215

216

```javascript { .api }

217

class AutoModelForImageClassification {

218

static async from_pretrained(

219

pretrained_model_name_or_path: string,

220

options?: ModelOptions

221

): Promise<PreTrainedModel>;

222

}

223

224

class AutoModelForImageSegmentation {

225

static async from_pretrained(

226

pretrained_model_name_or_path: string,

227

options?: ModelOptions

228

): Promise<PreTrainedModel>;

229

}

230

231

class AutoModelForSemanticSegmentation {

232

static async from_pretrained(

233

pretrained_model_name_or_path: string,

234

options?: ModelOptions

235

): Promise<PreTrainedModel>;

236

}

237

238

class AutoModelForObjectDetection {

239

static async from_pretrained(

240

pretrained_model_name_or_path: string,

241

options?: ModelOptions

242

): Promise<PreTrainedModel>;

243

}

244

245

class AutoModelForZeroShotObjectDetection {

246

static async from_pretrained(

247

pretrained_model_name_or_path: string,

248

options?: ModelOptions

249

): Promise<PreTrainedModel>;

250

}

251

252

class AutoModelForDepthEstimation {

253

static async from_pretrained(

254

pretrained_model_name_or_path: string,

255

options?: ModelOptions

256

): Promise<PreTrainedModel>;

257

}

258

259

class AutoModelForImageToImage {

260

static async from_pretrained(

261

pretrained_model_name_or_path: string,

262

options?: ModelOptions

263

): Promise<PreTrainedModel>;

264

}

265

266

class AutoModelForImageFeatureExtraction {

267

static async from_pretrained(

268

pretrained_model_name_or_path: string,

269

options?: ModelOptions

270

): Promise<PreTrainedModel>;

271

}

272

```

273

274

#### Audio Models

275

276

```javascript { .api }

277

class AutoModelForAudioClassification {

278

static async from_pretrained(

279

pretrained_model_name_or_path: string,

280

options?: ModelOptions

281

): Promise<PreTrainedModel>;

282

}

283

284

class AutoModelForSpeechSeq2Seq {

285

static async from_pretrained(

286

pretrained_model_name_or_path: string,

287

options?: ModelOptions

288

): Promise<PreTrainedModel>;

289

}

290

291

class AutoModelForCTC {

292

static async from_pretrained(

293

pretrained_model_name_or_path: string,

294

options?: ModelOptions

295

): Promise<PreTrainedModel>;

296

}

297

298

class AutoModelForAudioFrameClassification {

299

static async from_pretrained(

300

pretrained_model_name_or_path: string,

301

options?: ModelOptions

302

): Promise<PreTrainedModel>;

303

}

304

305

class AutoModelForXVector {

306

static async from_pretrained(

307

pretrained_model_name_or_path: string,

308

options?: ModelOptions

309

): Promise<PreTrainedModel>;

310

}

311

312

class AutoModelForTextToWaveform {

313

static async from_pretrained(

314

pretrained_model_name_or_path: string,

315

options?: ModelOptions

316

): Promise<PreTrainedModel>;

317

}

318

319

class AutoModelForTextToSpectrogram {

320

static async from_pretrained(

321

pretrained_model_name_or_path: string,

322

options?: ModelOptions

323

): Promise<PreTrainedModel>;

324

}

325

```

326

327

#### Multimodal Models

328

329

```javascript { .api }

330

class AutoModelForDocumentQuestionAnswering {

331

static async from_pretrained(

332

pretrained_model_name_or_path: string,

333

options?: ModelOptions

334

): Promise<PreTrainedModel>;

335

}

336

337

class AutoModelForImageMatting {

338

static async from_pretrained(

339

pretrained_model_name_or_path: string,

340

options?: ModelOptions

341

): Promise<PreTrainedModel>;

342

}

343

344

class AutoModelForMaskGeneration {

345

static async from_pretrained(

346

pretrained_model_name_or_path: string,

347

options?: ModelOptions

348

): Promise<PreTrainedModel>;

349

}

350

```

351

352

### Base Model Classes

353

354

#### PreTrainedModel

355

356

Base class for all model implementations providing core functionality for inference and resource management.

357

358

```javascript { .api }

359

/**

360

* Base class for all pretrained models

361

*/

362

class PreTrainedModel {

363

/** Model configuration object */

364

config: any;

365

366

/**

367

* Run forward pass through the model

368

* @param model_inputs - Tokenized inputs or tensors

369

* @returns Promise resolving to model outputs

370

*/

371

async forward(model_inputs: any): Promise<any>;

372

373

/**

374

* Generate text using the model (for generation models)

375

* @param inputs - Input token IDs

376

* @param generation_config - Generation parameters

377

* @returns Promise resolving to generated token sequences

378

*/

379

async generate(

380

inputs: Tensor,

381

generation_config?: GenerationConfig

382

): Promise<Tensor[]>;

383

384

/**

385

* Dispose of model resources

386

*/

387

async dispose(): Promise<void>;

388

}

389

390

interface GenerationConfig {

391

/** Maximum number of new tokens to generate */

392

max_new_tokens?: number;

393

/** Maximum total length of generated sequence */

394

max_length?: number;

395

/** Minimum number of new tokens to generate */

396

min_new_tokens?: number;

397

/** Whether to use sampling for generation */

398

do_sample?: boolean;

399

/** Sampling temperature; must be positive (values below 1.0 make output more deterministic, values above 1.0 make it more random) */

400

temperature?: number;

401

/** Top-k sampling parameter */

402

top_k?: number;

403

/** Top-p (nucleus) sampling parameter */

404

top_p?: number;

405

/** Repetition penalty to avoid repetitive text */

406

repetition_penalty?: number;

407

/** Number of beams for beam search */

408

num_beams?: number;

409

/** Whether to use early stopping in beam search */

410

early_stopping?: boolean;

411

}

412

```

413

414

#### PreTrainedTokenizer

415

416

Base class for all tokenizer implementations providing text encoding and decoding functionality.

417

418

```javascript { .api }

419

/**

420

* Base class for all pretrained tokenizers

421

*/

422

class PreTrainedTokenizer {

423

/**

424

* Tokenize and encode text input

425

* @param text - Input text to tokenize

426

* @param options - Tokenization options

427

* @returns Tokenized output with input_ids and attention_mask

428

*/

429

async encode(

430

text: string | string[],

431

options?: TokenizeOptions

432

): Promise<{

433

input_ids: Tensor;

434

attention_mask: Tensor;

435

[key: string]: Tensor;

436

}>;

437

438

/**

439

* Tokenize text by calling the tokenizer instance directly, e.g. `await tokenizer(text)` (alias for encode)

440

* @param text - Input text to tokenize

441

* @param options - Tokenization options

442

* @returns Tokenized output

443

*/

444

async __call__(

445

text: string | string[],

446

options?: TokenizeOptions

447

): Promise<{

448

input_ids: Tensor;

449

attention_mask: Tensor;

450

[key: string]: Tensor;

451

}>;

452

453

/**

454

* Decode token IDs back to text

455

* @param token_ids - Token IDs to decode

456

* @param options - Decoding options

457

* @returns Decoded text string

458

*/

459

decode(

460

token_ids: number[] | Tensor,

461

options?: DecodeOptions

462

): string;

463

464

/**

465

* Decode multiple sequences of token IDs

466

* @param sequences - Array of token ID sequences

467

* @param options - Decoding options

468

* @returns Array of decoded text strings

469

*/

470

batch_decode(

471

sequences: number[][] | Tensor[],

472

options?: DecodeOptions

473

): string[];

474

475

/**

476

* Get the vocabulary size

477

*/

478

get vocab_size(): number;

479

480

/**

481

* Dispose of tokenizer resources

482

*/

483

async dispose(): Promise<void>;

484

}

485

486

interface TokenizeOptions {

487

/** Add special tokens like [CLS], [SEP] */

488

add_special_tokens?: boolean;

489

/** Return attention mask */

490

return_attention_mask?: boolean;

491

/** Return token type IDs */

492

return_token_type_ids?: boolean;

493

/** Return tensor type ('pt' for PyTorch-like) */

494

return_tensors?: string;

495

/** Truncate to maximum length */

496

truncation?: boolean;

497

/** Maximum sequence length */

498

max_length?: number;

499

/** Padding strategy ('max_length', 'longest', etc.) */

500

padding?: boolean | string;

501

}

502

503

interface DecodeOptions {

504

/** Skip special tokens in decoded output */

505

skip_special_tokens?: boolean;

506

/** Clean up tokenization spaces */

507

clean_up_tokenization_spaces?: boolean;

508

}

509

```

510

511

### Specific Tokenizer Classes

512

513

The library includes numerous specific tokenizer implementations for different model architectures:

514

515

```javascript { .api }

516

// BERT family tokenizers

517

class BertTokenizer extends PreTrainedTokenizer {}

518

class DistilBertTokenizer extends PreTrainedTokenizer {}

519

class RobertaTokenizer extends PreTrainedTokenizer {}

520

class AlbertTokenizer extends PreTrainedTokenizer {}

521

522

// Transformer tokenizers

523

class T5Tokenizer extends PreTrainedTokenizer {}

524

class GPT2Tokenizer extends PreTrainedTokenizer {}

525

class BartTokenizer extends PreTrainedTokenizer {}

526

527

// Multilingual tokenizers

528

class XLMTokenizer extends PreTrainedTokenizer {}

529

class XLMRobertaTokenizer extends PreTrainedTokenizer {}

530

class MBartTokenizer extends PreTrainedTokenizer {}

531

class MBart50Tokenizer extends PreTrainedTokenizer {}

532

533

// Audio tokenizers

534

class WhisperTokenizer extends PreTrainedTokenizer {}

535

class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {}

536

class SpeechT5Tokenizer extends PreTrainedTokenizer {}

537

538

// Vision-language tokenizers

539

class CLIPTokenizer extends PreTrainedTokenizer {}

540

541

// And many more specific implementations...

542

```

543

544

## Usage Patterns

545

546

### Basic Model and Tokenizer Usage

547

548

```javascript

549

import { AutoModel, AutoTokenizer } from "@xenova/transformers";

550

551

// Load model and tokenizer

552

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");

553

const model = await AutoModel.from_pretrained("Xenova/bert-base-uncased");

554

555

// Tokenize input

556

const inputs = await tokenizer("Hello, world!");

557

console.log(inputs.input_ids); // Tensor with token IDs

558

559

// Run inference

560

const outputs = await model(inputs);

561

console.log(outputs.last_hidden_state); // Model embeddings

562

```

563

564

### Text Generation

565

566

```javascript

567

import { AutoModelForCausalLM, AutoTokenizer } from "@xenova/transformers";

568

569

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");

570

const model = await AutoModelForCausalLM.from_pretrained("Xenova/gpt2");

571

572

const inputs = await tokenizer("The future of AI is");

573

const outputs = await model.generate(inputs.input_ids, {

574

max_new_tokens: 50,

575

do_sample: true,

576

temperature: 0.7,

577

});

578

579

const generated_text = tokenizer.decode(outputs[0], {

580

skip_special_tokens: true,

581

});

582

console.log(generated_text);

583

```

584

585

### Sequence-to-Sequence Tasks

586

587

```javascript

588

import { AutoModelForSeq2SeqLM, AutoTokenizer } from "@xenova/transformers";

589

590

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/t5-small");

591

const model = await AutoModelForSeq2SeqLM.from_pretrained("Xenova/t5-small");

592

593

const inputs = await tokenizer("translate English to German: I love transformers!");

594

const outputs = await model.generate(inputs.input_ids);

595

const translation = tokenizer.decode(outputs[0], {

596

skip_special_tokens: true,

597

});

598

console.log(translation); // "Ich liebe Transformatoren!"

599

```

600

601

### Resource Management

602

603

```javascript

604

// Always dispose of models and tokenizers when done

605

await model.dispose();

606

await tokenizer.dispose();

607

```

608

609

### Using Specific Tokenizer Classes Directly

610

611

For advanced use cases that require direct access to specific tokenizer implementations, Transformers.js exports individual tokenizer classes.

612

613

#### Common Tokenizer Classes

614

615

Popular tokenizer implementations for direct instantiation when you need fine-grained control.

616

617

```javascript { .api }

618

/**

619

* BERT tokenizer with WordPiece tokenization

620

*/

621

class BertTokenizer extends PreTrainedTokenizer {

622

static async from_pretrained(

623

pretrained_model_name_or_path: string,

624

options?: TokenizerOptions

625

): Promise<BertTokenizer>;

626

}

627

628

/**

629

* GPT-2 tokenizer with BPE tokenization and chat template support

630

*/

631

class GPT2Tokenizer extends PreTrainedTokenizer {

632

static async from_pretrained(

633

pretrained_model_name_or_path: string,

634

options?: TokenizerOptions

635

): Promise<GPT2Tokenizer>;

636

637

/** Default chat template for conversation formatting */

638

get default_chat_template(): string;

639

}

640

641

/**

642

* T5 tokenizer for encoder-decoder models

643

*/

644

class T5Tokenizer extends PreTrainedTokenizer {

645

static async from_pretrained(

646

pretrained_model_name_or_path: string,

647

options?: TokenizerOptions

648

): Promise<T5Tokenizer>;

649

}

650

```

651

652

#### Language Model Tokenizers

653

654

Specialized tokenizers for modern language models.

655

656

```javascript { .api }

657

/**

658

* LLaMA tokenizer with chat template and legacy support

659

*/

660

class LlamaTokenizer extends PreTrainedTokenizer {

661

static async from_pretrained(

662

pretrained_model_name_or_path: string,

663

options?: TokenizerOptions

664

): Promise<LlamaTokenizer>;

665

666

/** Whether to use legacy behavior */

667

readonly legacy: boolean;

668

669

/** Whether to use default system prompt */

670

readonly use_default_system_prompt: boolean;

671

672

/** Get dynamic chat template with system prompt support */

673

get default_chat_template(): string;

674

675

/** Default system prompt for chat */

676

static readonly DEFAULT_SYSTEM_PROMPT: string;

677

}

678

679

/**

680

* Code Llama tokenizer (extends LlamaTokenizer)

681

*/

682

class CodeLlamaTokenizer extends LlamaTokenizer {

683

static async from_pretrained(

684

pretrained_model_name_or_path: string,

685

options?: TokenizerOptions

686

): Promise<CodeLlamaTokenizer>;

687

}

688

689

/**

690

* Gemma tokenizer with chat template support

691

*/

692

class GemmaTokenizer extends PreTrainedTokenizer {

693

static async from_pretrained(

694

pretrained_model_name_or_path: string,

695

options?: TokenizerOptions

696

): Promise<GemmaTokenizer>;

697

698

/** Default chat template for conversation formatting */

699

get default_chat_template(): string;

700

}

701

```

702

703

#### Audio and Speech Tokenizers

704

705

Tokenizers specialized for audio and speech processing models.

706

707

```javascript { .api }

708

/**

709

* Whisper tokenizer for automatic speech recognition

710

*/

711

class WhisperTokenizer extends PreTrainedTokenizer {

712

static async from_pretrained(

713

pretrained_model_name_or_path: string,

714

options?: TokenizerOptions

715

): Promise<WhisperTokenizer>;

716

717

/**

718

* Decode ASR sequences with timestamp support

719

* @param sequences - Token sequences to decode

720

* @param options - Decoding options including timestamp handling

721

*/

722

_decode_asr(

723

sequences: number[][],

724

options?: {

725

time_precision?: number;

726

return_timestamps?: boolean;

727

return_language?: boolean;

728

}

729

): string[];

730

731

/**

732

* Get decoder prompt IDs for language/task specification

733

* @param options - Language and task options

734

*/

735

get_decoder_prompt_ids(options?: {

736

language?: string;

737

task?: string;

738

no_timestamps?: boolean;

739

}): number[];

740

741

/**

742

* Combine tokens into words with language-specific boundary detection

743

* @param tokens - Array of token objects with timestamps

744

* @param language - Language code for boundary detection

745

*/

746

combineTokensIntoWords(

747

tokens: Array<{ text: string; timestamp: [number, number] }>,

748

language?: string

749

): Array<{ word: string; timestamp: [number, number] }>;

750

}

751

752

/**

753

* Wav2Vec2 CTC tokenizer for connectionist temporal classification

754

*/

755

class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {

756

static async from_pretrained(

757

pretrained_model_name_or_path: string,

758

options?: TokenizerOptions

759

): Promise<Wav2Vec2CTCTokenizer>;

760

}

761

```

762

763

#### Multilingual and Translation Tokenizers

764

765

Tokenizers for multilingual models and translation tasks.

766

767

```javascript { .api }

768

/**

769

* XLM-RoBERTa multilingual tokenizer

770

*/

771

class XLMRobertaTokenizer extends PreTrainedTokenizer {

772

static async from_pretrained(

773

pretrained_model_name_or_path: string,

774

options?: TokenizerOptions

775

): Promise<XLMRobertaTokenizer>;

776

}

777

778

/**

779

* mBART tokenizer for multilingual translation

780

*/

781

class MBartTokenizer extends PreTrainedTokenizer {

782

static async from_pretrained(

783

pretrained_model_name_or_path: string,

784

options?: TokenizerOptions

785

): Promise<MBartTokenizer>;

786

787

/** Supported language codes */

788

readonly language_codes: string[];

789

790

/** Language code to token mapping */

791

readonly lang_to_token: Record<string, string>;

792

793

/**

794

* Build translation inputs with language tokens

795

* @param raw_inputs - Input text(s)

796

* @param tokenizer_options - Tokenization options

797

* @param generate_kwargs - Generation parameters including src_lang/tgt_lang

798

*/

799

_build_translation_inputs(

800

raw_inputs: string | string[],

801

tokenizer_options: any,

802

generate_kwargs: { src_lang?: string; tgt_lang?: string }

803

): any;

804

}

805

806

/**

807

* NLLB tokenizer for No Language Left Behind translation

808

*/

809

class NllbTokenizer extends PreTrainedTokenizer {

810

static async from_pretrained(

811

pretrained_model_name_or_path: string,

812

options?: TokenizerOptions

813

): Promise<NllbTokenizer>;

814

815

/**

816

* Build translation inputs with NLLB language codes

817

*/

818

_build_translation_inputs(

819

raw_inputs: string | string[],

820

tokenizer_options: any,

821

generate_kwargs: { src_lang?: string; tgt_lang?: string }

822

): any;

823

}

824

```

825

826

**Usage Examples:**

827

828

```javascript

829

import {

830

WhisperTokenizer,

831

LlamaTokenizer,

832

BertTokenizer

833

} from "@xenova/transformers";

834

835

// Direct tokenizer instantiation

836

const whisperTokenizer = await WhisperTokenizer.from_pretrained("openai/whisper-base");

837

const llamaTokenizer = await LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf");

838

839

// Access specialized methods

840

const promptIds = whisperTokenizer.get_decoder_prompt_ids({

841

language: "english",

842

task: "transcribe"

843

});

844

845

// Use chat templates

846

const chatTemplate = llamaTokenizer.default_chat_template;

847

const conversation = [

848

{ role: "user", content: "Hello!" },

849

{ role: "assistant", content: "Hi there!" }

850

];

851

const formatted = await llamaTokenizer.apply_chat_template(conversation);

852

```

853

854

## Types

855

856

```javascript { .api }

857

interface ModelOutput {

858

last_hidden_state?: Tensor;

859

logits?: Tensor;

860

hidden_states?: Tensor[];

861

attentions?: Tensor[];

862

[key: string]: any;

863

}

864

865

interface Tensor {

866

data: TypedArray;

867

dims: number[];

868

type: string;

869

size: number;

870

}

871

```