Tessl Tile for npm/@xenova/transformers@2.17.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

index.md models-tokenizers.md pipelines.md processors.md utilities.md

docs/models-tokenizers.md

0
# Models and Tokenizers
1

2
This module provides Auto classes for automatic model and tokenizer selection, as well as direct access to specific model implementations for fine-grained control over model loading and inference.
3

4
## Capabilities
5

6
### Auto Classes - Automatic Selection
7

8
Auto classes automatically select the appropriate model or tokenizer implementation based on the model configuration, providing the most convenient interface for most use cases.
9

10
#### AutoModel
11

12
Loads the appropriate model architecture automatically based on the model configuration.
13

14
```javascript { .api }
15
/**
16
 * Instantiate a pretrained model automatically based on the model type
17
 * @param pretrained_model_name_or_path - Model identifier or path
18
 * @param options - Configuration options for model loading
19
 * @returns Promise resolving to the appropriate model instance
20
 */
21
class AutoModel {
22
  static async from_pretrained(
23
    pretrained_model_name_or_path: string,
24
    options?: ModelOptions
25
  ): Promise<PreTrainedModel>;
26
}
27

28
interface ModelOptions {
29
  /** Use quantized version of the model (default: true) */
30
  quantized?: boolean;
31
  /** Callback to track model download progress */
32
  progress_callback?: (progress: any) => void;
33
  /** Custom model configuration */
34
  config?: any;
35
  /** Directory to cache downloaded models */
36
  cache_dir?: string;
37
  /** Only use local files, don't download from remote */
38
  local_files_only?: boolean;
39
  /** Model revision/branch to use (default: 'main') */
40
  revision?: string;
41
  /** Specific model file name to use */
42
  model_file_name?: string;
43
}
44
```
45

46
**Usage Example:**
47

48
```javascript
49
import { AutoModel, AutoTokenizer } from "@xenova/transformers";
50

51
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");
52
const model = await AutoModel.from_pretrained("Xenova/bert-base-uncased");
53

54
const inputs = await tokenizer("I love transformers!");
55
const outputs = await model(inputs);
56
```
57

58
#### AutoConfig
59

60
Automatically loads model configuration from pretrained models.
61

62
```javascript { .api }
63
/**
64
 * Load model configuration automatically
65
 * @param pretrained_model_name_or_path - Model identifier or path
66
 * @param options - Configuration options for loading
67
 * @returns Promise resolving to model configuration
68
 */
69
class AutoConfig {
70
  static async from_pretrained(
71
    pretrained_model_name_or_path: string,
72
    options?: ConfigOptions
73
  ): Promise<PretrainedConfig>;
74
}
75

76
interface ConfigOptions {
77
  /** Directory to cache downloaded files */
78
  cache_dir?: string;
79
  /** Only use local files, don't download from remote */
80
  local_files_only?: boolean;
81
  /** Model revision/branch to use (default: 'main') */
82
  revision?: string;
83
}
84

85
interface PretrainedConfig {
86
  model_type: string;
87
  architectures?: string[];
88
  vocab_size?: number;
89
  hidden_size?: number;
90
  num_attention_heads?: number;
91
  num_hidden_layers?: number;
92
  max_position_embeddings?: number;
93
  [key: string]: any;
94
}
95
```
96

97
**Usage Example:**
98

99
```javascript
100
import { AutoConfig } from "@xenova/transformers";
101

102
const config = await AutoConfig.from_pretrained("Xenova/bert-base-uncased");
103
console.log(config.model_type); // "bert"
104
console.log(config.vocab_size); // 30522
105
```
106

107
#### AutoTokenizer
108

109
Automatically selects and loads the appropriate tokenizer based on the tokenizer configuration.
110

111
```javascript { .api }
112
/**
113
 * Instantiate a tokenizer automatically based on the tokenizer type
114
 * @param pretrained_model_name_or_path - Model identifier or path
115
 * @param options - Configuration options for tokenizer loading
116
 * @returns Promise resolving to the appropriate tokenizer instance
117
 */
118
class AutoTokenizer {
119
  static async from_pretrained(
120
    pretrained_model_name_or_path: string,
121
    options?: TokenizerOptions
122
  ): Promise<PreTrainedTokenizer>;
123
}
124

125
interface TokenizerOptions {
126
  /** Use quantized version (default: true) */
127
  quantized?: boolean;
128
  /** Callback to track download progress */
129
  progress_callback?: (progress: any) => void;
130
  /** Custom configuration */
131
  config?: any;
132
  /** Directory to cache downloaded files */
133
  cache_dir?: string;
134
  /** Only use local files, don't download from remote */
135
  local_files_only?: boolean;
136
  /** Model revision/branch to use (default: 'main') */
137
  revision?: string;
138
  /** Whether to use legacy tokenizer behavior */
139
  legacy?: boolean;
140
}
141
```
142

143
**Usage Example:**
144

145
```javascript
146
import { AutoTokenizer } from "@xenova/transformers";
147

148
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/t5-small");
149
const inputs = await tokenizer("translate English to German: Hello world");
150
const decoded = tokenizer.decode(inputs.input_ids[0]);
151
```
152

153
### Task-Specific Auto Model Classes
154

155
These classes automatically load models optimized for specific tasks:
156

157
#### Text Processing Models
158

159
```javascript { .api }
160
class AutoModelForSequenceClassification {
161
  static async from_pretrained(
162
    pretrained_model_name_or_path: string,
163
    options?: ModelOptions
164
  ): Promise<PreTrainedModel>;
165
}
166

167
class AutoModelForTokenClassification {
168
  static async from_pretrained(
169
    pretrained_model_name_or_path: string,
170
    options?: ModelOptions
171
  ): Promise<PreTrainedModel>;
172
}
173

174
class AutoModelForQuestionAnswering {
175
  static async from_pretrained(
176
    pretrained_model_name_or_path: string,
177
    options?: ModelOptions
178
  ): Promise<PreTrainedModel>;
179
}
180

181
class AutoModelForMaskedLM {
182
  static async from_pretrained(
183
    pretrained_model_name_or_path: string,
184
    options?: ModelOptions
185
  ): Promise<PreTrainedModel>;
186
}
187

188
class AutoModelForCausalLM {
189
  static async from_pretrained(
190
    pretrained_model_name_or_path: string,
191
    options?: ModelOptions
192
  ): Promise<PreTrainedModel>;
193
}
194
```
195

196
#### Sequence-to-Sequence Models
197

198
```javascript { .api }
199
class AutoModelForSeq2SeqLM {
200
  static async from_pretrained(
201
    pretrained_model_name_or_path: string,
202
    options?: ModelOptions
203
  ): Promise<PreTrainedModel>;
204
}
205

206
class AutoModelForVision2Seq {
207
  static async from_pretrained(
208
    pretrained_model_name_or_path: string,
209
    options?: ModelOptions
210
  ): Promise<PreTrainedModel>;
211
}
212
```
213

214
#### Vision Models
215

216
```javascript { .api }
217
class AutoModelForImageClassification {
218
  static async from_pretrained(
219
    pretrained_model_name_or_path: string,
220
    options?: ModelOptions
221
  ): Promise<PreTrainedModel>;
222
}
223

224
class AutoModelForImageSegmentation {
225
  static async from_pretrained(
226
    pretrained_model_name_or_path: string,
227
    options?: ModelOptions
228
  ): Promise<PreTrainedModel>;
229
}
230

231
class AutoModelForSemanticSegmentation {
232
  static async from_pretrained(
233
    pretrained_model_name_or_path: string,
234
    options?: ModelOptions
235
  ): Promise<PreTrainedModel>;
236
}
237

238
class AutoModelForObjectDetection {
239
  static async from_pretrained(
240
    pretrained_model_name_or_path: string,
241
    options?: ModelOptions
242
  ): Promise<PreTrainedModel>;
243
}
244

245
class AutoModelForZeroShotObjectDetection {
246
  static async from_pretrained(
247
    pretrained_model_name_or_path: string,
248
    options?: ModelOptions
249
  ): Promise<PreTrainedModel>;
250
}
251

252
class AutoModelForDepthEstimation {
253
  static async from_pretrained(
254
    pretrained_model_name_or_path: string,
255
    options?: ModelOptions
256
  ): Promise<PreTrainedModel>;
257
}
258

259
class AutoModelForImageToImage {
260
  static async from_pretrained(
261
    pretrained_model_name_or_path: string,
262
    options?: ModelOptions
263
  ): Promise<PreTrainedModel>;
264
}
265

266
class AutoModelForImageFeatureExtraction {
267
  static async from_pretrained(
268
    pretrained_model_name_or_path: string,
269
    options?: ModelOptions
270
  ): Promise<PreTrainedModel>;
271
}
272
```
273

274
#### Audio Models
275

276
```javascript { .api }
277
class AutoModelForAudioClassification {
278
  static async from_pretrained(
279
    pretrained_model_name_or_path: string,
280
    options?: ModelOptions
281
  ): Promise<PreTrainedModel>;
282
}
283

284
class AutoModelForSpeechSeq2Seq {
285
  static async from_pretrained(
286
    pretrained_model_name_or_path: string,
287
    options?: ModelOptions
288
  ): Promise<PreTrainedModel>;
289
}
290

291
class AutoModelForCTC {
292
  static async from_pretrained(
293
    pretrained_model_name_or_path: string,
294
    options?: ModelOptions
295
  ): Promise<PreTrainedModel>;
296
}
297

298
class AutoModelForAudioFrameClassification {
299
  static async from_pretrained(
300
    pretrained_model_name_or_path: string,
301
    options?: ModelOptions
302
  ): Promise<PreTrainedModel>;
303
}
304

305
class AutoModelForXVector {
306
  static async from_pretrained(
307
    pretrained_model_name_or_path: string,
308
    options?: ModelOptions
309
  ): Promise<PreTrainedModel>;
310
}
311

312
class AutoModelForTextToWaveform {
313
  static async from_pretrained(
314
    pretrained_model_name_or_path: string,
315
    options?: ModelOptions
316
  ): Promise<PreTrainedModel>;
317
}
318

319
class AutoModelForTextToSpectrogram {
320
  static async from_pretrained(
321
    pretrained_model_name_or_path: string,
322
    options?: ModelOptions
323
  ): Promise<PreTrainedModel>;
324
}
325
```
326

327
#### Multimodal Models
328

329
```javascript { .api }
330
class AutoModelForDocumentQuestionAnswering {
331
  static async from_pretrained(
332
    pretrained_model_name_or_path: string,
333
    options?: ModelOptions
334
  ): Promise<PreTrainedModel>;
335
}
336

337
class AutoModelForImageMatting {
338
  static async from_pretrained(
339
    pretrained_model_name_or_path: string,
340
    options?: ModelOptions
341
  ): Promise<PreTrainedModel>;
342
}
343

344
class AutoModelForMaskGeneration {
345
  static async from_pretrained(
346
    pretrained_model_name_or_path: string,
347
    options?: ModelOptions
348
  ): Promise<PreTrainedModel>;
349
}
350
```
351

352
### Base Model Classes
353

354
#### PreTrainedModel
355

356
Base class for all model implementations providing core functionality for inference and resource management.
357

358
```javascript { .api }
359
/**
360
 * Base class for all pretrained models
361
 */
362
class PreTrainedModel {
363
  /** Model configuration object */
364
  config: any;
365
  
366
  /**
367
   * Run forward pass through the model
368
   * @param model_inputs - Tokenized inputs or tensors
369
   * @returns Promise resolving to model outputs
370
   */
371
  async forward(model_inputs: any): Promise<any>;
372
  
373
  /**
374
   * Generate text using the model (for generation models)
375
   * @param inputs - Input token IDs
376
   * @param generation_config - Generation parameters
377
   * @returns Promise resolving to generated token sequences
378
   */
379
  async generate(
380
    inputs: Tensor,
381
    generation_config?: GenerationConfig
382
  ): Promise<Tensor[]>;
383
  
384
  /**
385
   * Dispose of model resources
386
   */
387
  async dispose(): Promise<void>;
388
}
389

390
interface GenerationConfig {
391
  /** Maximum number of new tokens to generate */
392
  max_new_tokens?: number;
393
  /** Maximum total length of generated sequence */
394
  max_length?: number;
395
  /** Minimum number of new tokens to generate */
396
  min_new_tokens?: number;
397
  /** Whether to use sampling for generation */
398
  do_sample?: boolean;
399
  /** Sampling temperature (must be > 0; default 1.0 — lower values make output more deterministic, higher values more random) */
400
  temperature?: number;
401
  /** Top-k sampling parameter */
402
  top_k?: number;
403
  /** Top-p (nucleus) sampling parameter */
404
  top_p?: number;
405
  /** Repetition penalty to avoid repetitive text */
406
  repetition_penalty?: number;
407
  /** Number of beams for beam search */
408
  num_beams?: number;
409
  /** Whether to use early stopping in beam search */
410
  early_stopping?: boolean;
411
}
412
```
413

414
#### PreTrainedTokenizer
415

416
Base class for all tokenizer implementations providing text encoding and decoding functionality.
417

418
```javascript { .api }
419
/**
420
 * Base class for all pretrained tokenizers
421
 */
422
class PreTrainedTokenizer {
423
  /**
424
   * Tokenize and encode text input
425
   * @param text - Input text to tokenize
426
   * @param options - Tokenization options
427
   * @returns Tokenized output with input_ids and attention_mask
428
   */
429
  async encode(
430
    text: string | string[],
431
    options?: TokenizeOptions
432
  ): Promise<{
433
    input_ids: Tensor;
434
    attention_mask: Tensor;
435
    [key: string]: Tensor;
436
  }>;
437
  
438
  /**
439
   * Tokenize text (alias for encode)
440
   * @param text - Input text to tokenize
441
   * @param options - Tokenization options
442
   * @returns Tokenized output
443
   */
444
  async __call__(
445
    text: string | string[],
446
    options?: TokenizeOptions
447
  ): Promise<{
448
    input_ids: Tensor;
449
    attention_mask: Tensor;
450
    [key: string]: Tensor;
451
  }>;
452
  
453
  /**
454
   * Decode token IDs back to text
455
   * @param token_ids - Token IDs to decode
456
   * @param options - Decoding options
457
   * @returns Decoded text string
458
   */
459
  decode(
460
    token_ids: number[] | Tensor,
461
    options?: DecodeOptions
462
  ): string;
463
  
464
  /**
465
   * Decode multiple sequences of token IDs
466
   * @param sequences - Array of token ID sequences
467
   * @param options - Decoding options
468
   * @returns Array of decoded text strings
469
   */
470
  batch_decode(
471
    sequences: number[][] | Tensor[],
472
    options?: DecodeOptions
473
  ): string[];
474
  
475
  /**
476
   * Get the vocabulary size
477
   */
478
  get vocab_size(): number;
479
  
480
  /**
481
   * Dispose of tokenizer resources
482
   */
483
  async dispose(): Promise<void>;
484
}
485

486
interface TokenizeOptions {
487
  /** Add special tokens like [CLS], [SEP] */
488
  add_special_tokens?: boolean;
489
  /** Return attention mask */
490
  return_attention_mask?: boolean;
491
  /** Return token type IDs */
492
  return_token_type_ids?: boolean;
493
  /** Return tensor type ('pt' for PyTorch-like) */
494
  return_tensors?: string;
495
  /** Truncate to maximum length */
496
  truncation?: boolean;
497
  /** Maximum sequence length */
498
  max_length?: number;
499
  /** Padding strategy ('max_length', 'longest', etc.) */
500
  padding?: boolean | string;
501
}
502

503
interface DecodeOptions {
504
  /** Skip special tokens in decoded output */
505
  skip_special_tokens?: boolean;
506
  /** Clean up tokenization spaces */
507
  clean_up_tokenization_spaces?: boolean;
508
}
509
```
510

511
### Specific Tokenizer Classes
512

513
The library includes numerous specific tokenizer implementations for different model architectures:
514

515
```javascript { .api }
516
// BERT family tokenizers
517
class BertTokenizer extends PreTrainedTokenizer {}
518
class DistilBertTokenizer extends PreTrainedTokenizer {}
519
class RobertaTokenizer extends PreTrainedTokenizer {}
520
class AlbertTokenizer extends PreTrainedTokenizer {}
521

522
// Transformer tokenizers
523
class T5Tokenizer extends PreTrainedTokenizer {}
524
class GPT2Tokenizer extends PreTrainedTokenizer {}
525
class BartTokenizer extends PreTrainedTokenizer {}
526

527
// Multilingual tokenizers
528
class XLMTokenizer extends PreTrainedTokenizer {}
529
class XLMRobertaTokenizer extends PreTrainedTokenizer {}
530
class MBartTokenizer extends PreTrainedTokenizer {}
531
class MBart50Tokenizer extends PreTrainedTokenizer {}
532

533
// Audio tokenizers
534
class WhisperTokenizer extends PreTrainedTokenizer {}
535
class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {}
536
class SpeechT5Tokenizer extends PreTrainedTokenizer {}
537

538
// Vision-language tokenizers
539
class CLIPTokenizer extends PreTrainedTokenizer {}
540

541
// And many more specific implementations...
542
```
543

544
## Usage Patterns
545

546
### Basic Model and Tokenizer Usage
547

548
```javascript
549
import { AutoModel, AutoTokenizer } from "@xenova/transformers";
550

551
// Load model and tokenizer
552
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");
553
const model = await AutoModel.from_pretrained("Xenova/bert-base-uncased");
554

555
// Tokenize input
556
const inputs = await tokenizer("Hello, world!");
557
console.log(inputs.input_ids); // Tensor with token IDs
558

559
// Run inference
560
const outputs = await model(inputs);
561
console.log(outputs.last_hidden_state); // Model embeddings
562
```
563

564
### Text Generation
565

566
```javascript
567
import { AutoModelForCausalLM, AutoTokenizer } from "@xenova/transformers";
568

569
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
570
const model = await AutoModelForCausalLM.from_pretrained("Xenova/gpt2");
571

572
const inputs = await tokenizer("The future of AI is");
573
const outputs = await model.generate(inputs.input_ids, {
574
  max_new_tokens: 50,
575
  do_sample: true,
576
  temperature: 0.7,
577
});
578

579
const generated_text = tokenizer.decode(outputs[0], {
580
  skip_special_tokens: true,
581
});
582
console.log(generated_text);
583
```
584

585
### Sequence-to-Sequence Tasks
586

587
```javascript
588
import { AutoModelForSeq2SeqLM, AutoTokenizer } from "@xenova/transformers";
589

590
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/t5-small");
591
const model = await AutoModelForSeq2SeqLM.from_pretrained("Xenova/t5-small");
592

593
const inputs = await tokenizer("translate English to German: I love transformers!");
594
const outputs = await model.generate(inputs.input_ids);
595
const translation = tokenizer.decode(outputs[0], {
596
  skip_special_tokens: true,
597
});
598
console.log(translation); // "Ich liebe Transformatoren!"
599
```
600

601
### Resource Management
602

603
```javascript
604
// Always dispose of models and tokenizers when done
605
await model.dispose();
606
await tokenizer.dispose();
607
```
608

609
### Specific Tokenizer Classes
610

611
For advanced use cases that require direct access to specific tokenizer implementations, Transformers.js exports individual tokenizer classes.
612

613
#### Common Tokenizer Classes
614

615
Popular tokenizer implementations for direct instantiation when you need fine-grained control.
616

617
```javascript { .api }
618
/**
619
 * BERT tokenizer with WordPiece tokenization
620
 */
621
class BertTokenizer extends PreTrainedTokenizer {
622
  static async from_pretrained(
623
    pretrained_model_name_or_path: string,
624
    options?: TokenizerOptions
625
  ): Promise<BertTokenizer>;
626
}
627

628
/**
629
 * GPT-2 tokenizer with BPE tokenization and chat template support
630
 */
631
class GPT2Tokenizer extends PreTrainedTokenizer {
632
  static async from_pretrained(
633
    pretrained_model_name_or_path: string,
634
    options?: TokenizerOptions
635
  ): Promise<GPT2Tokenizer>;
636
  
637
  /** Default chat template for conversation formatting */
638
  get default_chat_template(): string;
639
}
640

641
/**
642
 * T5 tokenizer for encoder-decoder models
643
 */
644
class T5Tokenizer extends PreTrainedTokenizer {
645
  static async from_pretrained(
646
    pretrained_model_name_or_path: string,
647
    options?: TokenizerOptions
648
  ): Promise<T5Tokenizer>;
649
}
650
```
651

652
#### Language Model Tokenizers
653

654
Specialized tokenizers for modern language models.
655

656
```javascript { .api }
657
/**
658
 * LLaMA tokenizer with chat template and legacy support
659
 */
660
class LlamaTokenizer extends PreTrainedTokenizer {
661
  static async from_pretrained(
662
    pretrained_model_name_or_path: string,
663
    options?: TokenizerOptions
664
  ): Promise<LlamaTokenizer>;
665
  
666
  /** Whether to use legacy behavior */
667
  readonly legacy: boolean;
668
  
669
  /** Whether to use default system prompt */
670
  readonly use_default_system_prompt: boolean;
671
  
672
  /** Get dynamic chat template with system prompt support */
673
  get default_chat_template(): string;
674
  
675
  /** Default system prompt for chat */
676
  static readonly DEFAULT_SYSTEM_PROMPT: string;
677
}
678

679
/**
680
 * Code Llama tokenizer (extends LlamaTokenizer)
681
 */
682
class CodeLlamaTokenizer extends LlamaTokenizer {
683
  static async from_pretrained(
684
    pretrained_model_name_or_path: string,
685
    options?: TokenizerOptions
686
  ): Promise<CodeLlamaTokenizer>;
687
}
688

689
/**
690
 * Gemma tokenizer with chat template support
691
 */
692
class GemmaTokenizer extends PreTrainedTokenizer {
693
  static async from_pretrained(
694
    pretrained_model_name_or_path: string,
695
    options?: TokenizerOptions
696
  ): Promise<GemmaTokenizer>;
697
  
698
  /** Default chat template for conversation formatting */
699
  get default_chat_template(): string;
700
}
701
```
702

703
#### Audio and Speech Tokenizers
704

705
Tokenizers specialized for audio and speech processing models.
706

707
```javascript { .api }
708
/**
709
 * Whisper tokenizer for automatic speech recognition
710
 */
711
class WhisperTokenizer extends PreTrainedTokenizer {
712
  static async from_pretrained(
713
    pretrained_model_name_or_path: string,
714
    options?: TokenizerOptions
715
  ): Promise<WhisperTokenizer>;
716
  
717
  /**
718
   * Decode ASR sequences with timestamp support
719
   * @param sequences - Token sequences to decode
720
   * @param options - Decoding options including timestamp handling
721
   */
722
  _decode_asr(
723
    sequences: number[][],
724
    options?: {
725
      time_precision?: number;
726
      return_timestamps?: boolean;
727
      return_language?: boolean;
728
    }
729
  ): string[];
730
  
731
  /**
732
   * Get decoder prompt IDs for language/task specification
733
   * @param options - Language and task options
734
   */
735
  get_decoder_prompt_ids(options?: {
736
    language?: string;
737
    task?: string;
738
    no_timestamps?: boolean;
739
  }): number[];
740
  
741
  /**
742
   * Combine tokens into words with language-specific boundary detection
743
   * @param tokens - Array of token objects with timestamps
744
   * @param language - Language code for boundary detection
745
   */
746
  combineTokensIntoWords(
747
    tokens: Array<{ text: string; timestamp: [number, number] }>,
748
    language?: string
749
  ): Array<{ word: string; timestamp: [number, number] }>;
750
}
751

752
/**
753
 * Wav2Vec2 CTC tokenizer for connectionist temporal classification
754
 */
755
class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {
756
  static async from_pretrained(
757
    pretrained_model_name_or_path: string,
758
    options?: TokenizerOptions
759
  ): Promise<Wav2Vec2CTCTokenizer>;
760
}
761
```
762

763
#### Multilingual and Translation Tokenizers
764

765
Tokenizers for multilingual models and translation tasks.
766

767
```javascript { .api }
768
/**
769
 * XLM-RoBERTa multilingual tokenizer
770
 */
771
class XLMRobertaTokenizer extends PreTrainedTokenizer {
772
  static async from_pretrained(
773
    pretrained_model_name_or_path: string,
774
    options?: TokenizerOptions
775
  ): Promise<XLMRobertaTokenizer>;
776
}
777

778
/**
779
 * mBART tokenizer for multilingual translation
780
 */
781
class MBartTokenizer extends PreTrainedTokenizer {
782
  static async from_pretrained(
783
    pretrained_model_name_or_path: string,
784
    options?: TokenizerOptions
785
  ): Promise<MBartTokenizer>;
786
  
787
  /** Supported language codes */
788
  readonly language_codes: string[];
789
  
790
  /** Language code to token mapping */
791
  readonly lang_to_token: Record<string, string>;
792
  
793
  /**
794
   * Build translation inputs with language tokens
795
   * @param raw_inputs - Input text(s)
796
   * @param tokenizer_options - Tokenization options
797
   * @param generate_kwargs - Generation parameters including src_lang/tgt_lang
798
   */
799
  _build_translation_inputs(
800
    raw_inputs: string | string[],
801
    tokenizer_options: any,
802
    generate_kwargs: { src_lang?: string; tgt_lang?: string }
803
  ): any;
804
}
805

806
/**
807
 * NLLB tokenizer for No Language Left Behind translation
808
 */
809
class NllbTokenizer extends PreTrainedTokenizer {
810
  static async from_pretrained(
811
    pretrained_model_name_or_path: string,
812
    options?: TokenizerOptions
813
  ): Promise<NllbTokenizer>;
814
  
815
  /**
816
   * Build translation inputs with NLLB language codes
817
   */
818
  _build_translation_inputs(
819
    raw_inputs: string | string[],
820
    tokenizer_options: any,
821
    generate_kwargs: { src_lang?: string; tgt_lang?: string }
822
  ): any;
823
}
824
```
825

826
**Usage Examples:**
827

828
```javascript
829
import { 
830
  WhisperTokenizer, 
831
  LlamaTokenizer, 
832
  BertTokenizer 
833
} from "@xenova/transformers";
834

835
// Direct tokenizer instantiation
836
const whisperTokenizer = await WhisperTokenizer.from_pretrained("openai/whisper-base");
837
const llamaTokenizer = await LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf");
838

839
// Access specialized methods
840
const promptIds = whisperTokenizer.get_decoder_prompt_ids({
841
  language: "english",
842
  task: "transcribe"
843
});
844

845
// Use chat templates
846
const chatTemplate = llamaTokenizer.default_chat_template;
847
const conversation = [
848
  { role: "user", content: "Hello!" },
849
  { role: "assistant", content: "Hi there!" }
850
];
851
const formatted = await llamaTokenizer.apply_chat_template(conversation);
852
```
853

854
## Types
855

856
```javascript { .api }
857
interface ModelOutput {
858
  last_hidden_state?: Tensor;
859
  logits?: Tensor;
860
  hidden_states?: Tensor[];
861
  attentions?: Tensor[];
862
  [key: string]: any;
863
}
864

865
interface Tensor {
866
  data: TypedArray;
867
  dims: number[];
868
  type: string;
869
  size: number;
870
}
871
```

Version

Tile

Files

docs/models-tokenizers.md

docs/models-tokenizers.md