0
# Models and Tokenizers
1
2
This module provides Auto classes for automatic model and tokenizer selection, as well as direct access to the base model and tokenizer classes and to specific tokenizer implementations for fine-grained control over loading and inference.
3
4
## Capabilities
5
6
### Auto Classes - Automatic Selection
7
8
Auto classes automatically select the appropriate model or tokenizer implementation based on the model configuration, which makes them the most convenient interface for the majority of use cases.
9
10
#### AutoModel
11
12
Loads the appropriate model architecture automatically based on the model configuration.
13
14
```javascript { .api }
15
/**
16
* Instantiate a pretrained model automatically based on the model type
17
* @param pretrained_model_name_or_path - Model identifier or path
18
* @param options - Configuration options for model loading
19
* @returns Promise resolving to the appropriate model instance
20
*/
21
class AutoModel {
22
static async from_pretrained(
23
pretrained_model_name_or_path: string,
24
options?: ModelOptions
25
): Promise<PreTrainedModel>;
26
}
27
28
interface ModelOptions {
29
/** Use quantized version of the model (default: true) */
30
quantized?: boolean;
31
/** Callback to track model download progress */
32
progress_callback?: (progress: any) => void;
33
/** Custom model configuration */
34
config?: any;
35
/** Directory to cache downloaded models */
36
cache_dir?: string;
37
/** Only use local files, don't download from remote */
38
local_files_only?: boolean;
39
/** Model revision/branch to use (default: 'main') */
40
revision?: string;
41
/** Specific model file name to use */
42
model_file_name?: string;
43
}
44
```
45
46
**Usage Example:**
47
48
```javascript
49
import { AutoModel, AutoTokenizer } from "@xenova/transformers";
50
51
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");
52
const model = await AutoModel.from_pretrained("Xenova/bert-base-uncased");
53
54
const inputs = await tokenizer("I love transformers!");
55
const outputs = await model(inputs);
56
```
57
58
#### AutoConfig
59
60
Automatically loads model configuration from pretrained models.
61
62
```javascript { .api }
63
/**
64
* Load model configuration automatically
65
* @param pretrained_model_name_or_path - Model identifier or path
66
* @param options - Configuration options for loading
67
* @returns Promise resolving to model configuration
68
*/
69
class AutoConfig {
70
static async from_pretrained(
71
pretrained_model_name_or_path: string,
72
options?: ConfigOptions
73
): Promise<PretrainedConfig>;
74
}
75
76
interface ConfigOptions {
77
/** Directory to cache downloaded files */
78
cache_dir?: string;
79
/** Only use local files, don't download from remote */
80
local_files_only?: boolean;
81
/** Model revision/branch to use (default: 'main') */
82
revision?: string;
83
}
84
85
interface PretrainedConfig {
86
model_type: string;
87
architectures?: string[];
88
vocab_size?: number;
89
hidden_size?: number;
90
num_attention_heads?: number;
91
num_hidden_layers?: number;
92
max_position_embeddings?: number;
93
[key: string]: any;
94
}
95
```
96
97
**Usage Example:**
98
99
```javascript
100
import { AutoConfig } from "@xenova/transformers";
101
102
const config = await AutoConfig.from_pretrained("Xenova/bert-base-uncased");
103
console.log(config.model_type); // "bert"
104
console.log(config.vocab_size); // 30522
105
```
106
107
#### AutoTokenizer
108
109
Automatically selects and loads the appropriate tokenizer based on the tokenizer configuration.
110
111
```javascript { .api }
112
/**
113
* Instantiate a tokenizer automatically based on the tokenizer type
114
* @param pretrained_model_name_or_path - Model identifier or path
115
* @param options - Configuration options for tokenizer loading
116
* @returns Promise resolving to the appropriate tokenizer instance
117
*/
118
class AutoTokenizer {
119
static async from_pretrained(
120
pretrained_model_name_or_path: string,
121
options?: TokenizerOptions
122
): Promise<PreTrainedTokenizer>;
123
}
124
125
interface TokenizerOptions {
126
/** Use quantized version (default: true) */
127
quantized?: boolean;
128
/** Callback to track download progress */
129
progress_callback?: (progress: any) => void;
130
/** Custom configuration */
131
config?: any;
132
/** Directory to cache downloaded files */
133
cache_dir?: string;
134
/** Only use local files, don't download from remote */
135
local_files_only?: boolean;
136
/** Model revision/branch to use (default: 'main') */
137
revision?: string;
138
/** Whether to use legacy tokenizer behavior */
139
legacy?: boolean;
140
}
141
```
142
143
**Usage Example:**
144
145
```javascript
146
import { AutoTokenizer } from "@xenova/transformers";
147
148
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/t5-small");
149
const inputs = await tokenizer("translate English to German: Hello world");
150
const decoded = tokenizer.decode(inputs.input_ids[0]);
151
```
152
153
### Task-Specific Auto Model Classes
154
155
These classes automatically load models optimized for specific tasks:
156
157
#### Text Processing Models
158
159
```javascript { .api }
160
class AutoModelForSequenceClassification {
161
static async from_pretrained(
162
pretrained_model_name_or_path: string,
163
options?: ModelOptions
164
): Promise<PreTrainedModel>;
165
}
166
167
class AutoModelForTokenClassification {
168
static async from_pretrained(
169
pretrained_model_name_or_path: string,
170
options?: ModelOptions
171
): Promise<PreTrainedModel>;
172
}
173
174
class AutoModelForQuestionAnswering {
175
static async from_pretrained(
176
pretrained_model_name_or_path: string,
177
options?: ModelOptions
178
): Promise<PreTrainedModel>;
179
}
180
181
class AutoModelForMaskedLM {
182
static async from_pretrained(
183
pretrained_model_name_or_path: string,
184
options?: ModelOptions
185
): Promise<PreTrainedModel>;
186
}
187
188
class AutoModelForCausalLM {
189
static async from_pretrained(
190
pretrained_model_name_or_path: string,
191
options?: ModelOptions
192
): Promise<PreTrainedModel>;
193
}
194
```
195
196
#### Sequence-to-Sequence Models
197
198
```javascript { .api }
199
class AutoModelForSeq2SeqLM {
200
static async from_pretrained(
201
pretrained_model_name_or_path: string,
202
options?: ModelOptions
203
): Promise<PreTrainedModel>;
204
}
205
206
class AutoModelForVision2Seq {
207
static async from_pretrained(
208
pretrained_model_name_or_path: string,
209
options?: ModelOptions
210
): Promise<PreTrainedModel>;
211
}
212
```
213
214
#### Vision Models
215
216
```javascript { .api }
217
class AutoModelForImageClassification {
218
static async from_pretrained(
219
pretrained_model_name_or_path: string,
220
options?: ModelOptions
221
): Promise<PreTrainedModel>;
222
}
223
224
class AutoModelForImageSegmentation {
225
static async from_pretrained(
226
pretrained_model_name_or_path: string,
227
options?: ModelOptions
228
): Promise<PreTrainedModel>;
229
}
230
231
class AutoModelForSemanticSegmentation {
232
static async from_pretrained(
233
pretrained_model_name_or_path: string,
234
options?: ModelOptions
235
): Promise<PreTrainedModel>;
236
}
237
238
class AutoModelForObjectDetection {
239
static async from_pretrained(
240
pretrained_model_name_or_path: string,
241
options?: ModelOptions
242
): Promise<PreTrainedModel>;
243
}
244
245
class AutoModelForZeroShotObjectDetection {
246
static async from_pretrained(
247
pretrained_model_name_or_path: string,
248
options?: ModelOptions
249
): Promise<PreTrainedModel>;
250
}
251
252
class AutoModelForDepthEstimation {
253
static async from_pretrained(
254
pretrained_model_name_or_path: string,
255
options?: ModelOptions
256
): Promise<PreTrainedModel>;
257
}
258
259
class AutoModelForImageToImage {
260
static async from_pretrained(
261
pretrained_model_name_or_path: string,
262
options?: ModelOptions
263
): Promise<PreTrainedModel>;
264
}
265
266
class AutoModelForImageFeatureExtraction {
267
static async from_pretrained(
268
pretrained_model_name_or_path: string,
269
options?: ModelOptions
270
): Promise<PreTrainedModel>;
271
}
272
```
273
274
#### Audio Models
275
276
```javascript { .api }
277
class AutoModelForAudioClassification {
278
static async from_pretrained(
279
pretrained_model_name_or_path: string,
280
options?: ModelOptions
281
): Promise<PreTrainedModel>;
282
}
283
284
class AutoModelForSpeechSeq2Seq {
285
static async from_pretrained(
286
pretrained_model_name_or_path: string,
287
options?: ModelOptions
288
): Promise<PreTrainedModel>;
289
}
290
291
class AutoModelForCTC {
292
static async from_pretrained(
293
pretrained_model_name_or_path: string,
294
options?: ModelOptions
295
): Promise<PreTrainedModel>;
296
}
297
298
class AutoModelForAudioFrameClassification {
299
static async from_pretrained(
300
pretrained_model_name_or_path: string,
301
options?: ModelOptions
302
): Promise<PreTrainedModel>;
303
}
304
305
class AutoModelForXVector {
306
static async from_pretrained(
307
pretrained_model_name_or_path: string,
308
options?: ModelOptions
309
): Promise<PreTrainedModel>;
310
}
311
312
class AutoModelForTextToWaveform {
313
static async from_pretrained(
314
pretrained_model_name_or_path: string,
315
options?: ModelOptions
316
): Promise<PreTrainedModel>;
317
}
318
319
class AutoModelForTextToSpectrogram {
320
static async from_pretrained(
321
pretrained_model_name_or_path: string,
322
options?: ModelOptions
323
): Promise<PreTrainedModel>;
324
}
325
```
326
327
#### Multimodal Models
328
329
```javascript { .api }
330
class AutoModelForDocumentQuestionAnswering {
331
static async from_pretrained(
332
pretrained_model_name_or_path: string,
333
options?: ModelOptions
334
): Promise<PreTrainedModel>;
335
}
336
337
class AutoModelForImageMatting {
338
static async from_pretrained(
339
pretrained_model_name_or_path: string,
340
options?: ModelOptions
341
): Promise<PreTrainedModel>;
342
}
343
344
class AutoModelForMaskGeneration {
345
static async from_pretrained(
346
pretrained_model_name_or_path: string,
347
options?: ModelOptions
348
): Promise<PreTrainedModel>;
349
}
350
```
351
352
### Base Model Classes
353
354
#### PreTrainedModel
355
356
Base class for all model implementations, providing core functionality for inference and resource management.
357
358
```javascript { .api }
359
/**
360
* Base class for all pretrained models
361
*/
362
class PreTrainedModel {
363
/** Model configuration object */
364
config: any;
365
366
/**
367
* Run forward pass through the model
368
* @param model_inputs - Tokenized inputs or tensors
369
* @returns Promise resolving to model outputs
370
*/
371
async forward(model_inputs: any): Promise<any>;
372
373
/**
374
* Generate text using the model (for generation models)
375
* @param inputs - Input token IDs
376
* @param generation_config - Generation parameters
377
* @returns Promise resolving to generated token sequences
378
*/
379
async generate(
380
inputs: Tensor,
381
generation_config?: GenerationConfig
382
): Promise<Tensor[]>;
383
384
/**
385
* Dispose of model resources
386
*/
387
async dispose(): Promise<void>;
388
}
389
390
interface GenerationConfig {
391
/** Maximum number of new tokens to generate */
392
max_new_tokens?: number;
393
/** Maximum total length of generated sequence */
394
max_length?: number;
395
/** Minimum number of new tokens to generate */
396
min_new_tokens?: number;
397
/** Whether to use sampling for generation */
398
do_sample?: boolean;
399
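/** Sampling temperature; must be > 0 (values below 1.0 make output more deterministic, values above 1.0 more random). Only used when do_sample is true */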
400
temperature?: number;
401
/** Top-k sampling parameter */
402
top_k?: number;
403
/** Top-p (nucleus) sampling parameter */
404
top_p?: number;
405
/** Repetition penalty to avoid repetitive text */
406
repetition_penalty?: number;
407
/** Number of beams for beam search */
408
num_beams?: number;
409
/** Whether to use early stopping in beam search */
410
early_stopping?: boolean;
411
}
412
```
413
414
#### PreTrainedTokenizer
415
416
Base class for all tokenizer implementations, providing text encoding and decoding functionality.
417
418
```javascript { .api }
419
/**
420
* Base class for all pretrained tokenizers
421
*/
422
class PreTrainedTokenizer {
423
/**
424
* Tokenize and encode text input
425
* @param text - Input text to tokenize
426
* @param options - Tokenization options
427
* @returns Tokenized output with input_ids and attention_mask
428
*/
429
async encode(
430
text: string | string[],
431
options?: TokenizeOptions
432
): Promise<{
433
input_ids: Tensor;
434
attention_mask: Tensor;
435
[key: string]: Tensor;
436
}>;
437
438
/**
439
* Tokenize text (alias for encode)
440
* @param text - Input text to tokenize
441
* @param options - Tokenization options
442
* @returns Tokenized output
443
*/
444
async __call__(
445
text: string | string[],
446
options?: TokenizeOptions
447
): Promise<{
448
input_ids: Tensor;
449
attention_mask: Tensor;
450
[key: string]: Tensor;
451
}>;
452
453
/**
454
* Decode token IDs back to text
455
* @param token_ids - Token IDs to decode
456
* @param options - Decoding options
457
* @returns Decoded text string
458
*/
459
decode(
460
token_ids: number[] | Tensor,
461
options?: DecodeOptions
462
): string;
463
464
/**
465
* Decode multiple sequences of token IDs
466
* @param sequences - Array of token ID sequences
467
* @param options - Decoding options
468
* @returns Array of decoded text strings
469
*/
470
batch_decode(
471
sequences: number[][] | Tensor[],
472
options?: DecodeOptions
473
): string[];
474
475
/**
476
* Get the vocabulary size
477
*/
478
get vocab_size(): number;
479
480
/**
481
* Dispose of tokenizer resources
482
*/
483
async dispose(): Promise<void>;
484
}
485
486
interface TokenizeOptions {
487
/** Add special tokens like [CLS], [SEP] */
488
add_special_tokens?: boolean;
489
/** Return attention mask */
490
return_attention_mask?: boolean;
491
/** Return token type IDs */
492
return_token_type_ids?: boolean;
493
/** Return tensor type ('pt' for PyTorch-like) */
494
return_tensors?: string;
495
/** Truncate to maximum length */
496
truncation?: boolean;
497
/** Maximum sequence length */
498
max_length?: number;
499
/** Padding strategy ('max_length', 'longest', etc.) */
500
padding?: boolean | string;
501
}
502
503
interface DecodeOptions {
504
/** Skip special tokens in decoded output */
505
skip_special_tokens?: boolean;
506
/** Clean up tokenization spaces */
507
clean_up_tokenization_spaces?: boolean;
508
}
509
```
510
511
### Specific Tokenizer Classes
512
513
The library includes numerous specific tokenizer implementations for different model architectures:
514
515
```javascript { .api }
516
// BERT family tokenizers
517
class BertTokenizer extends PreTrainedTokenizer {}
518
class DistilBertTokenizer extends PreTrainedTokenizer {}
519
class RobertaTokenizer extends PreTrainedTokenizer {}
520
class AlbertTokenizer extends PreTrainedTokenizer {}
521
522
// Transformer tokenizers
523
class T5Tokenizer extends PreTrainedTokenizer {}
524
class GPT2Tokenizer extends PreTrainedTokenizer {}
525
class BartTokenizer extends PreTrainedTokenizer {}
526
527
// Multilingual tokenizers
528
class XLMTokenizer extends PreTrainedTokenizer {}
529
class XLMRobertaTokenizer extends PreTrainedTokenizer {}
530
class MBartTokenizer extends PreTrainedTokenizer {}
531
class MBart50Tokenizer extends PreTrainedTokenizer {}
532
533
// Audio tokenizers
534
class WhisperTokenizer extends PreTrainedTokenizer {}
535
class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {}
536
class SpeechT5Tokenizer extends PreTrainedTokenizer {}
537
538
// Vision-language tokenizers
539
class CLIPTokenizer extends PreTrainedTokenizer {}
540
541
// And many more specific implementations...
542
```
543
544
## Usage Patterns
545
546
### Basic Model and Tokenizer Usage
547
548
```javascript
549
import { AutoModel, AutoTokenizer } from "@xenova/transformers";
550
551
// Load model and tokenizer
552
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");
553
const model = await AutoModel.from_pretrained("Xenova/bert-base-uncased");
554
555
// Tokenize input
556
const inputs = await tokenizer("Hello, world!");
557
console.log(inputs.input_ids); // Tensor with token IDs
558
559
// Run inference
560
const outputs = await model(inputs);
561
console.log(outputs.last_hidden_state); // Model embeddings
562
```
563
564
### Text Generation
565
566
```javascript
567
import { AutoModelForCausalLM, AutoTokenizer } from "@xenova/transformers";
568
569
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");
570
const model = await AutoModelForCausalLM.from_pretrained("Xenova/gpt2");
571
572
const inputs = await tokenizer("The future of AI is");
573
const outputs = await model.generate(inputs.input_ids, {
574
max_new_tokens: 50,
575
do_sample: true,
576
temperature: 0.7,
577
});
578
579
const generated_text = tokenizer.decode(outputs[0], {
580
skip_special_tokens: true,
581
});
582
console.log(generated_text);
583
```
584
585
### Sequence-to-Sequence Tasks
586
587
```javascript
588
import { AutoModelForSeq2SeqLM, AutoTokenizer } from "@xenova/transformers";
589
590
const tokenizer = await AutoTokenizer.from_pretrained("Xenova/t5-small");
591
const model = await AutoModelForSeq2SeqLM.from_pretrained("Xenova/t5-small");
592
593
const inputs = await tokenizer("translate English to German: I love transformers!");
594
const outputs = await model.generate(inputs.input_ids);
595
const translation = tokenizer.decode(outputs[0], {
596
skip_special_tokens: true,
597
});
598
console.log(translation); // "Ich liebe Transformatoren!"
599
```
600
601
### Resource Management
602
603
```javascript
604
// Always dispose of models and tokenizers when done
605
await model.dispose();
606
await tokenizer.dispose();
607
```
608
609
### Specific Tokenizer Classes: Detailed API
610
611
For advanced use cases that require direct access to specific tokenizer implementations, Transformers.js exports individual tokenizer classes.
612
613
#### Common Tokenizer Classes
614
615
Popular tokenizer implementations for direct instantiation when you need fine-grained control.
616
617
```javascript { .api }
618
/**
619
* BERT tokenizer with WordPiece tokenization
620
*/
621
class BertTokenizer extends PreTrainedTokenizer {
622
static async from_pretrained(
623
pretrained_model_name_or_path: string,
624
options?: TokenizerOptions
625
): Promise<BertTokenizer>;
626
}
627
628
/**
629
* GPT-2 tokenizer with BPE tokenization and chat template support
630
*/
631
class GPT2Tokenizer extends PreTrainedTokenizer {
632
static async from_pretrained(
633
pretrained_model_name_or_path: string,
634
options?: TokenizerOptions
635
): Promise<GPT2Tokenizer>;
636
637
/** Default chat template for conversation formatting */
638
get default_chat_template(): string;
639
}
640
641
/**
642
* T5 tokenizer for encoder-decoder models
643
*/
644
class T5Tokenizer extends PreTrainedTokenizer {
645
static async from_pretrained(
646
pretrained_model_name_or_path: string,
647
options?: TokenizerOptions
648
): Promise<T5Tokenizer>;
649
}
650
```
651
652
#### Language Model Tokenizers
653
654
Specialized tokenizers for modern language models.
655
656
```javascript { .api }
657
/**
658
* LLaMA tokenizer with chat template and legacy support
659
*/
660
class LlamaTokenizer extends PreTrainedTokenizer {
661
static async from_pretrained(
662
pretrained_model_name_or_path: string,
663
options?: TokenizerOptions
664
): Promise<LlamaTokenizer>;
665
666
/** Whether to use legacy behavior */
667
readonly legacy: boolean;
668
669
/** Whether to use default system prompt */
670
readonly use_default_system_prompt: boolean;
671
672
/** Get dynamic chat template with system prompt support */
673
get default_chat_template(): string;
674
675
/** Default system prompt for chat */
676
static readonly DEFAULT_SYSTEM_PROMPT: string;
677
}
678
679
/**
680
* Code Llama tokenizer (extends LlamaTokenizer)
681
*/
682
class CodeLlamaTokenizer extends LlamaTokenizer {
683
static async from_pretrained(
684
pretrained_model_name_or_path: string,
685
options?: TokenizerOptions
686
): Promise<CodeLlamaTokenizer>;
687
}
688
689
/**
690
* Gemma tokenizer with chat template support
691
*/
692
class GemmaTokenizer extends PreTrainedTokenizer {
693
static async from_pretrained(
694
pretrained_model_name_or_path: string,
695
options?: TokenizerOptions
696
): Promise<GemmaTokenizer>;
697
698
/** Default chat template for conversation formatting */
699
get default_chat_template(): string;
700
}
701
```
702
703
#### Audio and Speech Tokenizers
704
705
Tokenizers specialized for audio and speech processing models.
706
707
```javascript { .api }
708
/**
709
* Whisper tokenizer for automatic speech recognition
710
*/
711
class WhisperTokenizer extends PreTrainedTokenizer {
712
static async from_pretrained(
713
pretrained_model_name_or_path: string,
714
options?: TokenizerOptions
715
): Promise<WhisperTokenizer>;
716
717
/**
718
* Decode ASR sequences with timestamp support
719
* @param sequences - Token sequences to decode
720
* @param options - Decoding options including timestamp handling
721
*/
722
_decode_asr(
723
sequences: number[][],
724
options?: {
725
time_precision?: number;
726
return_timestamps?: boolean;
727
return_language?: boolean;
728
}
729
): string[];
730
731
/**
732
* Get decoder prompt IDs for language/task specification
733
* @param options - Language and task options
734
*/
735
get_decoder_prompt_ids(options?: {
736
language?: string;
737
task?: string;
738
no_timestamps?: boolean;
739
}): number[];
740
741
/**
742
* Combine tokens into words with language-specific boundary detection
743
* @param tokens - Array of token objects with timestamps
744
* @param language - Language code for boundary detection
745
*/
746
combineTokensIntoWords(
747
tokens: Array<{ text: string; timestamp: [number, number] }>,
748
language?: string
749
): Array<{ word: string; timestamp: [number, number] }>;
750
}
751
752
/**
753
* Wav2Vec2 CTC tokenizer for connectionist temporal classification
754
*/
755
class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer {
756
static async from_pretrained(
757
pretrained_model_name_or_path: string,
758
options?: TokenizerOptions
759
): Promise<Wav2Vec2CTCTokenizer>;
760
}
761
```
762
763
#### Multilingual and Translation Tokenizers
764
765
Tokenizers for multilingual models and translation tasks.
766
767
```javascript { .api }
768
/**
769
* XLM-RoBERTa multilingual tokenizer
770
*/
771
class XLMRobertaTokenizer extends PreTrainedTokenizer {
772
static async from_pretrained(
773
pretrained_model_name_or_path: string,
774
options?: TokenizerOptions
775
): Promise<XLMRobertaTokenizer>;
776
}
777
778
/**
779
* mBART tokenizer for multilingual translation
780
*/
781
class MBartTokenizer extends PreTrainedTokenizer {
782
static async from_pretrained(
783
pretrained_model_name_or_path: string,
784
options?: TokenizerOptions
785
): Promise<MBartTokenizer>;
786
787
/** Supported language codes */
788
readonly language_codes: string[];
789
790
/** Language code to token mapping */
791
readonly lang_to_token: Record<string, string>;
792
793
/**
794
* Build translation inputs with language tokens
795
* @param raw_inputs - Input text(s)
796
* @param tokenizer_options - Tokenization options
797
* @param generate_kwargs - Generation parameters including src_lang/tgt_lang
798
*/
799
_build_translation_inputs(
800
raw_inputs: string | string[],
801
tokenizer_options: any,
802
generate_kwargs: { src_lang?: string; tgt_lang?: string }
803
): any;
804
}
805
806
/**
807
* NLLB tokenizer for No Language Left Behind translation
808
*/
809
class NllbTokenizer extends PreTrainedTokenizer {
810
static async from_pretrained(
811
pretrained_model_name_or_path: string,
812
options?: TokenizerOptions
813
): Promise<NllbTokenizer>;
814
815
/**
816
* Build translation inputs with NLLB language codes
817
*/
818
_build_translation_inputs(
819
raw_inputs: string | string[],
820
tokenizer_options: any,
821
generate_kwargs: { src_lang?: string; tgt_lang?: string }
822
): any;
823
}
824
```
825
826
**Usage Examples:**
827
828
```javascript
829
import {
830
WhisperTokenizer,
831
LlamaTokenizer,
832
BertTokenizer
833
} from "@xenova/transformers";
834
835
// Direct tokenizer instantiation
836
const whisperTokenizer = await WhisperTokenizer.from_pretrained("openai/whisper-base");
837
const llamaTokenizer = await LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf");
838
839
// Access specialized methods
840
const promptIds = whisperTokenizer.get_decoder_prompt_ids({
841
language: "english",
842
task: "transcribe"
843
});
844
845
// Use chat templates
846
const chatTemplate = llamaTokenizer.default_chat_template;
847
const conversation = [
848
{ role: "user", content: "Hello!" },
849
{ role: "assistant", content: "Hi there!" }
850
];
851
const formatted = await llamaTokenizer.apply_chat_template(conversation);
852
```
853
854
## Types
855
856
```javascript { .api }
857
interface ModelOutput {
858
last_hidden_state?: Tensor;
859
logits?: Tensor;
860
hidden_states?: Tensor[];
861
attentions?: Tensor[];
862
[key: string]: any;
863
}
864
865
interface Tensor {
866
data: TypedArray;
867
dims: number[];
868
type: string;
869
size: number;
870
}
871
```