# Tokenizers

Comprehensive tokenization utilities for all supported transformer models, handling text preprocessing, encoding, decoding, and vocabulary management with model-specific tokenization strategies including WordPiece, BPE, and adaptive tokenization.

## Capabilities

### BERT Tokenizer

End-to-end BERT tokenizer combining punctuation splitting, lowercasing, and WordPiece tokenization for bidirectional language models.

```python { .api }
class BertTokenizer:
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        max_len=None,
        do_basic_tokenize=True,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
    ):
        """
        Initialize BERT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary file
            do_lower_case (bool): Whether to lowercase input text
            max_len (int, optional): Maximum sequence length
            do_basic_tokenize (bool): Whether to do basic tokenization before WordPiece
            never_split (tuple, optional): Tokens that should never be split
        """

    def tokenize(self, text):
        """
        Tokenize text into subword tokens.

        Args:
            text (str): Input text to tokenize

        Returns:
            list: List of subword tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """
        Convert tokens to vocabulary IDs.

        Args:
            tokens (list): List of tokens

        Returns:
            list: List of token IDs
        """

    def convert_ids_to_tokens(self, ids):
        """
        Convert vocabulary IDs back to tokens.

        Args:
            ids (list): List of token IDs

        Returns:
            list: List of tokens
        """

    def save_vocabulary(self, vocab_path):
        """
        Save vocabulary to file.

        Args:
            vocab_path (str): Directory path to save vocabulary

        Returns:
            str: Path to saved vocabulary file
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        do_lower_case=True,
        **kwargs
    ):
        """
        Load pre-trained BERT tokenizer.

        Args:
            pretrained_model_name_or_path (str): Model name or path
            cache_dir (str, optional): Cache directory
            do_lower_case (bool): Whether to lowercase

        Returns:
            BertTokenizer: Initialized tokenizer
        """
```

### Basic Tokenizer

Basic text tokenization handling punctuation splitting, accent stripping, and lowercasing.

```python { .api }
class BasicTokenizer:
    def __init__(self, do_lower_case=True, never_split=None):
        """
        Initialize basic tokenizer.

        Args:
            do_lower_case (bool): Whether to lowercase text
            never_split (list, optional): Tokens never to split
        """

    def tokenize(self, text):
        """
        Perform basic tokenization on text.

        Args:
            text (str): Input text

        Returns:
            list: List of basic tokens
        """
```

### WordPiece Tokenizer

WordPiece subword tokenization using greedy longest-match-first algorithm for handling out-of-vocabulary tokens.

```python { .api }
class WordpieceTokenizer:
    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """
        Initialize WordPiece tokenizer.

        Args:
            vocab (dict): Vocabulary mapping tokens to IDs
            unk_token (str): Unknown token symbol
            max_input_chars_per_word (int): Maximum characters per word
        """

    def tokenize(self, text):
        """
        Perform WordPiece tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of WordPiece tokens
        """
```

### OpenAI GPT Tokenizer

Byte-pair encoding (BPE) tokenizer for OpenAI GPT models with special token support and text standardization.

```python { .api }
class OpenAIGPTTokenizer:
    def __init__(
        self,
        vocab_file,
        merges_file,
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize OpenAI GPT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            special_tokens (list, optional): List of special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """
        Perform BPE tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of BPE tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """
        Convert IDs to tokens.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens

        Returns:
            list: List of tokens
        """

    def encode(self, text):
        """
        Tokenize and convert to IDs in one step.

        Args:
            text (str): Input text

        Returns:
            list: List of token IDs
        """

    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
        """
        Decode token IDs back to text.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens
            clean_up_tokenization_spaces (bool): Whether to clean up spaces

        Returns:
            str: Decoded text
        """

    def set_special_tokens(self, special_tokens):
        """
        Add special tokens to vocabulary.

        Args:
            special_tokens (list): List of special tokens to add
        """

    def save_vocabulary(self, vocab_path):
        """Save tokenizer vocabulary and merges files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained OpenAI GPT tokenizer."""
```

### GPT-2 Tokenizer

Byte-level BPE tokenizer for GPT-2 models with improved Unicode handling and robustness.

```python { .api }
class GPT2Tokenizer:
    def __init__(
        self,
        vocab_file,
        merges_file,
        errors='replace',
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize GPT-2 tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            errors (str): Error handling for byte decoding
            special_tokens (list, optional): Special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """Perform byte-level BPE tokenization."""

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Convert IDs to tokens."""

    def encode(self, text):
        """Encode text to token IDs."""

    def decode(self, tokens):
        """
        Decode token IDs using byte-level encoding.

        Args:
            tokens (list): Token IDs or tokens

        Returns:
            str: Decoded text
        """

    def save_vocabulary(self, vocab_path):
        """Save vocabulary files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained GPT-2 tokenizer."""
```

### Transformer-XL Tokenizer

Adaptive tokenizer for Transformer-XL with vocabulary building, corpus management, and flexible tokenization options.

```python { .api }
class TransfoXLTokenizer:
    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        never_split=None
    ):
        """
        Initialize Transformer-XL tokenizer.

        Args:
            special (list, optional): Special tokens
            min_freq (int): Minimum frequency for vocabulary inclusion
            max_size (int, optional): Maximum vocabulary size
            lower_case (bool): Whether to lowercase text
            delimiter (str, optional): Token delimiter
            vocab_file (str, optional): Pre-built vocabulary file
            never_split (list, optional): Tokens never to split
        """

    def build_vocab(self):
        """Build vocabulary from counted tokens."""

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        """
        Tokenize text line.

        Args:
            line (str): Input text line
            add_eos (bool): Whether to add end-of-sequence token
            add_double_eos (bool): Whether to add double EOS tokens

        Returns:
            list: List of tokens
        """

    def encode_file(self, path, ordered=False, verbose=False):
        """
        Encode entire file to token IDs.

        Args:
            path (str): File path
            ordered (bool): Whether to maintain order
            verbose (bool): Whether to show progress

        Returns:
            torch.Tensor: Encoded token IDs
        """

    def convert_tokens_to_ids(self, symbols):
        """Convert tokens to vocabulary IDs."""

    def convert_ids_to_tokens(self, indices):
        """Convert IDs to tokens."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained Transformer-XL tokenizer."""
```

### Transformer-XL Corpus

Corpus management class for Transformer-XL providing dataset loading, vocabulary building, and data iteration.

```python { .api }
class TransfoXLCorpus:
    def __init__(self, path, dataset, *args, **kwargs):
        """
        Initialize corpus manager.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def build_corpus(self, path, dataset):
        """
        Build corpus from dataset.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def get_iterator(self, split, *args, **kwargs):
        """
        Get data iterator for specified split.

        Args:
            split (str): Dataset split ('train', 'valid', 'test')

        Returns:
            Iterator: Data iterator
        """
```

## Utility Functions

```python { .api }
def load_vocab(vocab_file):
    """
    Load vocabulary file into ordered dictionary.

    Args:
        vocab_file (str): Path to vocabulary file

    Returns:
        collections.OrderedDict: Token to ID mapping
    """

def whitespace_tokenize(text):
    """
    Basic whitespace tokenization.

    Args:
        text (str): Input text

    Returns:
        list: Whitespace-separated tokens
    """

def get_pairs(word):
    """
    Get symbol pairs in word for BPE processing.

    Args:
        word (tuple): Word as tuple of symbols

    Returns:
        set: Set of symbol pairs
    """

def text_standardize(text):
    """
    Standardize text by fixing punctuation and spacing.

    Args:
        text (str): Input text

    Returns:
        str: Standardized text
    """

def bytes_to_unicode():
    """
    Create mapping from UTF-8 bytes to unicode strings for GPT-2.

    Returns:
        dict: Byte to unicode mapping
    """

def get_lm_corpus(datadir, dataset):
    """
    Get language model corpus for Transformer-XL.

    Args:
        datadir (str): Data directory
        dataset (str): Dataset name

    Returns:
        TransfoXLCorpus: Corpus instance
    """
```

## Usage Examples

### BERT Tokenization

```python
from pytorch_pretrained_bert import BertTokenizer

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text
text = "Hello world! This is BERT tokenization."
tokens = tokenizer.tokenize(text)
print(tokens)  # ['hello', 'world', '!', 'this', 'is', 'bert', 'token', '##ization', '.']

# Convert to IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)  # [7592, 2088, 999, 2023, 2003, 14324, 19204, 6851, 1012]

# Convert back to tokens
recovered_tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(recovered_tokens)
```

### GPT-2 Tokenization and Encoding

```python
from pytorch_pretrained_bert import GPT2Tokenizer

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Direct encoding and decoding
text = "The future of AI is bright."
encoded = tokenizer.encode(text)
print(encoded)  # [464, 2003, 286, 9552, 318, 6016, 13]

decoded = tokenizer.decode(encoded)
print(decoded)  # "The future of AI is bright."
```

### Transformer-XL with Custom Vocabulary

```python
from pytorch_pretrained_bert import TransfoXLTokenizer

# Initialize tokenizer with custom settings
tokenizer = TransfoXLTokenizer(
    special=['<eos>', '<unk>'],
    min_freq=3,
    lower_case=True
)

# Tokenize with special tokens
text = "This is a sample sentence."
tokens = tokenizer.tokenize(text, add_eos=True)
print(tokens)  # ['this', 'is', 'a', 'sample', 'sentence', '.', '<eos>']
```

### OpenAI GPT with Special Tokens

```python
from pytorch_pretrained_bert import OpenAIGPTTokenizer

# Load tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# Add special tokens
special_tokens = ['<start>', '<end>']
tokenizer.set_special_tokens(special_tokens)

# Use special tokens
text = "<start> Generate some text <end>"
tokens = tokenizer.tokenize(text)
print(tokens)
```