# Tokenizers

Comprehensive tokenization utilities for all supported transformer models, handling text preprocessing, encoding, decoding, and vocabulary management with model-specific tokenization strategies including WordPiece, BPE, and adaptive tokenization.

## Capabilities

### BERT Tokenizer

End-to-end BERT tokenizer combining punctuation splitting, lowercasing, and WordPiece tokenization for bidirectional language models.

```python { .api }
class BertTokenizer:
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        max_len=None,
        do_basic_tokenize=True,
        never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
    ):
        """
        Initialize BERT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary file
            do_lower_case (bool): Whether to lowercase input text
            max_len (int, optional): Maximum sequence length
            do_basic_tokenize (bool): Whether to do basic tokenization before WordPiece
            never_split (tuple, optional): Tokens that should never be split
        """

    def tokenize(self, text):
        """
        Tokenize text into subword tokens.

        Args:
            text (str): Input text to tokenize

        Returns:
            list: List of subword tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """
        Convert tokens to vocabulary IDs.

        Args:
            tokens (list): List of tokens

        Returns:
            list: List of token IDs
        """

    def convert_ids_to_tokens(self, ids):
        """
        Convert vocabulary IDs back to tokens.

        Args:
            ids (list): List of token IDs

        Returns:
            list: List of tokens
        """

    def save_vocabulary(self, vocab_path):
        """
        Save vocabulary to file.

        Args:
            vocab_path (str): Directory path to save vocabulary

        Returns:
            str: Path to saved vocabulary file
        """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        do_lower_case=True,
        **kwargs
    ):
        """
        Load pre-trained BERT tokenizer.

        Args:
            pretrained_model_name_or_path (str): Model name or path
            cache_dir (str, optional): Cache directory
            do_lower_case (bool): Whether to lowercase

        Returns:
            BertTokenizer: Initialized tokenizer
        """
```

### Basic Tokenizer

Basic text tokenization handling punctuation splitting, accent stripping, and lowercasing.

```python { .api }
class BasicTokenizer:
    def __init__(self, do_lower_case=True, never_split=None):
        """
        Initialize basic tokenizer.

        Args:
            do_lower_case (bool): Whether to lowercase text
            never_split (list, optional): Tokens never to split
        """

    def tokenize(self, text):
        """
        Perform basic tokenization on text.

        Args:
            text (str): Input text

        Returns:
            list: List of basic tokens
        """
```

### WordPiece Tokenizer

WordPiece subword tokenization using greedy longest-match-first algorithm for handling out-of-vocabulary tokens.

```python { .api }
class WordpieceTokenizer:
    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """
        Initialize WordPiece tokenizer.

        Args:
            vocab (dict): Vocabulary mapping tokens to IDs
            unk_token (str): Unknown token symbol
            max_input_chars_per_word (int): Maximum characters per word
        """

    def tokenize(self, text):
        """
        Perform WordPiece tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of WordPiece tokens
        """
```

### OpenAI GPT Tokenizer

Byte-pair encoding (BPE) tokenizer for OpenAI GPT models with special token support and text standardization.

```python { .api }
class OpenAIGPTTokenizer:
    def __init__(
        self,
        vocab_file,
        merges_file,
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize OpenAI GPT tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            special_tokens (list, optional): List of special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """
        Perform BPE tokenization.

        Args:
            text (str): Input text

        Returns:
            list: List of BPE tokens
        """

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """
        Convert IDs to tokens.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens

        Returns:
            list: List of tokens
        """

    def encode(self, text):
        """
        Tokenize and convert to IDs in one step.

        Args:
            text (str): Input text

        Returns:
            list: List of token IDs
        """

    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
        """
        Decode token IDs back to text.

        Args:
            ids (list): Token IDs
            skip_special_tokens (bool): Whether to skip special tokens
            clean_up_tokenization_spaces (bool): Whether to clean up spaces

        Returns:
            str: Decoded text
        """

    def set_special_tokens(self, special_tokens):
        """
        Add special tokens to vocabulary.

        Args:
            special_tokens (list): List of special tokens to add
        """

    def save_vocabulary(self, vocab_path):
        """Save tokenizer vocabulary and merges files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained OpenAI GPT tokenizer."""
```

### GPT-2 Tokenizer

Byte-level BPE tokenizer for GPT-2 models with improved Unicode handling and robustness.

```python { .api }
class GPT2Tokenizer:
    def __init__(
        self,
        vocab_file,
        merges_file,
        errors='replace',
        special_tokens=None,
        max_len=None
    ):
        """
        Initialize GPT-2 tokenizer.

        Args:
            vocab_file (str): Path to vocabulary JSON file
            merges_file (str): Path to BPE merges file
            errors (str): Error handling for byte decoding
            special_tokens (list, optional): Special tokens
            max_len (int, optional): Maximum sequence length
        """

    def tokenize(self, text):
        """Perform byte-level BPE tokenization."""

    def convert_tokens_to_ids(self, tokens):
        """Convert tokens to IDs."""

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Convert IDs to tokens."""

    def encode(self, text):
        """Encode text to token IDs."""

    def decode(self, tokens):
        """
        Decode token IDs using byte-level encoding.

        Args:
            tokens (list): Token IDs or tokens

        Returns:
            str: Decoded text
        """

    def save_vocabulary(self, vocab_path):
        """Save vocabulary files."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained GPT-2 tokenizer."""
```

### Transformer-XL Tokenizer

Adaptive tokenizer for Transformer-XL with vocabulary building, corpus management, and flexible tokenization options.

```python { .api }
class TransfoXLTokenizer:
    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        never_split=None
    ):
        """
        Initialize Transformer-XL tokenizer.

        Args:
            special (list, optional): Special tokens
            min_freq (int): Minimum frequency for vocabulary inclusion
            max_size (int, optional): Maximum vocabulary size
            lower_case (bool): Whether to lowercase text
            delimiter (str, optional): Token delimiter
            vocab_file (str, optional): Pre-built vocabulary file
            never_split (list, optional): Tokens never to split
        """

    def build_vocab(self):
        """Build vocabulary from counted tokens."""

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        """
        Tokenize text line.

        Args:
            line (str): Input text line
            add_eos (bool): Whether to add end-of-sequence token
            add_double_eos (bool): Whether to add double EOS tokens

        Returns:
            list: List of tokens
        """

    def encode_file(self, path, ordered=False, verbose=False):
        """
        Encode entire file to token IDs.

        Args:
            path (str): File path
            ordered (bool): Whether to maintain order
            verbose (bool): Whether to show progress

        Returns:
            torch.Tensor: Encoded token IDs
        """

    def convert_tokens_to_ids(self, symbols):
        """Convert tokens to vocabulary IDs."""

    def convert_ids_to_tokens(self, indices):
        """Convert IDs to tokens."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        cache_dir=None,
        **kwargs
    ):
        """Load pre-trained Transformer-XL tokenizer."""
```

### Transformer-XL Corpus

Corpus management class for Transformer-XL providing dataset loading, vocabulary building, and data iteration.

```python { .api }
class TransfoXLCorpus:
    def __init__(self, path, dataset, *args, **kwargs):
        """
        Initialize corpus manager.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def build_corpus(self, path, dataset):
        """
        Build corpus from dataset.

        Args:
            path (str): Dataset path
            dataset (str): Dataset name
        """

    def get_iterator(self, split, *args, **kwargs):
        """
        Get data iterator for specified split.

        Args:
            split (str): Dataset split ('train', 'valid', 'test')

        Returns:
            Iterator: Data iterator
        """
```

## Utility Functions

```python { .api }
def load_vocab(vocab_file):
    """
    Load vocabulary file into ordered dictionary.

    Args:
        vocab_file (str): Path to vocabulary file

    Returns:
        collections.OrderedDict: Token to ID mapping
    """

def whitespace_tokenize(text):
    """
    Basic whitespace tokenization.

    Args:
        text (str): Input text

    Returns:
        list: Whitespace-separated tokens
    """

def get_pairs(word):
    """
    Get symbol pairs in word for BPE processing.

    Args:
        word (tuple): Word as tuple of symbols

    Returns:
        set: Set of symbol pairs
    """

def text_standardize(text):
    """
    Standardize text by fixing punctuation and spacing.

    Args:
        text (str): Input text

    Returns:
        str: Standardized text
    """

def bytes_to_unicode():
    """
    Create mapping from UTF-8 bytes to unicode strings for GPT-2.

    Returns:
        dict: Byte to unicode mapping
    """

def get_lm_corpus(datadir, dataset):
    """
    Get language model corpus for Transformer-XL.

    Args:
        datadir (str): Data directory
        dataset (str): Dataset name

    Returns:
        TransfoXLCorpus: Corpus instance
    """
```

## Usage Examples

### BERT Tokenization

```python
from pytorch_pretrained_bert import BertTokenizer

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text
text = "Hello world! This is BERT tokenization."
tokens = tokenizer.tokenize(text)
print(tokens)  # ['hello', 'world', '!', 'this', 'is', 'bert', 'token', '##ization', '.']

# Convert to IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)  # [7592, 2088, 999, 2023, 2003, 14324, 19204, 6851, 1012]

# Convert back to tokens
recovered_tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(recovered_tokens)
```

### GPT-2 Tokenization and Encoding

```python
from pytorch_pretrained_bert import GPT2Tokenizer

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Direct encoding and decoding
text = "The future of AI is bright."
encoded = tokenizer.encode(text)
print(encoded)  # [464, 2003, 286, 9552, 318, 6016, 13]

decoded = tokenizer.decode(encoded)
print(decoded)  # "The future of AI is bright."
```

### Transformer-XL with Custom Vocabulary

```python
from pytorch_pretrained_bert import TransfoXLTokenizer

# Initialize tokenizer with custom settings
tokenizer = TransfoXLTokenizer(
    special=['<eos>', '<unk>'],
    min_freq=3,
    lower_case=True
)

# Tokenize with special tokens
text = "This is a sample sentence."
tokens = tokenizer.tokenize(text, add_eos=True)
print(tokens)  # ['this', 'is', 'a', 'sample', 'sentence', '.', '<eos>']
```

### OpenAI GPT with Special Tokens

```python
from pytorch_pretrained_bert import OpenAIGPTTokenizer

# Load tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# Add special tokens
special_tokens = ['<start>', '<end>']
tokenizer.set_special_tokens(special_tokens)

# Use special tokens
text = "<start> Generate some text <end>"
tokens = tokenizer.tokenize(text)
print(tokens)
```