docs/dictionary-compression.md

# Dictionary Compression

Training and using custom dictionaries to achieve better compression ratios on similar data sets by learning common patterns and structures.

## Capabilities

### Dictionary Training

Create custom compression dictionaries from sample data to improve compression ratios for similar data.

```python { .api }
def train_dictionary(
    dict_size: int,
    samples: list[bytes],
    k: int = 0,
    d: int = 8,
    f: int = 20,
    split_point: float = 1.0,
    accel: int = 1,
    notifications: int = 0,
    dict_id: int = 0,
    level: int = 3,
    steps: int = 4,
    threads: int = 0
) -> ZstdCompressionDict:
    """
    Train a compression dictionary from sample data.

    Parameters:
    - dict_size: int, target dictionary size in bytes
    - samples: list[bytes], sample data for training
    - k: int, segment size parameter (0 = auto)
    - d: int, dmer size parameter (6-16, default 8)
    - f: int, log frequency parameter (default 20)
    - split_point: float, split point for training (0.0-1.0)
    - accel: int, acceleration parameter (1-10)
    - notifications: int, notification level (0=none, 1=basic, 2=verbose)
    - dict_id: int, dictionary ID (0 = auto-generate)
    - level: int, compression level for dictionary optimization
    - steps: int, training steps (1-4)
    - threads: int, number of threads (0 = auto)

    Returns:
    ZstdCompressionDict: Trained compression dictionary
    """
```

**Usage Example:**

```python
import zstandard as zstd

# Prepare sample data for training
samples = [
    b'{"name": "John", "age": 30, "city": "New York"}',
    b'{"name": "Jane", "age": 25, "city": "San Francisco"}',
    b'{"name": "Bob", "age": 35, "city": "Chicago"}',
    b'{"name": "Alice", "age": 28, "city": "Boston"}',
    # ... more similar JSON documents
]

# Train dictionary
dictionary = zstd.train_dictionary(
    dict_size=8192,  # 8KB dictionary
    samples=samples,
    level=5,
    threads=4
)

print(f"Dictionary size: {len(dictionary)} bytes")
print(f"Dictionary ID: {dictionary.dict_id()}")
```

### Dictionary Object

Container for compression dictionaries with metadata and optimization capabilities.

```python { .api }
class ZstdCompressionDict:
    def __init__(
        self,
        data: bytes,
        dict_type: int = DICT_TYPE_AUTO,
        k: int = 0,
        d: int = 0
    ):
        """
        Create a compression dictionary from raw dictionary data.

        Parameters:
        - data: bytes, raw dictionary data
        - dict_type: int, dictionary type (DICT_TYPE_AUTO, DICT_TYPE_RAWCONTENT, DICT_TYPE_FULLDICT)
        - k: int, segment size parameter
        - d: int, dmer size parameter
        """

    def __len__(self) -> int:
        """Get dictionary size in bytes."""

    def dict_id(self) -> int:
        """
        Get dictionary ID.

        Returns:
        int: Dictionary identifier
        """

    def as_bytes(self) -> bytes:
        """
        Get dictionary data as bytes.

        Returns:
        bytes: Raw dictionary data
        """

    def precompute_compress(
        self,
        level: int = 3,
        compression_params: ZstdCompressionParameters = None
    ):
        """
        Precompute compression tables for better performance.

        Parameters:
        - level: int, compression level to optimize for
        - compression_params: ZstdCompressionParameters, detailed parameters
        """

    # Properties
    k: int  # Segment size parameter
    d: int  # Dmer size parameter
```

**Usage Example:**

```python
import zstandard as zstd

# Load dictionary from file
with open('dictionary.zdict', 'rb') as f:
    dict_data = f.read()

# Create dictionary object
dictionary = zstd.ZstdCompressionDict(dict_data)

# Optimize for specific compression level
dictionary.precompute_compress(level=9)

# Get dictionary information
print(f"Dictionary size: {len(dictionary)} bytes")
print(f"Dictionary ID: {dictionary.dict_id()}")
print(f"Parameters: k={dictionary.k}, d={dictionary.d}")

# Save optimized dictionary
optimized_data = dictionary.as_bytes()
with open('optimized_dictionary.zdict', 'wb') as f:
    f.write(optimized_data)
```

### Dictionary Compression

Using dictionaries with compressors to achieve better compression ratios.

```python { .api }
class ZstdCompressor:
    def __init__(
        self,
        level: int = 3,
        dict_data: ZstdCompressionDict = None,
        **kwargs
    ):
        """
        Create compressor with dictionary support.

        Parameters:
        - level: int, compression level
        - dict_data: ZstdCompressionDict, compression dictionary
        - **kwargs: other compressor parameters
        """
```

**Usage Example:**

```python
import zstandard as zstd

# Train dictionary from samples
samples = [b"sample data 1", b"sample data 2", b"sample data 3"]
dictionary = zstd.train_dictionary(4096, samples)

# Create compressor with dictionary
compressor = zstd.ZstdCompressor(level=5, dict_data=dictionary)

# Compress new data (similar to training samples)
new_data = b"new sample data with similar patterns"
compressed = compressor.compress(new_data)

# Compare compression ratios
compressor_no_dict = zstd.ZstdCompressor(level=5)
compressed_no_dict = compressor_no_dict.compress(new_data)

print(f"With dictionary: {len(compressed)} bytes")
print(f"Without dictionary: {len(compressed_no_dict)} bytes")
print(f"Improvement: {(len(compressed_no_dict) - len(compressed)) / len(compressed_no_dict) * 100:.1f}%")
```

### Dictionary Decompression

Using dictionaries with decompressors to decompress dictionary-compressed data.

```python { .api }
class ZstdDecompressor:
    def __init__(
        self,
        dict_data: ZstdCompressionDict = None,
        **kwargs
    ):
        """
        Create decompressor with dictionary support.

        Parameters:
        - dict_data: ZstdCompressionDict, decompression dictionary
        - **kwargs: other decompressor parameters
        """
```

**Usage Example:**

```python
import zstandard as zstd

# Load dictionary (same as used for compression)
dictionary = zstd.ZstdCompressionDict(dict_data)

# Create decompressor with dictionary
decompressor = zstd.ZstdDecompressor(dict_data=dictionary)

# Decompress dictionary-compressed data
decompressed = decompressor.decompress(compressed_data)
print(f"Decompressed: {decompressed}")
```

### Dictionary Types

Different dictionary types for various use cases and compatibility requirements.

```python { .api }
# Dictionary type constants
DICT_TYPE_AUTO: int        # Auto-detect dictionary type
DICT_TYPE_RAWCONTENT: int  # Raw content dictionary
DICT_TYPE_FULLDICT: int    # Full dictionary with headers
```

**Usage Example:**

```python
import zstandard as zstd

# Raw content dictionary (just the sample data)
raw_dict = zstd.ZstdCompressionDict(
    sample_data,
    dict_type=zstd.DICT_TYPE_RAWCONTENT
)

# Full dictionary (with zstd dictionary headers)
full_dict = zstd.ZstdCompressionDict(
    trained_dict_data,
    dict_type=zstd.DICT_TYPE_FULLDICT
)

# Auto-detect type
auto_dict = zstd.ZstdCompressionDict(
    dict_data,
    dict_type=zstd.DICT_TYPE_AUTO
)
```

### Dictionary Best Practices

Guidelines for effective dictionary usage:

**Training Data Selection:**

```python
import zstandard as zstd

# Use representative samples
samples = collect_representative_data()

# Aim for 100-1000 samples, each 1KB-64KB
filtered_samples = [s for s in samples if 1024 <= len(s) <= 65536]

# Dictionary size: typically 64KB-1MB
dictionary = zstd.train_dictionary(
    dict_size=min(65536, sum(len(s) for s in filtered_samples) // 100),
    samples=filtered_samples
)
```

**Performance Optimization:**

```python
import zstandard as zstd

# Precompute tables for target compression level
dictionary.precompute_compress(level=compression_level)

# Reuse compressor/decompressor objects
compressor = zstd.ZstdCompressor(level=5, dict_data=dictionary)
decompressor = zstd.ZstdDecompressor(dict_data=dictionary)

# Compress multiple items efficiently
for data in data_items:
    compressed = compressor.compress(data)
    process_compressed(compressed)
```

**Dictionary Persistence:**

```python
import zstandard as zstd

# Save dictionary for later use
dictionary_data = dictionary.as_bytes()
with open('model.zdict', 'wb') as f:
    f.write(dictionary_data)

# Load dictionary
with open('model.zdict', 'rb') as f:
    dict_data = f.read()
dictionary = zstd.ZstdCompressionDict(dict_data)
```

## Performance Considerations

- Dictionary training is CPU-intensive but only done once
- Dictionary size affects both compression ratio and memory usage
- Precomputed dictionaries improve compression performance
- Dictionary effectiveness depends on similarity between training and target data
- Larger dictionaries generally provide better compression but use more memory