0
# Configuration and Constants
1
2
Configuration constants for optimizing trie performance, memory usage, and behavior. These constants control cache sizes, node ordering, tail storage methods, and trie count limits to fine-tune performance for specific use cases.
3
4
## Capabilities
5
6
### Cache Size Configuration
7
8
Controls the cache size used during trie construction and queries, affecting both performance and memory usage during operations.
9
10
```python { .api }
11
# Cache size constants (DEFAULT_CACHE first, then from largest to smallest)
12
DEFAULT_CACHE: int # Default cache size for balanced performance (same value as NORMAL_CACHE)
13
HUGE_CACHE: int # Largest cache for maximum performance
14
LARGE_CACHE: int # Large cache for high-performance scenarios
15
NORMAL_CACHE: int # Normal cache size
16
SMALL_CACHE: int # Small cache for memory-constrained environments
17
TINY_CACHE: int # Smallest cache for minimal memory usage
18
```
19
20
### Node Ordering Configuration
21
22
Determines how nodes are arranged within the trie structure, affecting both lookup performance and the order of iteration results.
23
24
```python { .api }
25
# Node ordering constants
26
LABEL_ORDER: int # Arrange nodes in ascending label order (predictable iteration)
27
WEIGHT_ORDER: int # Arrange nodes in descending weight order (faster matching)
28
DEFAULT_ORDER: int # Default node ordering strategy (same value as WEIGHT_ORDER)
29
```
30
31
### Tail Storage Configuration
32
33
Controls how the trie stores the tail portions of keys, affecting memory usage and compatibility with different data types.
34
35
```python { .api }
36
# Tail storage method constants
37
TEXT_TAIL: int # Store tails as null-terminated strings (text data)
38
BINARY_TAIL: int # Store tails as byte sequences with bit vectors (binary data)
39
DEFAULT_TAIL: int # Default tail storage method (same value as TEXT_TAIL)
40
```
41
42
### Trie Count Limits
43
44
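Defines the valid range for the number of tries used in the underlying MARISA-trie structure. Using more tries usually makes the dictionary smaller but slows down lookups; using fewer tries makes lookups faster at the cost of a larger dictionary.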
45
46
```python { .api }
47
# Trie count constants
48
MIN_NUM_TRIES: int # Minimum number of tries allowed
49
MAX_NUM_TRIES: int # Maximum number of tries allowed
50
DEFAULT_NUM_TRIES: int # Default number of tries
51
```
52
53
## Usage Examples
54
55
### Performance Optimization
56
57
```python
58
import marisa_trie

# Speed-oriented configuration for frequent lookups.
# libmarisa trades space for time: a larger cache and FEWER tries make
# searches faster, at the cost of a bigger dictionary.
fast_trie = marisa_trie.Trie(
    ['frequent', 'lookups', 'data'],
    cache_size=marisa_trie.HUGE_CACHE,    # Maximum cache for speed
    order=marisa_trie.WEIGHT_ORDER,       # Optimize for lookup performance
    num_tries=marisa_trie.MIN_NUM_TRIES,  # Fewer tries -> faster lookups
)

# Memory-efficient configuration.
# A smaller cache and MORE tries shrink the dictionary but slow searches.
compact_trie = marisa_trie.Trie(
    ['memory', 'efficient', 'storage'],
    cache_size=marisa_trie.TINY_CACHE,  # Minimal cache for memory savings
    binary=True,                        # Use binary tail storage
    num_tries=7,                        # More tries -> smaller dictionary
)
75
```
76
77
### Data Type Optimization
78
79
```python
80
# Text keys whose iteration order should be predictable
text_trie = marisa_trie.Trie(
    sorted_word_list,
    binary=False,                   # Text tail storage (the default)
    order=marisa_trie.LABEL_ORDER,  # Nodes kept in ascending label order
)

# Binary keys, or keys that may contain null bytes
binary_trie = marisa_trie.BinaryTrie(
    binary_keys,
    cache_size=marisa_trie.LARGE_CACHE,
    binary=True,                    # Explicitly request binary tail storage
)
93
```
94
95
### Weighted Optimization
96
97
```python
98
# Use weights with appropriate ordering for best performance
high_freq_words = ['the', 'and', 'or', 'but']
low_freq_words = ['sesquipedalian', 'antidisestablishmentarianism']
all_words = high_freq_words + low_freq_words

# Assign higher weights to frequently accessed words
weights = [100] * len(high_freq_words) + [1] * len(low_freq_words)

optimized_trie = marisa_trie.Trie(
    all_words,
    weights=weights,
    order=marisa_trie.WEIGHT_ORDER,  # Essential for weight optimization
    cache_size=marisa_trie.LARGE_CACHE,
)

# Weights only influence how nodes are arranged inside the trie; lookups are
# written exactly as before. The value printed here is the key's ID, not a
# timing measurement.
print(f"'the' is stored with key ID {optimized_trie.key_id('the')}")
115
```
116
117
### Custom Configuration Validation
118
119
```python
120
def create_optimized_trie(keys, performance_level='balanced'):
    """Create a trie using a named performance preset.

    Args:
        keys: Keys to store in the trie.
        performance_level: 'minimal' (smallest memory footprint),
            'balanced' (library defaults) or 'maximum' (fastest lookups).
            Any other value silently falls back to 'balanced'.

    Returns:
        A ``marisa_trie.Trie`` built from ``keys`` with the preset's options.
    """
    # libmarisa trades space for time: more tries and a smaller cache shrink
    # the dictionary but slow searches; fewer tries and a larger cache do
    # the opposite.
    configs = {
        'minimal': {
            'cache_size': marisa_trie.TINY_CACHE,
            'num_tries': marisa_trie.MAX_NUM_TRIES,  # most compact
            'binary': True,
        },
        'balanced': {
            'cache_size': marisa_trie.DEFAULT_CACHE,
            'num_tries': marisa_trie.DEFAULT_NUM_TRIES,
            'order': marisa_trie.DEFAULT_ORDER,
        },
        'maximum': {
            'cache_size': marisa_trie.HUGE_CACHE,
            'num_tries': marisa_trie.MIN_NUM_TRIES,  # fastest lookups
            'order': marisa_trie.WEIGHT_ORDER,
        },
    }

    config = configs.get(performance_level, configs['balanced'])
    return marisa_trie.Trie(keys, **config)


# Usage
words = ['apple', 'banana', 'cherry']
fast_trie = create_optimized_trie(words, 'maximum')
compact_trie = create_optimized_trie(words, 'minimal')
148
```
149
150
### Configuration Impact Examples
151
152
```python
153
import time

import marisa_trie

large_keys = [f"key_{i:06d}" for i in range(10000)]


def benchmark_cache_size(keys, cache_size, name, num_lookups=1000):
    """Time trie construction and repeated lookups for one cache size.

    Args:
        keys: Non-empty list of keys to build the trie from.
        cache_size: One of the ``marisa_trie.*_CACHE`` constants.
        name: Label printed in front of the timings.
        num_lookups: Number of ``key_id`` calls to time; keys are reused
            cyclically when this exceeds ``len(keys)``.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with system clock changes.
    start = time.perf_counter()
    trie = marisa_trie.Trie(keys, cache_size=cache_size)
    build_time = time.perf_counter() - start

    start = time.perf_counter()
    for i in range(num_lookups):
        trie.key_id(keys[i % len(keys)])
    lookup_time = time.perf_counter() - start

    print(f"{name}: Build={build_time:.3f}s, Lookup={lookup_time:.3f}s")


# Compare cache configurations
benchmark_cache_size(large_keys, marisa_trie.TINY_CACHE, "Tiny Cache")
benchmark_cache_size(large_keys, marisa_trie.DEFAULT_CACHE, "Default Cache")
benchmark_cache_size(large_keys, marisa_trie.HUGE_CACHE, "Huge Cache")
175
```
176
177
### Error Handling for Configuration
178
179
```python
180
try:
    # 999 is above MAX_NUM_TRIES, so this construction is expected to fail
    invalid_trie = marisa_trie.Trie(['test'], num_tries=999)
except ValueError as err:
    print(f"Configuration error: {err}")

# Show the accepted range and the default
print(f"Valid trie count range: {marisa_trie.MIN_NUM_TRIES} to {marisa_trie.MAX_NUM_TRIES}")
print(f"Default configuration: {marisa_trie.DEFAULT_NUM_TRIES} tries")


def validate_trie_config(num_tries):
    """Check that ``num_tries`` lies within the supported range.

    Returns:
        True when ``MIN_NUM_TRIES <= num_tries <= MAX_NUM_TRIES``.

    Raises:
        ValueError: If ``num_tries`` is outside that range.
    """
    lowest = marisa_trie.MIN_NUM_TRIES
    highest = marisa_trie.MAX_NUM_TRIES
    if lowest <= num_tries <= highest:
        return True
    raise ValueError(
        f"num_tries must be between {marisa_trie.MIN_NUM_TRIES} "
        f"and {marisa_trie.MAX_NUM_TRIES}, got {num_tries}"
    )


# Validate first, then build with the checked value
try:
    validate_trie_config(5)
    safe_trie = marisa_trie.Trie(['safe', 'config'], num_tries=5)
    print("Configuration validated and trie created successfully")
except ValueError as err:
    print(f"Invalid configuration: {err}")
209
```
210
211
### Advanced Configuration Patterns
212
213
```python
214
# Configuration for different use cases
def get_trie_config(use_case):
    """Return keyword arguments for a trie constructor for a named use case.

    Args:
        use_case: 'autocomplete', 'dictionary_lookup', 'binary_data' or
            'memory_constrained'. Any other value falls back to the
            'dictionary_lookup' settings.

    Returns:
        A dict suitable for unpacking into ``marisa_trie.Trie(...)`` or
        ``marisa_trie.BinaryTrie(...)``.
    """
    configs = {
        'autocomplete': {
            'cache_size': marisa_trie.LARGE_CACHE,
            'order': marisa_trie.LABEL_ORDER,  # Predictable prefix order
            'binary': False,
        },
        'dictionary_lookup': {
            'cache_size': marisa_trie.DEFAULT_CACHE,
            'order': marisa_trie.WEIGHT_ORDER,  # Optimize for frequent words
            'binary': False,
        },
        'binary_data': {
            'cache_size': marisa_trie.NORMAL_CACHE,
            'binary': True,  # Keys may contain null bytes
            'order': marisa_trie.DEFAULT_ORDER,
        },
        'memory_constrained': {
            'cache_size': marisa_trie.TINY_CACHE,
            # More tries make the dictionary smaller (and lookups slower).
            'num_tries': marisa_trie.MAX_NUM_TRIES,
            # Binary tails save space only when tails are short; measure on
            # real data before relying on this.
            'binary': True,
        },
    }

    return configs.get(use_case, configs['dictionary_lookup'])


# Usage examples
autocomplete_trie = marisa_trie.Trie(
    search_suggestions,
    **get_trie_config('autocomplete')
)

binary_lookup = marisa_trie.BinaryTrie(
    binary_patterns,
    **get_trie_config('binary_data')
)
253
```