0
# Dictionary Management
1
2
Configure MeCab dictionaries, inspect the dictionaries a tagger has loaded, and build custom user dictionaries for specialized vocabularies and research applications.
3
4
## Capabilities
5
6
### Dictionary Information Access
7
8
Inspect the MeCab dictionaries a tagger has loaded: each entry reports the dictionary's file path, character encoding, entry count, and version.
9
10
```python { .api }
11
class Tagger:
12
@property
13
def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
14
"""Get information about loaded dictionaries.
15
16
Returns:
17
List of dictionaries with keys:
18
- 'filename': Path to dictionary file
19
- 'charset': Character encoding used
20
- 'size': Dictionary size in entries
21
- 'version': Dictionary version (may not be reliable)
22
"""
23
...
24
25
class GenericTagger:
26
@property
27
def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
28
"""Get information about loaded dictionaries.
29
30
Returns:
31
List of dictionaries with keys:
32
- 'filename': Path to dictionary file
33
- 'charset': Character encoding used
34
- 'size': Dictionary size in entries
35
- 'version': Dictionary version (may not be reliable)
36
"""
37
...
38
```
39
40
### Feature Wrapper Creation
41
42
Create custom named tuple wrappers for dictionary features to enable structured access to morphological data from any MeCab dictionary format.
43
44
```python { .api }
45
def create_feature_wrapper(name: str, fields: List[str], default: Any = None) -> NamedTuple:
46
"""Create a namedtuple-based wrapper for dictionary features.
47
48
Sets default values to None for handling unknown words that may have
49
fewer fields than the full schema.
50
51
Args:
52
name: Name for the resulting namedtuple class
53
fields: List of field names for the features
54
default: Default value for missing fields (default: None)
55
56
Returns:
57
Named tuple class that can be passed as the `wrapper` argument of GenericTagger
58
"""
59
...
60
```
61
62
### Dictionary Discovery
63
64
Discover and import installed UniDic packages for automatic dictionary configuration.
65
66
```python { .api }
67
def try_import_unidic() -> Optional[str]:
68
"""Import unidic or unidic-lite packages if available.
69
70
Attempts to import unidic first, then unidic-lite as fallback.
71
Used internally by Tagger for automatic dictionary discovery.
72
73
Returns:
74
Dictionary directory path if found, None if no UniDic package available
75
"""
76
...
77
```
78
79
### Dictionary Building
80
81
Build custom user dictionaries from CSV input using MeCab's dictionary compilation functionality.
82
83
```python { .api }
84
def build_dictionary(args: str) -> None:
85
"""Build user dictionary using MeCab's dictionary building functionality.
86
87
Wraps MeCab's mecab-dict-index command for compiling custom dictionaries
88
from formatted CSV input files.
89
90
Args:
91
args: Command line arguments for dictionary building
92
(e.g., "-d /path/to/system/dict -u user.dic -f utf8 -t utf8 input.csv")
93
94
Raises:
95
RuntimeError: If dictionary building fails
96
"""
97
...
98
```
99
100
### Utility Functions
101
102
Helper for using plain tuples as the feature wrapper in `GenericTagger`.
103
104
```python { .api }
105
def make_tuple(*args) -> tuple:
106
"""Create tuple from variable arguments.
107
108
Wrapper function that provides the same interface as namedtuple
109
constructors for use as a feature wrapper in GenericTagger.
110
111
Args:
112
*args: Variable number of arguments
113
114
Returns:
115
Tuple containing all provided arguments
116
"""
117
...
118
```
119
120
## Usage Examples
121
122
### Dictionary Information Inspection
123
124
```python
125
from fugashi import Tagger
126
127
tagger = Tagger()
128
129
# Get information about loaded dictionaries
130
for i, dict_info in enumerate(tagger.dictionary_info):
131
print(f"Dictionary {i+1}:")
132
print(f" Filename: {dict_info['filename']}")
133
print(f" Charset: {dict_info['charset']}")
134
print(f" Size: {dict_info['size']:,} entries")
135
print(f" Version: {dict_info['version']}")
136
print()
137
```
138
139
### Custom Feature Wrapper
140
141
```python
142
from fugashi import GenericTagger, create_feature_wrapper
143
144
# Create custom feature wrapper for IPA dictionary
145
IpaFeatures = create_feature_wrapper('IpaFeatures', [
146
'pos1', 'pos2', 'pos3', 'pos4',
147
'inflection_type', 'inflection_form',
148
'base_form', 'reading', 'pronunciation'
149
])
150
151
# Use with IPA dictionary
152
tagger = GenericTagger(wrapper=IpaFeatures)
153
154
text = "走っています"
155
nodes = tagger(text)
156
157
for node in nodes:
158
print(f"Surface: {node.surface}")
159
print(f"POS: {node.feature.pos1}")
160
print(f"Base form: {node.feature.base_form}")
161
print(f"Reading: {node.feature.reading}")
162
print("---")
163
```
164
165
### Working with Different Dictionary Types
166
167
```python
168
from fugashi import GenericTagger, Tagger
169
170
# Default Tagger (UniDic with auto-detection)
171
unidic_tagger = Tagger()
172
173
# Generic tagger with tuple features
174
generic_tagger = GenericTagger()
175
176
# Generic tagger with specific dictionary path
177
custom_tagger = GenericTagger('-d /path/to/custom/dictionary')
178
179
text = "辞書を比較する"
180
181
print("UniDic features:")
182
nodes = unidic_tagger(text)
183
for node in nodes:
184
print(f"{node.surface}: {node.feature.lemma}")
185
186
print("\nGeneric tuple features:")
187
nodes = generic_tagger(text)
188
for node in nodes:
189
print(f"{node.surface}: {node.feature[6]}") # Base form at index 6
190
```
191
192
### Dictionary Discovery and Setup
193
194
```python
195
from fugashi import try_import_unidic, Tagger
196
197
# Check for UniDic installation
198
unidic_path = try_import_unidic()
199
if unidic_path:
200
print(f"UniDic found at: {unidic_path}")
201
202
# Tagger will automatically use this
203
tagger = Tagger()
204
print("Tagger initialized with auto-discovered UniDic")
205
else:
206
print("No UniDic package found")
207
print("Install with: pip install 'fugashi[unidic-lite]'")
208
```
209
210
### Building Custom Dictionary
211
212
```python
213
from fugashi import build_dictionary
214
import os
215
216
# Prepare CSV data for custom dictionary
217
csv_content = """surface,left_context,right_context,cost,pos1,pos2,pos3,pos4,inflection,conjugation,base,reading,pronunciation
218
専門用語,1,1,5000,名詞,一般,*,*,*,*,専門用語,センモンヨウゴ,センモンヨーゴ
219
固有名詞,1,1,3000,名詞,固有名詞,*,*,*,*,固有名詞,コユウメイシ,コユーメーシ
220
"""
221
222
# Write CSV file
223
with open('custom_dict.csv', 'w', encoding='utf-8') as f:
224
f.write(csv_content)
225
226
try:
227
# Build dictionary
228
build_dictionary('-f utf8 -t utf8 custom_dict.csv custom_dict_dir')
229
print("Custom dictionary built successfully")
230
231
# Use custom dictionary
232
from fugashi import GenericTagger
233
tagger = GenericTagger(f'-d {os.path.abspath("custom_dict_dir")}')
234
235
result = tagger.parse("専門用語の解析")
236
print(f"Result: {result}")
237
238
except Exception as e:
239
print(f"Dictionary building failed: {e}")
240
finally:
241
# Cleanup
242
if os.path.exists('custom_dict.csv'):
243
os.remove('custom_dict.csv')
244
```
245
246
### Advanced Dictionary Configuration
247
248
```python
249
from fugashi import GenericTagger
250
251
# Multiple dictionaries (system + user)
252
args = '-d /path/to/system/dict -u /path/to/user/dict1 -u /path/to/user/dict2'
253
tagger = GenericTagger(args)
254
255
# Different output formats
256
wakati_tagger = GenericTagger('-Owakati') # Space-separated tokens
257
yomi_tagger = GenericTagger('-Oyomi') # Reading only
258
node_tagger = GenericTagger('-Onode') # Node format
259
260
text = "複数の辞書設定"
261
262
print("Wakati:", wakati_tagger.parse(text))
263
print("Yomi:", yomi_tagger.parse(text))
264
print("Node:", node_tagger.parse(text))
265
266
# Check what dictionaries are loaded
267
for i, dict_info in enumerate(tagger.dictionary_info):
268
dict_type = "System" if i == 0 else f"User {i}"
269
print(f"{dict_type} dictionary: {dict_info['filename']}")
270
```
271
272
### Feature Wrapper for Unknown Words
273
274
```python
275
from fugashi import GenericTagger, create_feature_wrapper
276
277
# Create wrapper that handles variable field counts
278
FlexibleFeatures = create_feature_wrapper('FlexibleFeatures', [
279
'pos1', 'pos2', 'pos3', 'pos4', 'pos5', 'pos6',
280
'base_form', 'reading', 'pronunciation'
281
], default='*') # Use '*' as default instead of None
282
283
tagger = GenericTagger(wrapper=FlexibleFeatures)
284
285
text = "日本語とEnglishのmixed文章"
286
nodes = tagger(text)
287
288
for node in nodes:
289
print(f"Surface: {node.surface}")
290
print(f"POS1: {node.feature.pos1}")
291
print(f"Base: {node.feature.base_form}")
292
print(f"Unknown: {node.is_unk}")
293
print("---")
294
```