Tessl Tile for pypi/fugashi@1.5.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

dictionary-management.md index.md nodes-features.md tokenization.md

dictionary-management.md docs/

0
# Dictionary Management
1

2
Configure MeCab dictionaries, inspect the dictionaries a tagger has loaded, and build custom user dictionaries. These capabilities support advanced dictionary management and customization for specialized use cases and research applications.
3

4
## Capabilities
5

6
### Dictionary Information Access
7

8
Access detailed information about loaded MeCab dictionaries including metadata, file paths, and encoding information.
9

10
```python { .api }
11
class Tagger:
12
    @property
13
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
14
        """Get information about loaded dictionaries.
15
        
16
        Returns:
17
            List of dictionaries with keys:
18
            - 'filename': Path to dictionary file
19
            - 'charset': Character encoding used
20
            - 'size': Dictionary size in entries
21
            - 'version': Dictionary version (may not be reliable)
22
        """
23
        ...
24

25
class GenericTagger:
26
    @property  
27
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
28
        """Get information about loaded dictionaries.
29
        
30
        Returns:
31
            List of dictionaries with keys:
32
            - 'filename': Path to dictionary file
33
            - 'charset': Character encoding used
34
            - 'size': Dictionary size in entries
35
            - 'version': Dictionary version (may not be reliable)
36
        """
37
        ...
38
```
39

40
### Feature Wrapper Creation
41

42
Create custom named tuple wrappers for dictionary features to enable structured access to morphological data from any MeCab dictionary format.
43

44
```python { .api }
45
def create_feature_wrapper(name: str, fields: List[str], default: Any = None) -> NamedTuple:
46
    """Create a namedtuple-based wrapper for dictionary features.
47
    
48
    Missing fields fall back to `default` (None if omitted), covering unknown words that may have
49
    fewer fields than the full schema.
50
    
51
    Args:
52
        name: Name for the resulting namedtuple class
53
        fields: List of field names for the features
54
        default: Default value for missing fields (default: None)
55
        
56
    Returns:
57
        Named tuple class that can be used as a wrapper for GenericTagger
58
    """
59
    ...
60
```
61

62
### Dictionary Discovery
63

64
Discover and import installed UniDic packages for automatic dictionary configuration.
65

66
```python { .api }
67
def try_import_unidic() -> Optional[str]:
68
    """Import unidic or unidic-lite packages if available.
69
    
70
    Attempts to import unidic first, then unidic-lite as fallback.
71
    Used internally by Tagger for automatic dictionary discovery.
72
    
73
    Returns:
74
        Dictionary directory path if found, None if no UniDic package available
75
    """
76
    ...
77
```
78

79
### Dictionary Building
80

81
Build custom user dictionaries from CSV input using MeCab's dictionary compilation functionality.
82

83
```python { .api }
84
def build_dictionary(args: str) -> None:
85
    """Build user dictionary using MeCab's dictionary building functionality.
86
    
87
    Wraps MeCab's mecab-dict-index command for compiling custom dictionaries
88
    from formatted CSV input files.
89
    
90
    Args:
91
        args: Command line arguments for dictionary building
92
              (e.g., "-f utf8 -t utf8 input.csv output_dir")
93
              
94
    Raises:
95
        RuntimeError: If dictionary building fails
96
    """
97
    ...
98
```
99

100
### Utility Functions
101

102
Helper functions that let GenericTagger expose features as plain tuples instead of named tuples.
103

104
```python { .api }
105
def make_tuple(*args) -> tuple:
106
    """Create tuple from variable arguments.
107
    
108
    Wrapper function that provides the same interface as namedtuple
109
    constructors for use as a feature wrapper in GenericTagger.
110
    
111
    Args:
112
        *args: Variable number of arguments
113
        
114
    Returns:
115
        Tuple containing all provided arguments
116
    """
117
    ...
118
```
119

120
## Usage Examples
121

122
### Dictionary Information Inspection
123

124
```python
125
from fugashi import Tagger
126

127
tagger = Tagger()
128

129
# Get information about loaded dictionaries
130
for i, dict_info in enumerate(tagger.dictionary_info):
131
    print(f"Dictionary {i+1}:")
132
    print(f"  Filename: {dict_info['filename']}")
133
    print(f"  Charset: {dict_info['charset']}")
134
    print(f"  Size: {dict_info['size']:,} entries")
135
    print(f"  Version: {dict_info['version']}")
136
    print()
137
```
138

139
### Custom Feature Wrapper
140

141
```python
142
from fugashi import GenericTagger, create_feature_wrapper
143

144
# Create custom feature wrapper for IPA dictionary
145
IpaFeatures = create_feature_wrapper('IpaFeatures', [
146
    'pos1', 'pos2', 'pos3', 'pos4',
147
    'inflection_type', 'inflection_form', 
148
    'base_form', 'reading', 'pronunciation'
149
])
150

151
# Use with IPA dictionary
152
tagger = GenericTagger(wrapper=IpaFeatures)
153

154
text = "走っています"
155
nodes = tagger(text)
156

157
for node in nodes:
158
    print(f"Surface: {node.surface}")
159
    print(f"POS: {node.feature.pos1}")
160
    print(f"Base form: {node.feature.base_form}")
161
    print(f"Reading: {node.feature.reading}")
162
    print("---")
163
```
164

165
### Working with Different Dictionary Types
166

167
```python
168
from fugashi import GenericTagger, Tagger
169

170
# Default Tagger (UniDic with auto-detection)
171
unidic_tagger = Tagger()
172

173
# Generic tagger with tuple features
174
generic_tagger = GenericTagger()
175

176
# Generic tagger with specific dictionary path
177
custom_tagger = GenericTagger('-d /path/to/custom/dictionary')
178

179
text = "辞書を比較する"
180

181
print("UniDic features:")
182
nodes = unidic_tagger(text)
183
for node in nodes:
184
    print(f"{node.surface}: {node.feature.lemma}")
185

186
print("\nGeneric tuple features:")
187
nodes = generic_tagger(text)
188
for node in nodes:
189
    print(f"{node.surface}: {node.feature[6]}")  # Base form at index 6
190
```
191

192
### Dictionary Discovery and Setup
193

194
```python
195
from fugashi import try_import_unidic, Tagger
196

197
# Check for UniDic installation
198
unidic_path = try_import_unidic()
199
if unidic_path:
200
    print(f"UniDic found at: {unidic_path}")
201
    
202
    # Tagger will automatically use this
203
    tagger = Tagger()
204
    print("Tagger initialized with auto-discovered UniDic")
205
else:
206
    print("No UniDic package found")
207
    print("Install with: pip install 'fugashi[unidic-lite]'")
208
```
209

210
### Building Custom Dictionary
211

212
```python
213
from fugashi import build_dictionary
214
import os
215

216
# Prepare CSV data for custom dictionary
217
csv_content = """surface,left_context,right_context,cost,pos1,pos2,pos3,pos4,inflection,conjugation,base,reading,pronunciation
218
専門用語,1,1,5000,名詞,一般,*,*,*,*,専門用語,センモンヨウゴ,センモンヨーゴ
219
固有名詞,1,1,3000,名詞,固有名詞,*,*,*,*,固有名詞,コユウメイシ,コユーメーシ
220
"""
221

222
# Write CSV file
223
with open('custom_dict.csv', 'w', encoding='utf-8') as f:
224
    f.write(csv_content)
225

226
try:
227
    # Build dictionary
228
    build_dictionary('-f utf8 -t utf8 custom_dict.csv custom_dict_dir')
229
    print("Custom dictionary built successfully")
230
    
231
    # Use custom dictionary
232
    from fugashi import GenericTagger
233
    tagger = GenericTagger(f'-d {os.path.abspath("custom_dict_dir")}')
234
    
235
    result = tagger.parse("専門用語の解析")
236
    print(f"Result: {result}")
237
    
238
except Exception as e:
239
    print(f"Dictionary building failed: {e}")
240
finally:
241
    # Cleanup
242
    if os.path.exists('custom_dict.csv'):
243
        os.remove('custom_dict.csv')
244
```
245

246
### Advanced Dictionary Configuration
247

248
```python
249
from fugashi import GenericTagger
250

251
# Multiple dictionaries (system + user)
252
args = '-d /path/to/system/dict -u /path/to/user/dict1 -u /path/to/user/dict2'
253
tagger = GenericTagger(args)
254

255
# Different output formats
256
wakati_tagger = GenericTagger('-Owakati')  # Space-separated tokens
257
yomi_tagger = GenericTagger('-Oyomi')      # Reading only
258
node_tagger = GenericTagger('-Onode')      # Node format
259

260
text = "複数の辞書設定"
261

262
print("Wakati:", wakati_tagger.parse(text))
263
print("Yomi:", yomi_tagger.parse(text))
264
print("Node:", node_tagger.parse(text))
265

266
# Check what dictionaries are loaded
267
for i, dict_info in enumerate(tagger.dictionary_info):
268
    dict_type = "System" if i == 0 else f"User {i}"
269
    print(f"{dict_type} dictionary: {dict_info['filename']}")
270
```
271

272
### Feature Wrapper for Unknown Words
273

274
```python
275
from fugashi import GenericTagger, create_feature_wrapper
276

277
# Create wrapper that handles variable field counts
278
FlexibleFeatures = create_feature_wrapper('FlexibleFeatures', [
279
    'pos1', 'pos2', 'pos3', 'pos4', 'pos5', 'pos6', 
280
    'base_form', 'reading', 'pronunciation'
281
], default='*')  # Use '*' as default instead of None
282

283
tagger = GenericTagger(wrapper=FlexibleFeatures)
284

285
text = "日本語とEnglishのmixed文章"
286
nodes = tagger(text)
287

288
for node in nodes:
289
    print(f"Surface: {node.surface}")
290
    print(f"POS1: {node.feature.pos1}")
291
    print(f"Base: {node.feature.base_form}")
292
    print(f"Unknown: {node.is_unk}")
293
    print("---")
294
```

Version

Tile

Files

dictionary-management.md docs/

dictionary-management.md docs/