or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

dictionary-management.md, index.md, nodes-features.md, tokenization.md

docs/dictionary-management.md

0

# Dictionary Management

1

2

Dictionary configuration, dictionary information access, and custom dictionary building. Together these support advanced MeCab dictionary management and customization for specialized use cases and research applications.

3

4

## Capabilities

5

6

### Dictionary Information Access

7

8

Access detailed information about each loaded MeCab dictionary: its file path, character encoding, size in entries, and version.

9

10

```python { .api }

11

class Tagger:

12

@property

13

def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:

14

"""Get information about loaded dictionaries.

15

16

Returns:

17

List of dictionaries with keys:

18

- 'filename': Path to dictionary file

19

- 'charset': Character encoding used

20

- 'size': Dictionary size in entries

21

- 'version': Dictionary version (may not be reliable)

22

"""

23

...

24

25

class GenericTagger:

26

@property

27

def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:

28

"""Get information about loaded dictionaries.

29

30

Returns:

31

List of dictionaries with keys:

32

- 'filename': Path to dictionary file

33

- 'charset': Character encoding used

34

- 'size': Dictionary size in entries

35

- 'version': Dictionary version (may not be reliable)

36

"""

37

...

38

```

39

40

### Feature Wrapper Creation

41

42

Create custom named tuple wrappers for dictionary features to enable structured access to morphological data from any MeCab dictionary format.

43

44

```python { .api }

45

def create_feature_wrapper(name: str, fields: List[str], default: Any = None) -> NamedTuple:

46

"""Create a namedtuple-based wrapper for dictionary features.

47

48

Missing fields are filled with `default` (None unless overridden), since unknown words may have

49

fewer fields than the full schema.

50

51

Args:

52

name: Name for the resulting namedtuple class

53

fields: List of field names for the features

54

default: Default value for missing fields (default: None)

55

56

Returns:

57

Named tuple class that can be used as wrapper for GenericTagger

58

"""

59

...

60

```

61

62

### Dictionary Discovery

63

64

Discover and import installed UniDic packages for automatic dictionary configuration.

65

66

```python { .api }

67

def try_import_unidic() -> Optional[str]:

68

"""Import unidic or unidic-lite packages if available.

69

70

Attempts to import unidic first, then unidic-lite as fallback.

71

Used internally by Tagger for automatic dictionary discovery.

72

73

Returns:

74

Dictionary directory path if found, None if no UniDic package available

75

"""

76

...

77

```

78

79

### Dictionary Building

80

81

Build custom user dictionaries from CSV input using MeCab's dictionary compilation functionality.

82

83

```python { .api }

84

def build_dictionary(args: str) -> None:

85

"""Build user dictionary using MeCab's dictionary building functionality.

86

87

Wraps MeCab's mecab-dict-index command for compiling custom dictionaries

88

from formatted CSV input files.

89

90

Args:

91

args: Command line arguments for dictionary building

92

(e.g., "-f utf8 -t utf8 input.csv output_dir")

93

94

Raises:

95

RuntimeError: If dictionary building fails

96

"""

97

...

98

```

99

100

### Utility Functions

101

102

Helper for using plain tuples as the feature wrapper in `GenericTagger`, for dictionaries where no named-tuple schema is defined.

103

104

```python { .api }

105

def make_tuple(*args) -> tuple:

106

"""Create tuple from variable arguments.

107

108

Wrapper function that provides the same interface as namedtuple

109

constructors for use as a feature wrapper in GenericTagger.

110

111

Args:

112

*args: Variable number of arguments

113

114

Returns:

115

Tuple containing all provided arguments

116

"""

117

...

118

```

119

120

## Usage Examples

121

122

### Dictionary Information Inspection

123

124

```python

125

from fugashi import Tagger

126

127

tagger = Tagger()

128

129

# Get information about loaded dictionaries

130

for i, dict_info in enumerate(tagger.dictionary_info):

131

print(f"Dictionary {i+1}:")

132

print(f" Filename: {dict_info['filename']}")

133

print(f" Charset: {dict_info['charset']}")

134

print(f" Size: {dict_info['size']:,} entries")

135

print(f" Version: {dict_info['version']}")

136

print()

137

```

138

139

### Custom Feature Wrapper

140

141

```python

142

from fugashi import GenericTagger, create_feature_wrapper

143

144

# Create custom feature wrapper for IPA dictionary

145

IpaFeatures = create_feature_wrapper('IpaFeatures', [

146

'pos1', 'pos2', 'pos3', 'pos4',

147

'inflection_type', 'inflection_form',

148

'base_form', 'reading', 'pronunciation'

149

])

150

151

# Use with IPA dictionary

152

tagger = GenericTagger(wrapper=IpaFeatures)

153

154

text = "走っています"

155

nodes = tagger(text)

156

157

for node in nodes:

158

print(f"Surface: {node.surface}")

159

print(f"POS: {node.feature.pos1}")

160

print(f"Base form: {node.feature.base_form}")

161

print(f"Reading: {node.feature.reading}")

162

print("---")

163

```

164

165

### Working with Different Dictionary Types

166

167

```python

168

from fugashi import GenericTagger, Tagger

169

170

# Default Tagger (UniDic with auto-detection)

171

unidic_tagger = Tagger()

172

173

# Generic tagger with tuple features

174

generic_tagger = GenericTagger()

175

176

# Generic tagger with specific dictionary path

177

custom_tagger = GenericTagger('-d /path/to/custom/dictionary')

178

179

text = "辞書を比較する"

180

181

print("UniDic features:")

182

nodes = unidic_tagger(text)

183

for node in nodes:

184

print(f"{node.surface}: {node.feature.lemma}")

185

186

print("\nGeneric tuple features:")

187

nodes = generic_tagger(text)

188

for node in nodes:

189

print(f"{node.surface}: {node.feature[6]}") # Base form at index 6

190

```

191

192

### Dictionary Discovery and Setup

193

194

```python

195

from fugashi import try_import_unidic, Tagger

196

197

# Check for UniDic installation

198

unidic_path = try_import_unidic()

199

if unidic_path:

200

print(f"UniDic found at: {unidic_path}")

201

202

# Tagger will automatically use this

203

tagger = Tagger()

204

print("Tagger initialized with auto-discovered UniDic")

205

else:

206

print("No UniDic package found")

207

print("Install with: pip install 'fugashi[unidic-lite]'")

208

```

209

210

### Building Custom Dictionary

211

212

```python

213

from fugashi import build_dictionary

214

import os

215

216

# Prepare CSV data for custom dictionary

217

csv_content = """surface,left_context,right_context,cost,pos1,pos2,pos3,pos4,inflection,conjugation,base,reading,pronunciation

218

専門用語,1,1,5000,名詞,一般,*,*,*,*,専門用語,センモンヨウゴ,センモンヨーゴ

219

固有名詞,1,1,3000,名詞,固有名詞,*,*,*,*,固有名詞,コユウメイシ,コユーメーシ

220

"""

221

222

# Write CSV file

223

with open('custom_dict.csv', 'w', encoding='utf-8') as f:

224

f.write(csv_content)

225

226

try:

227

# Build dictionary

228

build_dictionary('-f utf8 -t utf8 custom_dict.csv custom_dict_dir')

229

print("Custom dictionary built successfully")

230

231

# Use custom dictionary

232

from fugashi import GenericTagger

233

tagger = GenericTagger(f'-d {os.path.abspath("custom_dict_dir")}')

234

235

result = tagger.parse("専門用語の解析")

236

print(f"Result: {result}")

237

238

except Exception as e:

239

print(f"Dictionary building failed: {e}")

240

finally:

241

# Cleanup

242

if os.path.exists('custom_dict.csv'):

243

os.remove('custom_dict.csv')

244

```

245

246

### Advanced Dictionary Configuration

247

248

```python

249

from fugashi import GenericTagger

250

251

# Multiple dictionaries (system + user)

252

args = '-d /path/to/system/dict -u /path/to/user/dict1 -u /path/to/user/dict2'

253

tagger = GenericTagger(args)

254

255

# Different output formats

256

wakati_tagger = GenericTagger('-Owakati') # Space-separated tokens

257

yomi_tagger = GenericTagger('-Oyomi') # Reading only

258

node_tagger = GenericTagger('-Onode') # Node format

259

260

text = "複数の辞書設定"

261

262

print("Wakati:", wakati_tagger.parse(text))

263

print("Yomi:", yomi_tagger.parse(text))

264

print("Node:", node_tagger.parse(text))

265

266

# Check what dictionaries are loaded

267

for i, dict_info in enumerate(tagger.dictionary_info):

268

dict_type = "System" if i == 0 else f"User {i}"

269

print(f"{dict_type} dictionary: {dict_info['filename']}")

270

```

271

272

### Feature Wrapper for Unknown Words

273

274

```python

275

from fugashi import GenericTagger, create_feature_wrapper

276

277

# Create wrapper that handles variable field counts

278

FlexibleFeatures = create_feature_wrapper('FlexibleFeatures', [

279

'pos1', 'pos2', 'pos3', 'pos4', 'pos5', 'pos6',

280

'base_form', 'reading', 'pronunciation'

281

], default='*') # Use '*' as default instead of None

282

283

tagger = GenericTagger(wrapper=FlexibleFeatures)

284

285

text = "日本語とEnglishのmixed文章"

286

nodes = tagger(text)

287

288

for node in nodes:

289

print(f"Surface: {node.surface}")

290

print(f"POS1: {node.feature.pos1}")

291

print(f"Base: {node.feature.base_form}")

292

print(f"Unknown: {node.is_unk}")

293

print("---")

294

```