# Tokenization

Core tokenization functionality that provides Japanese text segmentation and parsing through MeCab. These classes and methods form the foundation of fugashi's text processing capabilities.

## Capabilities

### Tagger Class

The main tagger class with automatic UniDic support and feature format detection. Recommended for most use cases involving UniDic dictionaries.

```python { .api }
class Tagger:
    def __init__(self, arg: str = '') -> None:
        """Initialize Tagger with UniDic support and automatic feature format detection.

        Args:
            arg: MeCab arguments string (e.g., '-Owakati' for wakati mode)

        Raises:
            RuntimeError: If MeCab initialization fails
        """
        ...

    def __call__(self, text: str) -> List[UnidicNode]:
        """Parse text and return list of UnidicNode objects. Alias for parseToNodeList.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of UnidicNode objects representing tokens
        """
        ...

    def parse(self, text: str) -> str:
        """Parse text and return formatted string output.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            Formatted string with token information (format depends on MeCab options)
        """
        ...

    def parseToNodeList(self, text: str) -> List[UnidicNode]:
        """Parse text and return list of UnidicNode objects.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of UnidicNode objects with surface forms and features
        """
        ...

    def nbest(self, text: str, num: int = 10) -> str:
        """Return n-best tokenization candidates as formatted string.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            Formatted string with multiple tokenization options
        """
        ...

    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[UnidicNode]]:
        """Return n-best tokenization candidates as lists of nodes.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            List of tokenization alternatives, each as a list of UnidicNode objects
        """
        ...

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Get information about loaded dictionaries.

        Returns:
            List of dictionaries containing filename, charset, size, and version info
        """
        ...
```
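
A quick sketch of the `dictionary_info` property documented above. The key names follow the docstring; the exact values depend on which UniDic variant is installed.

```python
from fugashi import Tagger

tagger = Tagger()

# Each entry describes one loaded dictionary; per the docstring above,
# the dicts carry filename, charset, size, and version information.
for info in tagger.dictionary_info:
    print(info['filename'], info['charset'], info['size'], info['version'])
```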

### GenericTagger Class

Generic tagger supporting any MeCab dictionary with customizable feature wrappers. Use when working with non-UniDic dictionaries or when custom feature handling is needed.

```python { .api }
class GenericTagger:
    def __init__(self, args: str = '', wrapper: Callable = make_tuple, quiet: bool = False) -> None:
        """Initialize GenericTagger with custom dictionary and feature wrapper.

        Args:
            args: MeCab arguments string including dictionary specification
            wrapper: Feature wrapper function (default: make_tuple)
            quiet: Suppress error details on initialization failure (default: False)

        Raises:
            RuntimeError: If MeCab initialization fails
        """
        ...

    def __call__(self, text: str) -> List[Node]:
        """Parse text and return list of Node objects. Alias for parseToNodeList.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of Node objects representing tokens
        """
        ...

    def parse(self, text: str) -> str:
        """Parse text and return formatted string output.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            Formatted string with token information (format depends on MeCab options)
        """
        ...

    def parseToNodeList(self, text: str) -> List[Node]:
        """Parse text and return list of Node objects.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of Node objects with surface forms and features
        """
        ...

    def nbest(self, text: str, num: int = 10) -> str:
        """Return n-best tokenization candidates as formatted string.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            Formatted string with multiple tokenization options
        """
        ...

    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[Node]]:
        """Return n-best tokenization candidates as lists of nodes.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            List of tokenization alternatives, each as a list of Node objects
        """
        ...

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Get information about loaded dictionaries.

        Returns:
            List of dictionaries containing filename, charset, size, and version info
        """
        ...
```
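
A minimal sketch of the `args` parameter: MeCab's standard `-d` option selects a dictionary directory. The path below is a placeholder; substitute the actual dicdir of your installation.

```python
from fugashi import GenericTagger

# Hypothetical dicdir path; replace with the location of your MeCab dictionary.
tagger = GenericTagger('-d /usr/local/lib/mecab/dic/ipadic')

for node in tagger("日本語を解析する"):
    print(node.surface, node.feature)
```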

## Usage Examples

### Basic Tokenization

```python
from fugashi import Tagger

# Initialize with default UniDic
tagger = Tagger()

# Parse Japanese text
text = "私は学校に行きます。"
nodes = tagger(text)

for node in nodes:
    print(f"{node.surface}\t{node.feature.lemma}\t{node.pos}")
```
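
Here `node.surface` is the token text exactly as it appears in the input, while `node.pos` is a comma-joined shortcut for the UniDic part-of-speech fields (pos1 through pos4).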

### Wakati Mode (Word Segmentation)

```python
from fugashi import Tagger

# Initialize in wakati mode
tagger = Tagger('-Owakati')

text = "私は学校に行きます。"
result = tagger.parse(text)
print(result)  # "私 は 学校 に 行き ます 。"
```
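
Note that `-Owakati` only changes the string output of `parse`; calling the tagger directly still returns node objects with full features.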

### N-Best Parsing

```python
from fugashi import Tagger

tagger = Tagger()
text = "外国人参政権"

# Get multiple tokenization candidates
candidates = tagger.nbestToNodeList(text, 3)

for i, candidate in enumerate(candidates):
    tokens = [node.surface for node in candidate]
    print(f"Candidate {i+1}: {' '.join(tokens)}")
```
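
The string-based variant `nbest` returns the same candidates as one formatted string. A minimal sketch, using the method documented above; the exact formatting depends on the dictionary and any MeCab output options:

```python
from fugashi import Tagger

tagger = Tagger()

# Each candidate is emitted in MeCab's output format, one after another.
print(tagger.nbest("外国人参政権", 3))
```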

### Generic Dictionary Usage

```python
from fugashi import GenericTagger

# Using with IPA dictionary
tagger = GenericTagger()

text = "今日は良い天気です。"
nodes = tagger(text)

for node in nodes:
    # Access features by index (varies by dictionary)
    print(f"{node.surface}\t{node.feature[0]}\t{node.feature[1]}")
```
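
With the default `make_tuple` wrapper, `node.feature` is a plain tuple, so fields are accessed positionally and the meaning of each index depends on the dictionary's feature schema.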

### Custom Feature Wrapper

```python
from fugashi import GenericTagger, create_feature_wrapper

# Create custom feature wrapper
CustomFeatures = create_feature_wrapper('CustomFeatures',
    ['pos1', 'pos2', 'pos3', 'pos4', 'inflection', 'conjugation', 'base_form'])

# Use with generic tagger
tagger = GenericTagger(wrapper=CustomFeatures)

text = "走っている"
nodes = tagger(text)

for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"POS: {node.feature.pos1}")
    print(f"Base form: {node.feature.base_form}")
```
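
The field names must be listed in the same order as the columns of the dictionary's feature CSV; once wrapped, features read as named attributes rather than positional indexes, which keeps downstream code self-documenting.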