or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

dictionary-management.md · index.md · nodes-features.md · tokenization.md

docs/nodes-features.md

# Nodes and Features

Token representation and morphological feature access that provides detailed linguistic information for each tokenized element. These classes and structures enable comprehensive analysis of Japanese text morphology.

## Capabilities

### Node Class

Base node class representing a single token from MeCab tokenization with access to surface forms, morphological features, and metadata.

```python { .api }
class Node:
    @property
    def surface(self) -> str:
        """Surface form of the token (the actual text).

        Returns:
            The surface string of the token
        """
        ...

    @surface.setter
    def surface(self, value: str) -> None:
        """Set the surface form of the token.

        Args:
            value: New surface form string
        """
        ...

    @property
    def feature(self) -> NamedTuple:
        """Parsed feature data from the dictionary as a named tuple.

        Returns:
            Named tuple containing morphological features (structure depends on dictionary)
        """
        ...

    @property
    def feature_raw(self) -> str:
        """Raw feature string from MeCab without parsing.

        Returns:
            Comma-separated feature string as returned by MeCab
        """
        ...

    @property
    def length(self) -> int:
        """Length of the surface form in bytes.

        Returns:
            Byte length of the token surface
        """
        ...

    @property
    def rlength(self) -> int:
        """Total length including trailing whitespace in bytes.

        Returns:
            Total byte length including whitespace
        """
        ...

    @property
    def posid(self) -> int:
        """Part-of-speech ID from MeCab.

        Returns:
            Numeric POS identifier
        """
        ...

    @property
    def char_type(self) -> int:
        """Character type classification from MeCab.

        Returns:
            Numeric character type code
        """
        ...

    @property
    def stat(self) -> int:
        """Node status from MeCab.

        Returns:
            Status code: 0=normal, 1=unknown, 2=BOS (beginning of sentence), 3=EOS (end of sentence)
        """
        ...

    @property
    def is_unk(self) -> bool:
        """Whether the token is unknown to the dictionary.

        Returns:
            True if the token was not found in the dictionary
        """
        ...

    @property
    def white_space(self) -> str:
        """Whitespace characters following this token.

        Returns:
            String containing trailing whitespace
        """
        ...

    @white_space.setter
    def white_space(self, value: str) -> None:
        """Set the whitespace following this token.

        Args:
            value: Whitespace string
        """
        ...

    def __repr__(self) -> str:
        """String representation of the node.

        Returns:
            Surface form or special markers for BOS/EOS nodes
        """
        ...
```

### UnidicNode Class

UniDic-specific node class that extends Node with additional convenience methods for UniDic dictionary features.

```python { .api }
class UnidicNode(Node):
    @property
    def pos(self) -> str:
        """Four-field part-of-speech value formatted as comma-separated string.

        Returns:
            POS string in format "pos1,pos2,pos3,pos4"
        """
        ...
```

### UniDic Feature Structures

Named tuple structures providing structured access to UniDic dictionary features across different schema versions.

```python { .api }
UnidicFeatures17 = NamedTuple('UnidicFeatures17', [
    ('pos1', str),      # Major part-of-speech category
    ('pos2', str),      # Middle part-of-speech category
    ('pos3', str),      # Minor part-of-speech category
    ('pos4', str),      # Sub part-of-speech category
    ('cType', str),     # Conjugation type
    ('cForm', str),     # Conjugation form
    ('lForm', str),     # Lemma reading form
    ('lemma', str),     # Lemma (dictionary form)
    ('orth', str),      # Orthographic form
    ('pron', str),      # Pronunciation
    ('orthBase', str),  # Orthographic base form
    ('pronBase', str),  # Pronunciation base form
    ('goshu', str),     # Word origin classification
    ('iType', str),     # Inflection type
    ('iForm', str),     # Inflection form
    ('fType', str),     # Form type
    ('fForm', str),     # Form form
])
```

```python { .api }
UnidicFeatures26 = NamedTuple('UnidicFeatures26', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('kana', str),      # Kana representation
    ('kanaBase', str),  # Kana base form
    ('form', str),      # Form information
    ('formBase', str),  # Form base
    ('iConType', str),  # Initial connection type
    ('fConType', str),  # Final connection type
    ('aType', str),     # Accent type
    ('aConType', str),  # Accent connection type
    ('aModeType', str), # Accent mode type
])
```

```python { .api }
UnidicFeatures29 = NamedTuple('UnidicFeatures29', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('iConType', str), ('fConType', str), ('type', str), ('kana', str), ('kanaBase', str),
    ('form', str), ('formBase', str), ('aType', str), ('aConType', str),
    ('aModType', str),  # Accent modification type
    ('lid', str),       # Lexicon ID
    ('lemma_id', str),  # Lemma ID
])
```

## Usage Examples

### Basic Node Access

```python
from fugashi import Tagger

tagger = Tagger()
text = "美しい花が咲いている。"
nodes = tagger(text)

for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"Lemma: {node.feature.lemma}")
    print(f"POS: {node.pos}")
    print(f"Is unknown: {node.is_unk}")
    print(f"Length: {node.length}")
    print("---")
```

### Feature Access by Schema

```python
from fugashi import Tagger, UnidicFeatures17

tagger = Tagger()
text = "走っている"
nodes = tagger(text)

for node in nodes:
    feature = node.feature

    # Access structured features
    print(f"Surface: {node.surface}")
    print(f"POS1: {feature.pos1}")        # Major POS category
    print(f"POS2: {feature.pos2}")        # Middle POS category
    print(f"Lemma: {feature.lemma}")      # Dictionary form
    print(f"Reading: {feature.pron}")     # Pronunciation
    print(f"Inflection: {feature.cType}") # Conjugation type

    # Handle schema differences
    if hasattr(feature, 'aType'):
        print(f"Accent: {feature.aType}")

    print("---")
```

### Working with Unknown Words

```python
from fugashi import Tagger

tagger = Tagger()
text = "日本語とmixedテキスト"  # Mixed Japanese and English
nodes = tagger(text)

for node in nodes:
    if node.is_unk:
        print(f"Unknown word: {node.surface}")
        print(f"Character type: {node.char_type}")
    else:
        print(f"Known word: {node.surface} -> {node.feature.lemma}")
```

### Whitespace and Text Reconstruction

```python
from fugashi import Tagger

tagger = Tagger()
text = "これは\tタブ文字を\n含む文章です。"
nodes = tagger(text)

# Reconstruct original text with whitespace
reconstructed = ""
for node in nodes:
    reconstructed += node.surface + node.white_space

print(f"Original: {repr(text)}")
print(f"Reconstructed: {repr(reconstructed)}")
print(f"Match: {text == reconstructed}")

# Access specific whitespace
for i, node in enumerate(nodes):
    if node.white_space:
        print(f"Node {i} ({node.surface}) followed by: {repr(node.white_space)}")
```

### Raw Feature Analysis

```python
from fugashi import Tagger

tagger = Tagger()
text = "複雑な文法情報"
nodes = tagger(text)

for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"Raw features: {node.feature_raw}")
    print(f"Parsed features: {node.feature}")
    print(f"POS ID: {node.posid}")
    print(f"Node status: {node.stat}")
    print("---")
```

### Node Status Handling

```python
from fugashi import Tagger

tagger = Tagger()
text = "短い文。"
nodes = tagger.parseToNodeList(text)

# Note: BOS/EOS nodes are typically filtered out in parseToNodeList
# but are present in the raw MeCab node chain
for node in nodes:
    status_map = {0: "Normal", 1: "Unknown", 2: "BOS", 3: "EOS"}
    print(f"{node.surface} (status: {status_map.get(node.stat, 'Other')})")
```