# Nodes and Features

Token representation and morphological feature access that provides detailed linguistic information for each tokenized element. These classes and structures enable comprehensive analysis of Japanese text morphology.

## Capabilities

### Node Class

Base node class representing a single token from MeCab tokenization with access to surface forms, morphological features, and metadata.

```python { .api }
class Node:
    @property
    def surface(self) -> str:
        """Surface form of the token (the actual text).

        Returns:
            The surface string of the token
        """
        ...

    @surface.setter
    def surface(self, value: str) -> None:
        """Set the surface form of the token.

        Args:
            value: New surface form string
        """
        ...

    @property
    def feature(self) -> NamedTuple:
        """Parsed feature data from the dictionary as a named tuple.

        Returns:
            Named tuple containing morphological features (structure depends on dictionary)
        """
        ...

    @property
    def feature_raw(self) -> str:
        """Raw feature string from MeCab without parsing.

        Returns:
            Comma-separated feature string as returned by MeCab
        """
        ...

    @property
    def length(self) -> int:
        """Length of the surface form in bytes.

        Returns:
            Byte length of the token surface
        """
        ...

    @property
    def rlength(self) -> int:
        """Total length in bytes, including whitespace preceding the token.

        Returns:
            Byte length of the surface plus any whitespace before it
        """
        ...

    @property
    def posid(self) -> int:
        """Part-of-speech ID from MeCab.

        Returns:
            Numeric POS identifier
        """
        ...

    @property
    def char_type(self) -> int:
        """Character type classification from MeCab.

        Returns:
            Numeric character type code
        """
        ...

    @property
    def stat(self) -> int:
        """Node status from MeCab.

        Returns:
            Status code: 0=normal, 1=unknown, 2=BOS (beginning of sentence), 3=EOS (end of sentence)
        """
        ...

    @property
    def is_unk(self) -> bool:
        """Whether the token is unknown to the dictionary.

        Returns:
            True if the token was not found in the dictionary
        """
        ...

    @property
    def white_space(self) -> str:
        """Whitespace characters preceding this token.

        Returns:
            String containing the whitespace before the token
        """
        ...

    @white_space.setter
    def white_space(self, value: str) -> None:
        """Set the whitespace preceding this token.

        Args:
            value: Whitespace string
        """
        ...

    def __repr__(self) -> str:
        """String representation of the node.

        Returns:
            Surface form or special markers for BOS/EOS nodes
        """
        ...
```

### UnidicNode Class

UniDic-specific node class that extends Node with additional convenience methods for UniDic dictionary features.

```python { .api }
class UnidicNode(Node):
    @property
    def pos(self) -> str:
        """Four-field part-of-speech value formatted as comma-separated string.

        Returns:
            POS string in format "pos1,pos2,pos3,pos4"
        """
        ...
```

### UniDic Feature Structures

Named tuple structures providing structured access to UniDic dictionary features across different schema versions.

```python { .api }
UnidicFeatures17 = NamedTuple('UnidicFeatures17', [
    ('pos1', str),      # Major part-of-speech category
    ('pos2', str),      # Middle part-of-speech category
    ('pos3', str),      # Minor part-of-speech category
    ('pos4', str),      # Sub part-of-speech category
    ('cType', str),     # Conjugation type
    ('cForm', str),     # Conjugation form
    ('lForm', str),     # Lemma reading form
    ('lemma', str),     # Lemma (dictionary form)
    ('orth', str),      # Orthographic form
    ('pron', str),      # Pronunciation
    ('orthBase', str),  # Orthographic base form
    ('pronBase', str),  # Pronunciation base form
    ('goshu', str),     # Word origin classification
    ('iType', str),     # Inflection type
    ('iForm', str),     # Inflection form
    ('fType', str),     # Form type
    ('fForm', str)      # Form variant
])
```

```python { .api }
UnidicFeatures26 = NamedTuple('UnidicFeatures26', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('kana', str),      # Kana representation
    ('kanaBase', str),  # Kana base form
    ('form', str),      # Form information
    ('formBase', str),  # Form base
    ('iConType', str),  # Initial connection type
    ('fConType', str),  # Final connection type
    ('aType', str),     # Accent type
    ('aConType', str),  # Accent connection type
    ('aModType', str)   # Accent modification type
])
```

```python { .api }
UnidicFeatures29 = NamedTuple('UnidicFeatures29', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('iConType', str), ('fConType', str), ('type', str), ('kana', str), ('kanaBase', str),
    ('form', str), ('formBase', str), ('aType', str), ('aConType', str),
    ('aModType', str),  # Accent modification type
    ('lid', str),       # Lexicon ID
    ('lemma_id', str)   # Lemma ID
])
```

## Usage Examples

### Basic Node Access

```python
from fugashi import Tagger

tagger = Tagger()
text = "美しい花が咲いている。"
nodes = tagger(text)

for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"Lemma: {node.feature.lemma}")
    print(f"POS: {node.pos}")
    print(f"Is unknown: {node.is_unk}")
    print(f"Length: {node.length}")
    print("---")
```

### Feature Access by Schema

```python
from fugashi import Tagger, UnidicFeatures17

tagger = Tagger()
text = "走っている"
nodes = tagger(text)

for node in nodes:
    feature = node.feature

    # Access structured features
    print(f"Surface: {node.surface}")
    print(f"POS1: {feature.pos1}")        # Major POS category
    print(f"POS2: {feature.pos2}")        # Middle POS category
    print(f"Lemma: {feature.lemma}")      # Dictionary form
    print(f"Reading: {feature.pron}")     # Pronunciation
    print(f"Inflection: {feature.cType}") # Conjugation type

    # Handle schema differences
    if hasattr(feature, 'aType'):
        print(f"Accent: {feature.aType}")

    print("---")
```

### Working with Unknown Words

```python
from fugashi import Tagger

tagger = Tagger()
text = "日本語とmixedテキスト"  # Mixed Japanese and English
nodes = tagger(text)

for node in nodes:
    if node.is_unk:
        print(f"Unknown word: {node.surface}")
        print(f"Character type: {node.char_type}")
    else:
        print(f"Known word: {node.surface} -> {node.feature.lemma}")
```

### Whitespace and Text Reconstruction

```python
from fugashi import Tagger

tagger = Tagger()
text = "これは\tタブ文字を\n含む文章です。"
nodes = tagger(text)

# Reconstruct original text with whitespace
# (white_space holds the whitespace that comes before each token)
reconstructed = ""
for node in nodes:
    reconstructed += node.white_space + node.surface

print(f"Original: {repr(text)}")
print(f"Reconstructed: {repr(reconstructed)}")
print(f"Match: {text == reconstructed}")

# Access specific whitespace
for i, node in enumerate(nodes):
    if node.white_space:
        print(f"Node {i} ({node.surface}) preceded by: {repr(node.white_space)}")
```

### Raw Feature Analysis

```python
from fugashi import Tagger

tagger = Tagger()
text = "複雑な文法情報"
nodes = tagger(text)

for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"Raw features: {node.feature_raw}")
    print(f"Parsed features: {node.feature}")
    print(f"POS ID: {node.posid}")
    print(f"Node status: {node.stat}")
    print("---")
```

### Node Status Handling

```python
from fugashi import Tagger

tagger = Tagger()
text = "短い文。"
nodes = tagger.parseToNodeList(text)

# Note: BOS/EOS nodes are typically filtered out in parseToNodeList
# but are present in the raw MeCab node chain
for node in nodes:
    status_map = {0: "Normal", 1: "Unknown", 2: "BOS", 3: "EOS"}
    print(f"{node.surface} (status: {status_map.get(node.stat, 'Other')})")
```