Cython MeCab wrapper for fast, pythonic Japanese tokenization and morphological analysis.
—
Token representation and morphological feature access that provides detailed linguistic information for each tokenized element. These classes and structures enable comprehensive analysis of Japanese text morphology.
Base node class representing a single token from MeCab tokenization with access to surface forms, morphological features, and metadata.
class Node:
    """A single token produced by MeCab tokenization.

    Exposes the token's surface form, parsed and raw morphological
    features, byte lengths, part-of-speech metadata, status flags,
    and surrounding whitespace.
    """

    @property
    def surface(self) -> str:
        """Surface form of the token (the actual text).

        Returns:
            The surface string of the token
        """
        ...

    @surface.setter
    def surface(self, value: str) -> None:
        """Set the surface form of the token.

        Args:
            value: New surface form string
        """
        ...

    @property
    def feature(self) -> NamedTuple:
        """Parsed feature data from the dictionary as a named tuple.

        Returns:
            Named tuple containing morphological features
            (structure depends on dictionary)
        """
        ...

    @property
    def feature_raw(self) -> str:
        """Raw feature string from MeCab without parsing.

        Returns:
            Comma-separated feature string as returned by MeCab
        """
        ...

    @property
    def length(self) -> int:
        """Length of the surface form in bytes.

        Returns:
            Byte length of the token surface
        """
        ...

    @property
    def rlength(self) -> int:
        """Total length including trailing whitespace in bytes.

        Returns:
            Total byte length including whitespace
        """
        ...

    @property
    def posid(self) -> int:
        """Part-of-speech ID from MeCab.

        Returns:
            Numeric POS identifier
        """
        ...

    @property
    def char_type(self) -> int:
        """Character type classification from MeCab.

        Returns:
            Numeric character type code
        """
        ...

    @property
    def stat(self) -> int:
        """Node status from MeCab.

        Returns:
            Status code: 0=normal, 1=unknown, 2=BOS (beginning of
            sentence), 3=EOS (end of sentence)
        """
        ...

    @property
    def is_unk(self) -> bool:
        """Whether the token is unknown to the dictionary.

        Returns:
            True if the token was not found in the dictionary
        """
        ...

    @property
    def white_space(self) -> str:
        """Whitespace characters following this token.

        Returns:
            String containing trailing whitespace
        """
        ...

    @white_space.setter
    def white_space(self, value: str) -> None:
        """Set the whitespace following this token.

        Args:
            value: Whitespace string
        """
        ...

    def __repr__(self) -> str:
        """String representation of the node.

        Returns:
            Surface form or special markers for BOS/EOS nodes
        """
        ...

# UniDic-specific node class that extends Node with additional
# convenience methods for UniDic dictionary features.
class UnidicNode(Node):
    """UniDic-specific node extending Node with convenience accessors
    for UniDic dictionary features.
    """

    @property
    def pos(self) -> str:
        """Four-field part-of-speech value formatted as comma-separated string.

        Returns:
            POS string in format "pos1,pos2,pos3,pos4"
        """
        ...

# Named tuple structures providing structured access to UniDic
# dictionary features across different schema versions.
# UniDic 2.1.2 schema: 17 feature fields.
UnidicFeatures17 = NamedTuple('UnidicFeatures17', [
    ('pos1', str),      # Major part-of-speech category
    ('pos2', str),      # Middle part-of-speech category
    ('pos3', str),      # Minor part-of-speech category
    ('pos4', str),      # Sub part-of-speech category
    ('cType', str),     # Conjugation type
    ('cForm', str),     # Conjugation form
    ('lForm', str),     # Lemma reading form
    ('lemma', str),     # Lemma (dictionary form)
    ('orth', str),      # Orthographic form
    ('pron', str),      # Pronunciation
    ('orthBase', str),  # Orthographic base form
    ('pronBase', str),  # Pronunciation base form
    ('goshu', str),     # Word origin classification
    ('iType', str),     # Inflection type
    ('iForm', str),     # Inflection form
    ('fType', str),     # Form type
    ('fForm', str),     # Form form
])

# Extended schema: the 17 base fields plus 9 additional fields (26 total).
UnidicFeatures26 = NamedTuple('UnidicFeatures26', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('kana', str),      # Kana representation
    ('kanaBase', str),  # Kana base form
    ('form', str),      # Form information
    ('formBase', str),  # Form base
    ('iConType', str),  # Initial connection type
    ('fConType', str),  # Final connection type
    ('aType', str),     # Accent type
    ('aConType', str),  # Accent connection type
    ('aModeType', str), # Accent mode type
])

# Newest schema: 29 fields, including lexicon/lemma IDs.
UnidicFeatures29 = NamedTuple('UnidicFeatures29', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('iConType', str), ('fConType', str), ('type', str), ('kana', str), ('kanaBase', str),
    ('form', str), ('formBase', str), ('aType', str), ('aConType', str),
    ('aModType', str),  # Accent modification type
    ('lid', str),       # Lexicon ID
    ('lemma_id', str),  # Lemma ID
])

from fugashi import Tagger
# Basic tokenization: iterate tokens and inspect surface/feature metadata.
tagger = Tagger()
text = "美しい花が咲いている。"
nodes = tagger(text)
for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"Lemma: {node.feature.lemma}")
    print(f"POS: {node.pos}")
    print(f"Is unknown: {node.is_unk}")
    print(f"Length: {node.length}")
    print("---")

from fugashi import Tagger, UnidicFeatures17
# Structured feature access via the named-tuple feature attribute.
tagger = Tagger()
text = "走っている"
nodes = tagger(text)
for node in nodes:
    feature = node.feature
    # Access structured features
    print(f"Surface: {node.surface}")
    print(f"POS1: {feature.pos1}")           # Major POS category
    print(f"POS2: {feature.pos2}")           # Middle POS category
    print(f"Lemma: {feature.lemma}")         # Dictionary form
    print(f"Reading: {feature.pron}")        # Pronunciation
    print(f"Inflection: {feature.cType}")    # Conjugation type
    # Handle schema differences: aType only exists on 26/29-field schemas
    if hasattr(feature, 'aType'):
        print(f"Accent: {feature.aType}")
    print("---")

from fugashi import Tagger
# Detecting out-of-dictionary (unknown) tokens in mixed-script text.
tagger = Tagger()
text = "日本語とmixedテキスト"  # Mixed Japanese and English
nodes = tagger(text)
for node in nodes:
    if node.is_unk:
        print(f"Unknown word: {node.surface}")
        print(f"Character type: {node.char_type}")
    else:
        print(f"Known word: {node.surface} -> {node.feature.lemma}")

from fugashi import Tagger
# Reconstructing the original text, including whitespace, from tokens.
tagger = Tagger()
text = "これは\tタブ文字を\n含む文章です。"
nodes = tagger(text)
# Reconstruct original text with whitespace
reconstructed = ""
for node in nodes:
    reconstructed += node.surface + node.white_space
print(f"Original: {repr(text)}")
print(f"Reconstructed: {repr(reconstructed)}")
print(f"Match: {text == reconstructed}")
# Access specific whitespace
for i, node in enumerate(nodes):
    if node.white_space:
        print(f"Node {i} ({node.surface}) followed by: {repr(node.white_space)}")

from fugashi import Tagger
# Inspecting raw vs. parsed feature data and low-level MeCab metadata.
tagger = Tagger()
text = "複雑な文法情報"
nodes = tagger(text)
for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"Raw features: {node.feature_raw}")
    print(f"Parsed features: {node.feature}")
    print(f"POS ID: {node.posid}")
    print(f"Node status: {node.stat}")
    print("---")

from fugashi import Tagger
# Mapping numeric node status codes to human-readable labels.
tagger = Tagger()
text = "短い文。"
nodes = tagger.parseToNodeList(text)
# Note: BOS/EOS nodes are typically filtered out in parseToNodeList
# but are present in the raw MeCab node chain
status_map = {0: "Normal", 1: "Unknown", 2: "BOS", 3: "EOS"}  # hoisted: loop-invariant
for node in nodes:
    print(f"{node.surface} (status: {status_map.get(node.stat, 'Other')})")

# Install with Tessl CLI
npx tessl i tessl/pypi-fugashi