Cython MeCab wrapper for fast, pythonic Japanese tokenization and morphological analysis.
A high-performance Cython wrapper for MeCab, providing fast and pythonic Japanese tokenization and morphological analysis. Fugashi offers comprehensive access to MeCab's tokenization capabilities with built-in support for UniDic dictionaries and extensive morphological feature extraction.

```
pip install fugashi
pip install 'fugashi[unidic-lite]'
```

```python
import fugashi
from fugashi import Tagger, GenericTagger, Node, UnidicNode
```

For basic tokenization:

```python
from fugashi import Tagger
```

For advanced dictionary management:

```python
from fugashi import GenericTagger, create_feature_wrapper
```

A complete basic example:

```python
from fugashi import Tagger

# Initialize tagger with UniDic (automatic detection)
tagger = Tagger()
# Tokenize text
text = "麩菓子は、麩を主材料とした日本の菓子。"
nodes = tagger(text)
# Access token information
for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"Lemma: {node.feature.lemma}")
    print(f"POS: {node.pos}")
    print(f"Features: {node.feature}")
    print("---")
# Get formatted output
formatted = tagger.parse(text)
print(formatted) # Traditional MeCab output format
# Wakati (word-segmented) mode
wakati_tagger = Tagger('-Owakati')
words = wakati_tagger.parse(text)
print(words)  # Space-separated tokens
```

Fugashi provides a layered architecture for Japanese text processing:

- Core tokenization: `Tagger` and `GenericTagger` for parsing, node lists, wakati mode, and n-best analysis
- Token representation: `Node` and `UnidicNode` with detailed morphological features
- Dictionary management: dictionary information access, feature wrappers, and custom dictionary building
- Console scripts: `fugashi`, `fugashi-info`, and `fugashi-build-dict`

This design enables both simple tokenization workflows and sophisticated morphological analysis applications, with automatic dictionary format detection and extensive customization options.

Primary tokenization functionality, including text parsing, node-list generation, wakati mode, and n-best parsing. These classes provide the essential Japanese text-processing capabilities.

```python
class Tagger:
    def __init__(self, arg: str = '') -> None: ...
    def __call__(self, text: str) -> List[UnidicNode]: ...
    def parse(self, text: str) -> str: ...
    def parseToNodeList(self, text: str) -> List[UnidicNode]: ...
    def nbest(self, text: str, num: int = 10) -> str: ...
    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[UnidicNode]]: ...
```
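
As a quick illustration of the n-best API above, a minimal sketch, assuming a UniDic dictionary such as `unidic-lite` is installed; the sample sentence is just a classically ambiguous string:

```python
from fugashi import Tagger

tagger = Tagger()
text = "外国人参政権"

# Formatted text output for the three best segmentations
print(tagger.nbest(text, 3))

# The same parses as node lists, one list per candidate
for candidate in tagger.nbestToNodeList(text, 3):
    print([node.surface for node in candidate])
```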

```python
class GenericTagger:
    def __init__(self, args: str = '', wrapper: Callable = make_tuple, quiet: bool = False) -> None: ...
    def __call__(self, text: str) -> List[Node]: ...
    def parse(self, text: str) -> str: ...
    def parseToNodeList(self, text: str) -> List[Node]: ...
    def nbest(self, text: str, num: int = 10) -> str: ...
    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[Node]]: ...
```
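
A minimal sketch of `GenericTagger` usage, assuming MeCab can locate a default dictionary on your system (pass MeCab arguments such as `-d /path/to/dic` to select one explicitly; the path is a placeholder):

```python
from fugashi import GenericTagger

# GenericTagger makes no assumptions about the dictionary's feature
# format; with the default make_tuple wrapper, node.feature is a
# plain tuple, so fields are accessed by index rather than by name.
tagger = GenericTagger()

for node in tagger("麩を主材料とした菓子"):
    print(node.surface, node.feature[0])
```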

Token representation and morphological feature access including surface forms, part-of-speech information, lemmas, pronunciation data, and grammatical features. These provide detailed linguistic information for each token.

```python
class Node:
    @property
    def surface(self) -> str: ...
    @property
    def feature(self) -> NamedTuple: ...
    @property
    def feature_raw(self) -> str: ...
    @property
    def length(self) -> int: ...
    @property
    def rlength(self) -> int: ...
    @property
    def posid(self) -> int: ...
    @property
    def char_type(self) -> int: ...
    @property
    def stat(self) -> int: ...
    @property
    def is_unk(self) -> bool: ...
    @property
    def white_space(self) -> str: ...

class UnidicNode(Node):
    @property
    def pos(self) -> str: ...
```
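
A small sketch exercising the structural `Node` properties above; it assumes a UniDic dictionary is installed, as in the earlier examples:

```python
from fugashi import Tagger

tagger = Tagger()

for node in tagger("Pythonで形態素解析"):
    # surface is the token text; white_space is any whitespace that
    # preceded it; is_unk flags tokens missing from the dictionary.
    print(f"{node.surface!r} len={node.length} "
          f"ws={node.white_space!r} unk={node.is_unk}")
```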

```python
UnidicFeatures17 = NamedTuple('UnidicFeatures17', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str)
])

UnidicFeatures26 = NamedTuple('UnidicFeatures26', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('kana', str), ('kanaBase', str), ('form', str), ('formBase', str),
    ('iConType', str), ('fConType', str), ('aType', str), ('aConType', str), ('aModeType', str)
])

UnidicFeatures29 = NamedTuple('UnidicFeatures29', [
    ('pos1', str), ('pos2', str), ('pos3', str), ('pos4', str),
    ('cType', str), ('cForm', str), ('lForm', str), ('lemma', str),
    ('orth', str), ('pron', str), ('orthBase', str), ('pronBase', str),
    ('goshu', str), ('iType', str), ('iForm', str), ('fType', str), ('fForm', str),
    ('iConType', str), ('fConType', str), ('type', str), ('kana', str), ('kanaBase', str),
    ('form', str), ('formBase', str), ('aType', str), ('aConType', str),
    ('aModType', str), ('lid', str), ('lemma_id', str)
])
```
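
Which of these tuples a node carries depends on the UniDic version behind the tagger; in all three, fields are accessed by name. A minimal sketch:

```python
from fugashi import Tagger

tagger = Tagger()

for node in tagger("日本語の形態素解析"):
    f = node.feature  # one of the UnidicFeatures* named tuples
    # pos1 is the coarse part of speech; lemma and pron give the
    # dictionary form and the pronunciation.
    print(node.surface, f.pos1, f.lemma, f.pron)
```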

Dictionary configuration, information access, and custom dictionary building. These functions enable advanced dictionary management and customization for specific use cases.

```python
def create_feature_wrapper(name: str, fields: List[str], default: Any = None) -> NamedTuple: ...
def try_import_unidic() -> Optional[str]: ...
def build_dictionary(args: str) -> None: ...

class Tagger:
    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]: ...

class GenericTagger:
    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]: ...
```
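
A sketch combining `create_feature_wrapper` with `dictionary_info`; the field names passed to the wrapper here are illustrative placeholders and must match your dictionary's actual CSV columns:

```python
from fugashi import GenericTagger, create_feature_wrapper

# Hypothetical feature layout for a custom dictionary.
MyFeatures = create_feature_wrapper('MyFeatures', ['pos', 'lemma'])

tagger = GenericTagger(wrapper=MyFeatures)

# Inspect the loaded dictionaries: filename, charset, size, version.
for info in tagger.dictionary_info:
    print(info)
```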

Console scripts for command-line text processing, dictionary information, and dictionary building. These provide direct access to fugashi functionality from the terminal.

```python
def main():
    """Command-line interface for text tokenization.

    Console script: fugashi

    Processes text from stdin, treating each line as a sentence.
    Supports all MeCab options via command-line arguments.

    Examples:
        echo "日本語" | fugashi
        echo "日本語" | fugashi -Owakati
    """
    ...

def info():
    """Display dictionary and configuration information.

    Console script: fugashi-info

    Shows detailed information about loaded dictionaries, including
    version, size, charset, and file paths.

    Example:
        fugashi-info
    """
    ...

def build_dict():
    """Build a custom MeCab user dictionary from CSV input.

    Console script: fugashi-build-dict

    Compiles CSV dictionary sources into MeCab binary format.
    Defaults to UTF-8 encoding for input and output.

    Example:
        fugashi-build-dict -o custom.dic input.csv
    """
    ...
```