Cython MeCab wrapper for fast, pythonic Japanese tokenization and morphological analysis.
---
Core tokenization functionality that provides Japanese text segmentation and parsing through MeCab. These classes and methods form the foundation of fugashi's text processing capabilities.
The main tagger class with automatic UniDic support and feature format detection. Recommended for most use cases involving UniDic dictionaries.
class Tagger:
    """Tagger with automatic UniDic support and feature format detection.

    Recommended for most use cases involving UniDic dictionaries.
    """

    def __init__(self, arg: str = '') -> None:
        """Initialize Tagger with UniDic support and automatic feature format detection.

        Args:
            arg: MeCab arguments string (e.g., '-Owakati' for wakati mode)

        Raises:
            RuntimeError: If MeCab initialization fails
        """
        ...

    def __call__(self, text: str) -> List[UnidicNode]:
        """Parse text and return list of UnidicNode objects. Alias for parseToNodeList.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of UnidicNode objects representing tokens
        """
        ...

    def parse(self, text: str) -> str:
        """Parse text and return formatted string output.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            Formatted string with token information (format depends on MeCab options)
        """
        ...

    def parseToNodeList(self, text: str) -> List[UnidicNode]:
        """Parse text and return list of UnidicNode objects.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of UnidicNode objects with surface forms and features
        """
        ...

    def nbest(self, text: str, num: int = 10) -> str:
        """Return n-best tokenization candidates as formatted string.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            Formatted string with multiple tokenization options
        """
        ...

    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[UnidicNode]]:
        """Return n-best tokenization candidates as lists of nodes.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            List of tokenization alternatives, each as a list of UnidicNode objects
        """
        ...

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Get information about loaded dictionaries.

        Returns:
            List of dictionaries containing filename, charset, size, and version info
        """
        ...
...

Generic tagger supporting any MeCab dictionary with customizable feature wrappers. Use when working with non-UniDic dictionaries or when custom feature handling is needed.
class GenericTagger:
    """Generic tagger supporting any MeCab dictionary with customizable feature wrappers.

    Use when working with non-UniDic dictionaries or when custom feature
    handling is needed.
    """

    def __init__(self, args: str = '', wrapper: Callable = make_tuple, quiet: bool = False) -> None:
        """Initialize GenericTagger with custom dictionary and feature wrapper.

        Args:
            args: MeCab arguments string including dictionary specification
            wrapper: Feature wrapper function (default: make_tuple)
            quiet: Suppress error details on initialization failure (default: False)

        Raises:
            RuntimeError: If MeCab initialization fails
        """
        ...

    def __call__(self, text: str) -> List[Node]:
        """Parse text and return list of Node objects. Alias for parseToNodeList.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of Node objects representing tokens
        """
        ...

    def parse(self, text: str) -> str:
        """Parse text and return formatted string output.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            Formatted string with token information (format depends on MeCab options)
        """
        ...

    def parseToNodeList(self, text: str) -> List[Node]:
        """Parse text and return list of Node objects.

        Args:
            text: Input Japanese text to tokenize

        Returns:
            List of Node objects with surface forms and features
        """
        ...

    def nbest(self, text: str, num: int = 10) -> str:
        """Return n-best tokenization candidates as formatted string.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            Formatted string with multiple tokenization options
        """
        ...

    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[Node]]:
        """Return n-best tokenization candidates as lists of nodes.

        Args:
            text: Input Japanese text to tokenize
            num: Number of best candidates to return (default: 10)

        Returns:
            List of tokenization alternatives, each as a list of Node objects
        """
        ...

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Get information about loaded dictionaries.

        Returns:
            List of dictionaries containing filename, charset, size, and version info
        """
        ...
...

from fugashi import Tagger
# Initialize with default UniDic
tagger = Tagger()

# Parse Japanese text
text = "私は学校に行きます。"
nodes = tagger(text)
for node in nodes:
    print(f"{node.surface}\t{node.feature.lemma}\t{node.pos}")

from fugashi import Tagger
# Initialize in wakati mode
tagger = Tagger('-Owakati')

text = "私は学校に行きます。"
result = tagger.parse(text)
print(result)  # "私 は 学校 に 行き ます 。"

from fugashi import Tagger
tagger = Tagger()
text = "外国人参政権"

# Get multiple tokenization candidates
candidates = tagger.nbestToNodeList(text, 3)
for i, candidate in enumerate(candidates):
    tokens = [node.surface for node in candidate]
    print(f"Candidate {i+1}: {' '.join(tokens)}")

from fugashi import GenericTagger
# Using with IPA dictionary
tagger = GenericTagger()
text = "今日は良い天気です。"
nodes = tagger(text)
for node in nodes:
    # Access features by index (varies by dictionary)
    print(f"{node.surface}\t{node.feature[0]}\t{node.feature[1]}")

from fugashi import GenericTagger, create_feature_wrapper
# Create custom feature wrapper
CustomFeatures = create_feature_wrapper(
    'CustomFeatures',
    ['pos1', 'pos2', 'pos3', 'pos4', 'inflection', 'conjugation', 'base_form'],
)

# Use with generic tagger
tagger = GenericTagger(wrapper=CustomFeatures)
text = "走っている"
nodes = tagger(text)
for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"POS: {node.feature.pos1}")
    print(f"Base form: {node.feature.base_form}")

Install with Tessl CLI
npx tessl i tessl/pypi-fugashi