fugashi: a Cython wrapper around MeCab for fast, Pythonic Japanese tokenization and morphological analysis.

This section covers dictionary configuration, dictionary-information access, and custom dictionary building. It enables advanced MeCab dictionary management and customization for specialized use cases and research applications.
Access detailed information about loaded MeCab dictionaries, including metadata, file paths, and encoding information.
class Tagger:
    # Wrapper around a MeCab tagger configured with UniDic.

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Get information about loaded dictionaries.

        Returns:
            List of dictionaries with keys:
            - 'filename': Path to dictionary file
            - 'charset': Character encoding used
            - 'size': Dictionary size in entries
            - 'version': Dictionary version (may not be reliable)
        """
        ...
class GenericTagger:
    # Wrapper around a MeCab tagger for arbitrary (non-UniDic) dictionaries.

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Get information about loaded dictionaries.

        Returns:
            List of dictionaries with keys:
            - 'filename': Path to dictionary file
            - 'charset': Character encoding used
            - 'size': Dictionary size in entries
            - 'version': Dictionary version (may not be reliable)
        """
        ...


# Create custom named tuple wrappers for dictionary features to enable
# structured access to morphological data from any MeCab dictionary format.
def create_feature_wrapper(name: str, fields: List[str], default: Any = None) -> NamedTuple:
    """Create a namedtuple-based wrapper for dictionary features.

    Sets default values to None for handling unknown words that may have
    fewer fields than the full schema.

    Args:
        name: Name for the resulting namedtuple class
        fields: List of field names for the features
        default: Default value for missing fields (default: None)

    Returns:
        Named tuple class that can be used as wrapper for GenericTagger
    """
    from collections import namedtuple

    # Give every field a default so short feature rows (typical for unknown
    # words) can still be wrapped without raising TypeError.
    return namedtuple(name, fields, defaults=[default] * len(fields))


# Discover and import installed UniDic packages for automatic dictionary
# configuration.
def try_import_unidic() -> Optional[str]:
    """Import unidic or unidic-lite packages if available.

    Attempts to import unidic first, then unidic-lite as fallback.
    Used internally by Tagger for automatic dictionary discovery.

    Returns:
        Dictionary directory path if found, None if no UniDic package available
    """
    # Prefer the full unidic package; fall back to the smaller unidic-lite.
    try:
        import unidic
        return unidic.DICDIR
    except ImportError:
        pass
    try:
        import unidic_lite
        return unidic_lite.DICDIR
    except ImportError:
        return None


# Build custom user dictionaries from CSV input using MeCab's dictionary
# compilation functionality.
def build_dictionary(args: str) -> None:
    """Build user dictionary using MeCab's dictionary building functionality.

    Wraps MeCab's mecab-dict-index command for compiling custom dictionaries
    from formatted CSV input files.

    Args:
        args: Command line arguments for dictionary building
            (e.g., "-f utf8 -t utf8 input.csv output_dir")

    Raises:
        RuntimeError: If dictionary building fails
    """
    # Stub: the real implementation lives in the compiled Cython extension
    # and shells into MeCab's mecab-dict-index machinery.
    ...


# Helper functions for creating flexible tagger interfaces.
def make_tuple(*args) -> tuple:
    """Create tuple from variable arguments.

    Wrapper function that provides the same interface as namedtuple
    constructors for use as a feature wrapper in GenericTagger.

    Args:
        *args: Variable number of arguments

    Returns:
        Tuple containing all provided arguments
    """
    # *args already arrives packed as a tuple; return it unchanged.
    return args
# Example: inspect the dictionaries loaded by a default Tagger.
from fugashi import Tagger

tagger = Tagger()

# Get information about loaded dictionaries
for i, dict_info in enumerate(tagger.dictionary_info):
    print(f"Dictionary {i+1}:")
    print(f"  Filename: {dict_info['filename']}")
    print(f"  Charset: {dict_info['charset']}")
    print(f"  Size: {dict_info['size']:,} entries")
    print(f"  Version: {dict_info['version']}")
    print()

from fugashi import GenericTagger, create_feature_wrapper
# Example: structured feature access via a custom namedtuple wrapper.
# Create custom feature wrapper for IPA dictionary
IpaFeatures = create_feature_wrapper('IpaFeatures', [
    'pos1', 'pos2', 'pos3', 'pos4',
    'inflection_type', 'inflection_form',
    'base_form', 'reading', 'pronunciation'
])

# Use with IPA dictionary
tagger = GenericTagger(wrapper=IpaFeatures)

text = "走っています"
nodes = tagger(text)
for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"POS: {node.feature.pos1}")
    print(f"Base form: {node.feature.base_form}")
    print(f"Reading: {node.feature.reading}")
    print("---")

from fugashi import GenericTagger, Tagger
# Example: compare UniDic-backed Tagger output with GenericTagger tuples.
# Default Tagger (UniDic with auto-detection)
unidic_tagger = Tagger()

# Generic tagger with tuple features
generic_tagger = GenericTagger()

# Generic tagger with specific dictionary path
custom_tagger = GenericTagger('-d /path/to/custom/dictionary')

text = "辞書を比較する"

print("UniDic features:")
nodes = unidic_tagger(text)
for node in nodes:
    print(f"{node.surface}: {node.feature.lemma}")

print("\nGeneric tuple features:")
nodes = generic_tagger(text)
for node in nodes:
    print(f"{node.surface}: {node.feature[6]}")  # Base form at index 6

from fugashi import try_import_unidic, Tagger
# Example: probe for an installed UniDic package before creating a Tagger.
# Check for UniDic installation
unidic_path = try_import_unidic()

if unidic_path:
    print(f"UniDic found at: {unidic_path}")
    # Tagger will automatically use this
    tagger = Tagger()
    print("Tagger initialized with auto-discovered UniDic")
else:
    print("No UniDic package found")
    print("Install with: pip install 'fugashi[unidic-lite]'")

from fugashi import build_dictionary
# Example: compile a custom user dictionary from CSV, then use it.
import os

# Prepare CSV data for custom dictionary
csv_content = """surface,left_context,right_context,cost,pos1,pos2,pos3,pos4,inflection,conjugation,base,reading,pronunciation
専門用語,1,1,5000,名詞,一般,*,*,*,*,専門用語,センモンヨウゴ,センモンヨーゴ
固有名詞,1,1,3000,名詞,固有名詞,*,*,*,*,固有名詞,コユウメイシ,コユーメーシ
"""

# Write CSV file
with open('custom_dict.csv', 'w', encoding='utf-8') as f:
    f.write(csv_content)

try:
    # Build dictionary
    build_dictionary('-f utf8 -t utf8 custom_dict.csv custom_dict_dir')
    print("Custom dictionary built successfully")

    # Use custom dictionary
    from fugashi import GenericTagger
    tagger = GenericTagger(f'-d {os.path.abspath("custom_dict_dir")}')
    result = tagger.parse("専門用語の解析")
    print(f"Result: {result}")
except Exception as e:
    print(f"Dictionary building failed: {e}")
finally:
    # Cleanup
    if os.path.exists('custom_dict.csv'):
        os.remove('custom_dict.csv')

from fugashi import GenericTagger
# Example: multiple dictionaries and alternate MeCab output formats.
# Multiple dictionaries (system + user)
args = '-d /path/to/system/dict -u /path/to/user/dict1 -u /path/to/user/dict2'
tagger = GenericTagger(args)

# Different output formats
wakati_tagger = GenericTagger('-Owakati')  # Space-separated tokens
yomi_tagger = GenericTagger('-Oyomi')      # Reading only
node_tagger = GenericTagger('-Onode')      # Node format

text = "複数の辞書設定"
print("Wakati:", wakati_tagger.parse(text))
print("Yomi:", yomi_tagger.parse(text))
print("Node:", node_tagger.parse(text))

# Check what dictionaries are loaded
for i, dict_info in enumerate(tagger.dictionary_info):
    dict_type = "System" if i == 0 else f"User {i}"
    print(f"{dict_type} dictionary: {dict_info['filename']}")

from fugashi import GenericTagger, create_feature_wrapper
# Example: a wrapper whose fields default to '*' for short feature rows.
# Create wrapper that handles variable field counts
FlexibleFeatures = create_feature_wrapper('FlexibleFeatures', [
    'pos1', 'pos2', 'pos3', 'pos4', 'pos5', 'pos6',
    'base_form', 'reading', 'pronunciation'
], default='*')  # Use '*' as default instead of None

tagger = GenericTagger(wrapper=FlexibleFeatures)

text = "日本語とEnglishのmixed文章"
nodes = tagger(text)
for node in nodes:
    print(f"Surface: {node.surface}")
    print(f"POS1: {node.feature.pos1}")
    print(f"Base: {node.feature.base_form}")
    print(f"Unknown: {node.is_unk}")
    print("---")

# Install with Tessl CLI
Install with the Tessl CLI: `npx tessl i tessl/pypi-fugashi`