tessl/pypi-fugashi

Cython MeCab wrapper for fast, pythonic Japanese tokenization and morphological analysis.

—

Pending

Overview

Eval results

Files

Dictionary Management

Name: tessl/pypi-fugashi
Author: tessl

Configure MeCab dictionaries, inspect the dictionaries a tagger has loaded, and build custom dictionaries. Together these features support advanced dictionary management and extensive customization for specific use cases and research applications.

Capabilities

Dictionary Information Access

Access detailed information about loaded MeCab dictionaries including metadata, file paths, and encoding information.

class Tagger:
    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Describe every dictionary loaded by this tagger.

        Returns:
            A list with one dict per loaded dictionary. Each dict has
            the following keys:

            - 'filename': Path to the dictionary file
            - 'charset': Character encoding used by the dictionary
            - 'size': Dictionary size, counted in entries
            - 'version': Dictionary version (may not be reliable, so
              avoid basing logic on it)
        """
        ...

class GenericTagger:
    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Describe every dictionary loaded by this tagger.

        Returns:
            A list with one dict per loaded dictionary. Each dict has
            the following keys:

            - 'filename': Path to the dictionary file
            - 'charset': Character encoding used by the dictionary
            - 'size': Dictionary size, counted in entries
            - 'version': Dictionary version (may not be reliable, so
              avoid basing logic on it)

            NOTE(review): presumably the system dictionary comes first,
            followed by user dictionaries in load order -- confirm
            against the fugashi/MeCab documentation.
        """
        ...

Feature Wrapper Creation

Create custom named tuple wrappers for dictionary features to enable structured access to morphological data from any MeCab dictionary format.

def create_feature_wrapper(name: str, fields: List[str], default: Any = None) -> NamedTuple:
    """Create a namedtuple class that wraps a dictionary's features.

    Fields receive a default value so that unknown words, which may
    have fewer fields than the full schema, can still be wrapped.

    Args:
        name: Name for the resulting namedtuple class
        fields: List of field names for the features
        default: Default value for missing fields (default: None)
            NOTE(review): the original description said defaults are
            always set to None; confirm that a non-None ``default`` is
            actually honored by the installed fugashi version.

    Returns:
        Named tuple class (a class, not an instance) that can be used
        as the ``wrapper`` argument of GenericTagger.
        NOTE(review): the ``NamedTuple`` annotation describes an
        instance; a ``Type[...]`` annotation may be more accurate.
    """
    ...

Dictionary Discovery

Discover and import installed UniDic packages for automatic dictionary configuration.

def try_import_unidic() -> Optional[str]:
    """Locate an installed UniDic package by importing it.

    Attempts to import ``unidic`` first and falls back to
    ``unidic-lite``. Tagger uses this internally for automatic
    dictionary discovery.

    Returns:
        The dictionary directory path of the package that was found,
        or None if no UniDic package is available.
    """
    ...

Dictionary Building

Build custom user dictionaries from CSV input using MeCab's dictionary compilation functionality.

def build_dictionary(args: str) -> None:
    """Build a user dictionary using MeCab's dictionary compiler.

    Wraps MeCab's ``mecab-dict-index`` command, which compiles custom
    dictionaries from formatted CSV input files.

    Args:
        args: Command line arguments for dictionary building, given as
              a single string
              (e.g., "-f utf8 -t utf8 input.csv output_dir")
              NOTE(review): ``mecab-dict-index`` normally takes the
              system dictionary via ``-d`` and the output file via
              ``-u`` rather than a positional output directory, and
              the string may need a leading program-name token --
              confirm this example against the MeCab documentation.

    Raises:
        RuntimeError: If dictionary building fails
            NOTE(review): confirm that failures surface as an
            exception rather than terminating the process.
    """
    ...

Utility Functions

Helper functions for creating flexible tagger interfaces, such as a plain-tuple feature wrapper for `GenericTagger`.

def make_tuple(*args) -> tuple:
    """Pack the given positional arguments into a tuple.

    Offers the same call interface as a namedtuple constructor (one
    positional argument per field), so it can serve as a feature
    wrapper in GenericTagger when named fields are not needed.

    Args:
        *args: Variable number of arguments

    Returns:
        Tuple containing all provided arguments, in the order given
    """
    ...

Usage Examples

Dictionary Information Inspection

from fugashi import Tagger

tagger = Tagger()

# Fetch the metadata once, then report each loaded dictionary in turn
loaded_dictionaries = tagger.dictionary_info
for i, dict_info in enumerate(loaded_dictionaries):
    print(f"Dictionary {i+1}:")  # 1-based numbering for display
    print(f"  Filename: {dict_info['filename']}")
    print(f"  Charset: {dict_info['charset']}")
    print(f"  Size: {dict_info['size']:,} entries")  # thousands separators
    print(f"  Version: {dict_info['version']}")
    print()  # blank line between dictionaries

Custom Feature Wrapper

from fugashi import GenericTagger, create_feature_wrapper

# Field names for the IPA dictionary feature columns
ipa_field_names = [
    'pos1', 'pos2', 'pos3', 'pos4',
    'inflection_type', 'inflection_form',
    'base_form', 'reading', 'pronunciation',
]

# Build the namedtuple wrapper class from those field names
IpaFeatures = create_feature_wrapper('IpaFeatures', ipa_field_names)

# The tagger wraps each node's features with IpaFeatures
# (intended for use with an IPA dictionary)
tagger = GenericTagger(wrapper=IpaFeatures)

text = "走っています"

# Features are now reachable by name instead of by position
for node in tagger(text):
    print(f"Surface: {node.surface}")
    print(f"POS: {node.feature.pos1}")
    print(f"Base form: {node.feature.base_form}")
    print(f"Reading: {node.feature.reading}")
    print("---")

Working with Different Dictionary Types

from fugashi import GenericTagger, Tagger

# Default Tagger: UniDic, located automatically
unidic_tagger = Tagger()

# GenericTagger without a wrapper: features come back as plain tuples
generic_tagger = GenericTagger()

# GenericTagger pointed at an explicit dictionary directory
# (placeholder path; shown for reference and not used below)
custom_tagger = GenericTagger('-d /path/to/custom/dictionary')

text = "辞書を比較する"

# UniDic features are accessed by attribute name
print("UniDic features:")
for node in unidic_tagger(text):
    print(f"{node.surface}: {node.feature.lemma}")

# Tuple features are accessed by position
print("\nGeneric tuple features:")
for node in generic_tagger(text):
    print(f"{node.surface}: {node.feature[6]}")  # Base form at index 6

Dictionary Discovery and Setup

from fugashi import Tagger, try_import_unidic

# A falsy result means no UniDic package is available
unidic_path = try_import_unidic()
if not unidic_path:
    print("No UniDic package found")
    print("Install with: pip install 'fugashi[unidic-lite]'")
else:
    print(f"UniDic found at: {unidic_path}")

    # Tagger picks up the discovered dictionary on its own
    tagger = Tagger()
    print("Tagger initialized with auto-discovered UniDic")

Building Custom Dictionary

import os

from fugashi import GenericTagger, build_dictionary

# Placeholder: directory of the compiled system dictionary that the user
# dictionary is built against. The CSV below uses IPAdic-style columns, so
# this should point at an IPAdic-format dictionary.
SYSTEM_DICT_DIR = '/path/to/system/dict'
CSV_PATH = 'custom_dict.csv'
USER_DICT_PATH = os.path.abspath('custom.dic')

# MeCab dictionary CSV files have NO header row: every line is compiled as
# an entry. The column layout used here is:
#   surface,left_context,right_context,cost,pos1,pos2,pos3,pos4,
#   inflection,conjugation,base,reading,pronunciation
# NOTE(review): the left/right context IDs should match the system
# dictionary's left-id.def / right-id.def -- confirm the values below.
csv_content = """\
専門用語,1,1,5000,名詞,一般,*,*,*,*,専門用語,センモンヨウゴ,センモンヨーゴ
固有名詞,1,1,3000,名詞,固有名詞,*,*,*,*,固有名詞,コユウメイシ,コユーメーシ
"""

# Write CSV file
with open(CSV_PATH, 'w', encoding='utf-8') as f:
    f.write(csv_content)

try:
    # The argument string is handed to mecab-dict-index as an argv array,
    # so the first token fills the program-name slot. -d names the system
    # dictionary to build against and -u names the user dictionary to write.
    build_dictionary(
        f'mecab-dict-index -d {SYSTEM_DICT_DIR} -u {USER_DICT_PATH} '
        f'-f utf8 -t utf8 {CSV_PATH}'
    )
except Exception as e:
    print(f"Dictionary building failed: {e}")
else:
    print("Custom dictionary built successfully")

    # A user dictionary is loaded with -u on top of its system dictionary.
    # Errors raised here propagate as-is instead of being reported as a
    # build failure.
    tagger = GenericTagger(f'-d {SYSTEM_DICT_DIR} -u {USER_DICT_PATH}')

    result = tagger.parse("専門用語の解析")
    print(f"Result: {result}")
finally:
    # Cleanup: the CSV is only an intermediate file; the built dictionary
    # is kept.
    if os.path.exists(CSV_PATH):
        os.remove(CSV_PATH)

Advanced Dictionary Configuration

from fugashi import GenericTagger

# One system dictionary (-d) plus two user dictionaries (-u); placeholder paths
args = '-d /path/to/system/dict -u /path/to/user/dict1 -u /path/to/user/dict2'
tagger = GenericTagger(args)

# One tagger per output format
# NOTE(review): -Oyomi and -Onode presumably rely on output formats defined
# by the dictionary in use -- confirm the target dictionary provides them.
wakati_tagger = GenericTagger('-Owakati')  # Space-separated tokens
yomi_tagger = GenericTagger('-Oyomi')      # Reading only
node_tagger = GenericTagger('-Onode')      # Node format

text = "複数の辞書設定"

# Run the same text through each output format
for label, formatter in (
    ("Wakati:", wakati_tagger),
    ("Yomi:", yomi_tagger),
    ("Node:", node_tagger),
):
    print(label, formatter.parse(text))

# List the loaded dictionaries: the first entry is labelled as the system
# dictionary, the remaining ones as numbered user dictionaries
for i, dict_info in enumerate(tagger.dictionary_info):
    if i == 0:
        dict_type = "System"
    else:
        dict_type = f"User {i}"
    print(f"{dict_type} dictionary: {dict_info['filename']}")

Feature Wrapper for Unknown Words

from fugashi import GenericTagger, create_feature_wrapper

# Full feature schema; unknown words may supply fewer fields than this
flexible_field_names = [
    'pos1', 'pos2', 'pos3', 'pos4', 'pos5', 'pos6',
    'base_form', 'reading', 'pronunciation',
]

# Request '*' rather than None as the value for missing fields
FlexibleFeatures = create_feature_wrapper(
    'FlexibleFeatures',
    flexible_field_names,
    default='*',
)

tagger = GenericTagger(wrapper=FlexibleFeatures)

text = "日本語とEnglishのmixed文章"

# Print a few features per node, plus whether the node is an unknown word
for node in tagger(text):
    print(f"Surface: {node.surface}")
    print(f"POS1: {node.feature.pos1}")
    print(f"Base: {node.feature.base_form}")
    print(f"Unknown: {node.is_unk}")
    print("---")

Install with Tessl CLI