CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-fugashi

Cython MeCab wrapper for fast, pythonic Japanese tokenization and morphological analysis.

Pending
Overview
Eval results
Files

docs/tokenization.md

Tokenization

Core tokenization functionality that provides Japanese text segmentation and parsing through MeCab. These classes and methods form the foundation of fugashi's text processing capabilities.

Capabilities

Tagger Class

The main tagger class with automatic UniDic support and feature format detection. Recommended for most use cases involving UniDic dictionaries.

class Tagger:
    def __init__(self, arg: str = '') -> None:
        """Create a tagger backed by UniDic, auto-detecting the feature format.

        Args:
            arg: Raw MeCab option string, e.g. '-Owakati' for wakati output.

        Raises:
            RuntimeError: If the underlying MeCab tagger cannot be created.
        """
        ...

    def __call__(self, text: str) -> List[UnidicNode]:
        """Tokenize ``text``; shorthand for ``parseToNodeList``.

        Args:
            text: Japanese text to segment.

        Returns:
            The tokens of ``text`` as a list of UnidicNode objects.
        """
        ...

    def parse(self, text: str) -> str:
        """Tokenize ``text`` and return MeCab's formatted string output.

        Args:
            text: Japanese text to segment.

        Returns:
            A string whose layout follows the MeCab output options
            the tagger was created with.
        """
        ...

    def parseToNodeList(self, text: str) -> List[UnidicNode]:
        """Tokenize ``text`` into node objects.

        Args:
            text: Japanese text to segment.

        Returns:
            One UnidicNode per token, carrying surface form and features.
        """
        ...

    def nbest(self, text: str, num: int = 10) -> str:
        """Produce the ``num`` best segmentations as one formatted string.

        Args:
            text: Japanese text to segment.
            num: How many candidate segmentations to emit (default 10).

        Returns:
            The candidates concatenated in MeCab's output format.
        """
        ...

    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[UnidicNode]]:
        """Produce the ``num`` best segmentations as node lists.

        Args:
            text: Japanese text to segment.
            num: How many candidate segmentations to emit (default 10).

        Returns:
            One inner list of UnidicNode objects per candidate segmentation.
        """
        ...

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Metadata for every dictionary the tagger has loaded.

        Returns:
            One dict per dictionary with filename, charset, size,
            and version entries.
        """
        ...

GenericTagger Class

Generic tagger supporting any MeCab dictionary with customizable feature wrappers. Use when working with non-UniDic dictionaries or when custom feature handling is needed.

class GenericTagger:
    def __init__(self, args: str = '', wrapper: Callable = make_tuple, quiet: bool = False) -> None:
        """Create a tagger for an arbitrary MeCab dictionary.

        Args:
            args: Raw MeCab option string, including any dictionary selection.
            wrapper: Callable applied to each node's raw feature fields
                (defaults to ``make_tuple``).
            quiet: If True, suppress detailed error output when setup fails.

        Raises:
            RuntimeError: If the underlying MeCab tagger cannot be created.
        """
        ...

    def __call__(self, text: str) -> List[Node]:
        """Tokenize ``text``; shorthand for ``parseToNodeList``.

        Args:
            text: Japanese text to segment.

        Returns:
            The tokens of ``text`` as a list of Node objects.
        """
        ...

    def parse(self, text: str) -> str:
        """Tokenize ``text`` and return MeCab's formatted string output.

        Args:
            text: Japanese text to segment.

        Returns:
            A string whose layout follows the MeCab output options
            the tagger was created with.
        """
        ...

    def parseToNodeList(self, text: str) -> List[Node]:
        """Tokenize ``text`` into node objects.

        Args:
            text: Japanese text to segment.

        Returns:
            One Node per token, carrying surface form and features.
        """
        ...

    def nbest(self, text: str, num: int = 10) -> str:
        """Produce the ``num`` best segmentations as one formatted string.

        Args:
            text: Japanese text to segment.
            num: How many candidate segmentations to emit (default 10).

        Returns:
            The candidates concatenated in MeCab's output format.
        """
        ...

    def nbestToNodeList(self, text: str, num: int = 10) -> List[List[Node]]:
        """Produce the ``num`` best segmentations as node lists.

        Args:
            text: Japanese text to segment.
            num: How many candidate segmentations to emit (default 10).

        Returns:
            One inner list of Node objects per candidate segmentation.
        """
        ...

    @property
    def dictionary_info(self) -> List[Dict[str, Union[str, int]]]:
        """Metadata for every dictionary the tagger has loaded.

        Returns:
            One dict per dictionary with filename, charset, size,
            and version entries.
        """
        ...

Usage Examples

Basic Tokenization

from fugashi import Tagger

# Create a tagger backed by the default UniDic dictionary.
tagger = Tagger()

# Segment a Japanese sentence and print one token per line:
# surface form, lemma, and part of speech, tab-separated.
text = "私は学校に行きます。"
for node in tagger(text):
    print(f"{node.surface}\t{node.feature.lemma}\t{node.pos}")

Wakati Mode (Word Segmentation)

from fugashi import Tagger

# In wakati mode, parse() emits only the space-separated surface forms.
tagger = Tagger('-Owakati')

segmented = tagger.parse("私は学校に行きます。")
print(segmented)  # "私 は 学校 に 行き ます 。"

N-Best Parsing

from fugashi import Tagger

tagger = Tagger()
text = "外国人参政権"

# Retrieve the three highest-scoring segmentations of an ambiguous compound
# and print each one as space-joined surface forms.
for rank, alternative in enumerate(tagger.nbestToNodeList(text, 3), start=1):
    print(f"Candidate {rank}: {' '.join(node.surface for node in alternative)}")

Generic Dictionary Usage

from fugashi import GenericTagger

# Uses whichever dictionary MeCab finds by default; to use a specific
# dictionary such as IPAdic, pass it in args, e.g. GenericTagger('-d /path/to/ipadic')
tagger = GenericTagger()

text = "今日は良い天気です。"
nodes = tagger(text)

for node in nodes:
    # Access features by index (varies by dictionary)
    print(f"{node.surface}\t{node.feature[0]}\t{node.feature[1]}")

Custom Feature Wrapper

from fugashi import GenericTagger, create_feature_wrapper

# Build a namedtuple-style wrapper so feature columns get readable names
# instead of positional indices.
CustomFeatures = create_feature_wrapper(
    'CustomFeatures',
    ['pos1', 'pos2', 'pos3', 'pos4', 'inflection', 'conjugation', 'base_form'],
)

# Attach the wrapper to a generic tagger.
tagger = GenericTagger(wrapper=CustomFeatures)

for node in tagger("走っている"):
    print(f"Surface: {node.surface}")
    print(f"POS: {node.feature.pos1}")
    print(f"Base form: {node.feature.base_form}")

Install with Tessl CLI

npx tessl i tessl/pypi-fugashi

docs

dictionary-management.md

index.md

nodes-features.md

tokenization.md

tile.json