tessl/pypi-fugashi

Cython MeCab wrapper for fast, pythonic Japanese tokenization and morphological analysis.

—

Pending

Overview

Eval results

Files

Nodes and Features

Name: tessl/pypi-fugashi
Author: tessl

Token representation and morphological feature access that provides detailed linguistic information for each tokenized element. These classes and structures enable comprehensive analysis of Japanese text morphology.

Capabilities

Node Class

Base node class representing a single token from MeCab tokenization with access to surface forms, morphological features, and metadata.

class Node:
    """A single token produced by MeCab tokenization.

    This is an interface stub: every member body is elided with ``...``.
    """

    @property
    def surface(self) -> str:
        """Surface form of the token, i.e. the text as it appears in the input.

        Returns:
            The surface string of the token
        """
        ...
    
    @surface.setter
    def surface(self, value: str) -> None:
        """Replace the stored surface form of the token.

        Args:
            value: New surface form string
        """
        ...
    
    @property
    def feature(self) -> NamedTuple:
        """Dictionary feature data for the token, parsed into a named tuple.

        Returns:
            Named tuple of morphological features; the set of fields depends
            on the dictionary in use
        """
        ...
    
    @property
    def feature_raw(self) -> str:
        """Unparsed feature string exactly as MeCab reports it.

        Returns:
            Comma-separated feature string as returned by MeCab
        """
        ...
    
    @property
    def length(self) -> int:
        """Length of the surface form, measured in bytes (not characters).

        Returns:
            Byte length of the token surface
        """
        ...
    
    @property
    def rlength(self) -> int:
        """Length in bytes of the surface form plus its attached whitespace.

        MeCab defines this value as the surface length including the
        whitespace *before* the morpheme, so ``rlength - length`` is the
        byte length of the whitespace preceding the token.

        Returns:
            Total byte length including the attached whitespace
        """
        ...
    
    @property
    def posid(self) -> int:
        """Part-of-speech ID assigned by MeCab.

        Returns:
            Numeric POS identifier
        """
        ...
    
    @property
    def char_type(self) -> int:
        """Character type classification assigned by MeCab.

        Returns:
            Numeric character type code
        """
        ...
    
    @property
    def stat(self) -> int:
        """Node status code from MeCab.

        Returns:
            Status code: 0=normal, 1=unknown, 2=BOS (beginning of sentence),
            3=EOS (end of sentence). MeCab additionally defines 4=EON (end of
            N-best enumeration).
        """
        ...
    
    @property
    def is_unk(self) -> bool:
        """Whether the token is unknown to the dictionary.

        Returns:
            True if the token was not found in the dictionary
        """
        ...
    
    @property
    def white_space(self) -> str:
        """Whitespace attached to this token.

        NOTE(review): MeCab attaches whitespace to the token that follows it
        (see ``rlength``), so this is presumably the whitespace *preceding*
        the token in the input rather than following it -- confirm against
        the installed fugashi version.

        Returns:
            String containing the attached whitespace
        """
        ...
    
    @white_space.setter
    def white_space(self, value: str) -> None:
        """Replace the whitespace string attached to this token.

        Args:
            value: Whitespace string
        """
        ...
    
    def __repr__(self) -> str:
        """String representation of the node.

        Returns:
            Surface form or special markers for BOS/EOS nodes
        """
        ...

UnidicNode Class

UniDic-specific node class that extends Node with additional convenience methods for UniDic dictionary features.

class UnidicNode(Node):
    """Node variant for UniDic dictionaries; adds a combined POS accessor.

    This is an interface stub: the member body is elided with ``...``.
    """

    @property
    def pos(self) -> str:
        """The four part-of-speech fields joined into one comma-separated string.

        Returns:
            POS string in format "pos1,pos2,pos3,pos4"
        """
        ...

UniDic Feature Structures

Named tuple structures providing structured access to UniDic dictionary features across different schema versions.

class UnidicFeatures17(NamedTuple):
    """Feature record for the 17-field UniDic schema, in column order."""

    # Part of speech, from the coarsest level to the finest.
    pos1: str
    pos2: str
    pos3: str
    pos4: str
    # Conjugation type and conjugation form.
    cType: str
    cForm: str
    # Lemma: its reading, then the lemma (dictionary form) itself.
    lForm: str
    lemma: str
    # Orthographic form and pronunciation as they occur in the text.
    orth: str
    pron: str
    # Base (unconjugated) orthographic form and pronunciation.
    orthBase: str
    pronBase: str
    # Word origin classification.
    goshu: str
    # Word-initial sound change: type and form ("i" = initial).
    iType: str
    iForm: str
    # Word-final sound change: type and form ("f" = final).
    fType: str
    fForm: str

class UnidicFeatures26(NamedTuple):
    """Feature record for the 26-field UniDic schema, in column order."""

    # Part of speech, from the coarsest level to the finest.
    pos1: str
    pos2: str
    pos3: str
    pos4: str
    # Conjugation type and conjugation form.
    cType: str
    cForm: str
    # Lemma: its reading, then the lemma (dictionary form) itself.
    lForm: str
    lemma: str
    # Orthographic form and pronunciation as they occur in the text.
    orth: str
    pron: str
    # Base (unconjugated) orthographic form and pronunciation.
    orthBase: str
    pronBase: str
    # Word origin classification.
    goshu: str
    # Word-initial and word-final sound change: type and form.
    iType: str
    iForm: str
    fType: str
    fForm: str
    # Kana representation and its base form.
    kana: str
    kanaBase: str
    # Word form as it appears, and its base form.
    form: str
    formBase: str
    # Connection type for the word-initial / word-final change.
    iConType: str
    fConType: str
    # Accent information: type, connection type, mode type.
    aType: str
    aConType: str
    aModeType: str

class UnidicFeatures29(NamedTuple):
    """Feature record for the 29-field UniDic schema, in column order."""

    # Part of speech, from the coarsest level to the finest.
    pos1: str
    pos2: str
    pos3: str
    pos4: str
    # Conjugation type and conjugation form.
    cType: str
    cForm: str
    # Lemma: its reading, then the lemma (dictionary form) itself.
    lForm: str
    lemma: str
    # Orthographic form and pronunciation as they occur in the text.
    orth: str
    pron: str
    # Base (unconjugated) orthographic form and pronunciation.
    orthBase: str
    pronBase: str
    # Word origin classification.
    goshu: str
    # Word-initial and word-final sound change: type and form.
    iType: str
    iForm: str
    fType: str
    fForm: str
    # Connection type for the word-initial / word-final change.
    iConType: str
    fConType: str
    # Field named "type" in the dictionary (shadows nothing here: it is only
    # a tuple field name).
    type: str
    # Kana representation and its base form.
    kana: str
    kanaBase: str
    # Word form as it appears, and its base form.
    form: str
    formBase: str
    # Accent information: type, connection type, modification type.
    aType: str
    aConType: str
    aModType: str
    # Lexicon ID and lemma ID.
    lid: str
    lemma_id: str

Usage Examples

Basic Node Access

from fugashi import Tagger

tagger = Tagger()
sentence = "美しい花が咲いている。"

# Calling the tagger on a string yields the tokens; print the most commonly
# needed attributes of each one, separated by a divider line.
for token in tagger(sentence):
    print(f"Surface: {token.surface}")
    print(f"Lemma: {token.feature.lemma}")
    print(f"POS: {token.pos}")
    print(f"Is unknown: {token.is_unk}")
    print(f"Length: {token.length}")
    print("---")

Feature Access by Schema

from fugashi import Tagger, UnidicFeatures17

tagger = Tagger()
phrase = "走っている"

for token in tagger(phrase):
    # The concrete named-tuple type of `feature` depends on the dictionary.
    feature = token.feature

    # Fields shared by every schema shown on this page
    print(f"Surface: {token.surface}")
    print(f"POS1: {feature.pos1}")        # Major POS category
    print(f"POS2: {feature.pos2}")        # Middle POS category
    print(f"Lemma: {feature.lemma}")      # Dictionary form
    print(f"Reading: {feature.pron}")     # Pronunciation
    print(f"Inflection: {feature.cType}") # Conjugation type

    # Accent fields only exist in the larger schemas, so probe before use
    if hasattr(feature, 'aType'):
        print(f"Accent: {feature.aType}")

    print("---")

Working with Unknown Words

from fugashi import Tagger

tagger = Tagger()
sample = "日本語とmixedテキスト"  # Mixed Japanese and English

for token in tagger(sample):
    # Dictionary hits: show the surface together with its lemma and move on.
    if not token.is_unk:
        print(f"Known word: {token.surface} -> {token.feature.lemma}")
        continue

    # Anything left was not found in the dictionary.
    print(f"Unknown word: {token.surface}")
    print(f"Character type: {token.char_type}")

Whitespace and Text Reconstruction

from fugashi import Tagger

tagger = Tagger()
text = "これは\tタブ文字を\n含む文章です。"
nodes = tagger(text)

# Reconstruct original text with whitespace.
# MeCab attaches whitespace to the token that FOLLOWS it (rlength counts the
# whitespace before the morpheme), so each node's white_space must be placed
# in front of its surface, not after it.
reconstructed = "".join(node.white_space + node.surface for node in nodes)

print(f"Original: {repr(text)}")
print(f"Reconstructed: {repr(reconstructed)}")
# NOTE(review): whitespace at the very end of the input presumably belongs to
# the EOS node, which is not part of `nodes`; such input would not round-trip.
print(f"Match: {text == reconstructed}")

# Access specific whitespace
for i, node in enumerate(nodes):
    if node.white_space:
        print(f"Node {i} ({node.surface}) preceded by: {repr(node.white_space)}")

Raw Feature Analysis

from fugashi import Tagger

tagger = Tagger()
phrase = "複雑な文法情報"

# Show the unparsed MeCab feature string next to its parsed named tuple,
# plus the numeric POS ID and node status, for every token.
for token in tagger(phrase):
    print(f"Surface: {token.surface}")
    print(f"Raw features: {token.feature_raw}")
    print(f"Parsed features: {token.feature}")
    print(f"POS ID: {token.posid}")
    print(f"Node status: {token.stat}")
    print("---")

Node Status Handling

from fugashi import Tagger

tagger = Tagger()
text = "短い文。"
nodes = tagger.parseToNodeList(text)

# Human-readable labels for the status codes; built once, outside the loop,
# because the mapping never changes between nodes.
status_map = {0: "Normal", 1: "Unknown", 2: "BOS", 3: "EOS"}

# Note: BOS/EOS nodes are typically filtered out in parseToNodeList
# but are present in the raw MeCab node chain
for node in nodes:
    # Any code without a label (e.g. one not listed above) prints as 'Other'.
    print(f"{node.surface} (status: {status_map.get(node.stat, 'Other')})")

Install with Tessl CLI