or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

index.md
tile.json

tessl/pypi-conllu

CoNLL-U Parser parses a CoNLL-U formatted string into a nested Python dictionary

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypipkg:pypi/conllu@6.0.x

To install, run

npx @tessl/cli install tessl/pypi-conllu@6.0.0

index.md docs/

CoNLL-U Parser

CoNLL-U Parser parses a CoNLL-U formatted string into a nested Python dictionary. CoNLL-U is often the output of natural language processing tasks. This library provides comprehensive parsing, tree conversion, filtering, and serialization capabilities for CoNLL-U data with zero dependencies and full typing support.

Package Information

  • Package Name: conllu
  • Package Type: pypi
  • Language: Python
  • Installation: pip install conllu
  • Requirements: Python 3.8+
  • Dependencies: None (zero dependencies)

Core Imports

import conllu

Common patterns for parsing:

from conllu import parse, parse_tree, parse_incr, parse_tree_incr

Import data models:

from conllu import Token, TokenList, TokenTree, SentenceList, Metadata

Basic Usage

import conllu

# Parse CoNLL-U data into flat sentence list
data = """# text = The quick brown fox jumps
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det	_	_
2	quick	quick	ADJ	JJ	Degree=Pos	4	amod	_	_
3	brown	brown	ADJ	JJ	Degree=Pos	4	amod	_	_
4	fox	fox	NOUN	NN	Number=Sing	0	root	_	_
"""

# Parse into flat list structure
sentences = conllu.parse(data)
print(f"Parsed {len(sentences)} sentences")
print(f"First sentence has {len(sentences[0])} tokens")

# Parse into tree structure
trees = conllu.parse_tree(data)
print(f"First tree root: {trees[0].token['form']}")

# Incremental parsing from file
with open('data.conllu', 'r') as f:
    for sentence in conllu.parse_incr(f):
        print(f"Sentence: {sentence.metadata.get('text', 'No text')}")
        
# Filter and serialize
filtered = sentences[0].filter(upos='NOUN')
conllu_output = filtered.serialize()

Capabilities

Core Parsing Functions

Primary parsing functions that convert CoNLL-U formatted strings into Python data structures. These functions support custom field definitions and custom parsing logic.

def parse(
    data: str, 
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> SentenceList:
    """
    Parse CoNLL-U formatted string into a SentenceList (flat list parsing).
    
    Args:
        data: CoNLL-U formatted string
        fields: Field names to use (defaults to DEFAULT_FIELDS)
        field_parsers: Custom parsers for specific fields
        metadata_parsers: Custom parsers for metadata lines
        
    Returns:
        SentenceList containing parsed sentences
    """

def parse_incr(
    in_file: TextIO, 
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> SentenceGenerator:
    """
    Incremental parsing from file/stream into SentenceGenerator for memory efficiency.
    
    Args:
        in_file: File-like object to read from
        fields: Field names to use (defaults to DEFAULT_FIELDS)
        field_parsers: Custom parsers for specific fields  
        metadata_parsers: Custom parsers for metadata lines
        
    Returns:
        SentenceGenerator for iterating over parsed sentences
    """

def parse_tree(data: str) -> List[TokenTree]:
    """
    Parse CoNLL-U formatted string into tree structure.
    
    Args:
        data: CoNLL-U formatted string
        
    Returns:
        List of TokenTree objects representing dependency trees
    """

def parse_tree_incr(in_file: TextIO) -> Iterator[TokenTree]:
    """
    Incremental tree parsing from file/stream.
    
    Args:
        in_file: File-like object to read from
        
    Returns:
        Iterator of TokenTree objects
    """

Data Models

Core data structures for representing CoNLL-U data with built-in methods for manipulation, filtering, and conversion.

class SentenceList(List[TokenList]):
    """
    List of sentences (TokenList objects) with metadata support.
    """
    def __init__(
        self, 
        sentences: Optional[Iterable[TokenList]] = None,
        metadata: Optional[Metadata] = None
    ): ...
    
    metadata: Metadata

class TokenList(List[Token]):
    """
    List of tokens representing a sentence with metadata and filtering capabilities.
    """
    def __init__(
        self,
        tokens: Optional[Iterable[Token]] = None,
        metadata: Optional[Metadata] = None,
        default_fields: Optional[Iterable[str]] = None
    ): ...
    
    metadata: Metadata
    default_fields: Optional[Iterable[str]]
    
    def to_tree(self) -> TokenTree:
        """Convert token list to tree structure based on head dependencies."""
        
    def filter(self, **kwargs: Any) -> TokenList:
        """Filter tokens based on field conditions using exact match or callable."""
        
    def serialize(self) -> str:
        """Serialize TokenList back to CoNLL-U format."""
        
    @staticmethod
    def head_to_token(sentence: TokenList) -> Dict[int, List[Token]]:
        """Create head-to-children mapping for tree construction."""

class TokenTree:
    """
    Tree representation of tokens with parent-child relationships.
    """
    def __init__(
        self, 
        token: Token, 
        children: List[TokenTree], 
        metadata: Optional[Metadata] = None
    ): ...
    
    token: Token
    children: List[TokenTree]
    metadata: Optional[Metadata]
    
    def to_list(self) -> TokenList:
        """Flatten tree back to token list."""
        
    def serialize(self) -> str:
        """Serialize tree to CoNLL-U format."""
        
    def print_tree(
        self, 
        depth: int = 0, 
        indent: int = 4,
        exclude_fields: Sequence[str] = DEFAULT_EXCLUDE_FIELDS
    ) -> None:
        """Print tree structure to console."""
        
    def set_metadata(self, metadata: Optional[Metadata]) -> None:
        """Set metadata for the tree."""

class Token(dict):
    """
    Dictionary representing a single token with field mappings and aliases.
    """
    MAPPING: Dict[str, str]  # Field name aliases (upos<->upostag, xpos<->xpostag)
    
    def get(self, key: str, default: Optional[Any] = None) -> Any:
        """Get field value with automatic alias resolution."""

class Metadata(dict):
    """
    Dictionary for storing sentence/document metadata from comment lines.
    """

class SentenceGenerator(Iterable[TokenList]):
    """
    Iterator for incremental sentence processing to handle large files efficiently.
    """
    def __init__(
        self,
        sentences: Iterator[TokenList],
        metadata: Optional[Metadata] = None
    ): ...
    
    sentences: Iterator[TokenList]
    metadata: Metadata

Parsing and Serialization Utilities

Low-level parsing functions and serialization utilities for custom parsing scenarios and advanced usage.

def parse_sentences(in_file: TextIO) -> Iterator[str]:
    """
    Split input stream into individual sentence strings.
    
    Args:
        in_file: File-like object to read from
        
    Returns:
        Iterator of sentence strings (raw CoNLL-U blocks)
    """

def parse_token_and_metadata(
    data: str, 
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> TokenList:
    """
    Parse single sentence data into TokenList with metadata.
    
    Args:
        data: Single sentence CoNLL-U data
        fields: Field names to use
        field_parsers: Custom field parsers
        metadata_parsers: Custom metadata parsers
        
    Returns:
        TokenList representing the sentence
    """

def serialize(tokenlist: TokenList) -> str:
    """
    Serialize TokenList to CoNLL-U format string.
    
    Args:
        tokenlist: TokenList to serialize
        
    Returns:
        CoNLL-U formatted string
    """

def serialize_field(field: Any) -> str:
    """
    Serialize individual field value to string representation.
    
    Args:
        field: Field value to serialize
        
    Returns:
        String representation suitable for CoNLL-U format
    """

Field Parsing Functions

Specialized functions for parsing individual CoNLL-U field types with proper validation and type conversion.

def parse_line(
    line: str,
    fields: Sequence[str], 
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None
) -> Token:
    """
    Parse single token line into Token object.
    
    Args:
        line: Single token line from CoNLL-U data
        fields: Field names for the columns
        field_parsers: Custom parsers for specific fields
        
    Returns:
        Token object representing the parsed line
    """

def parse_comment_line(
    line: str,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> List[Tuple[str, Optional[str]]]:
    """
    Parse metadata comment line into key-value pairs.
    
    Args:
        line: Comment line starting with '#'
        metadata_parsers: Custom metadata parsers
        
    Returns:
        List of (key, value) tuples from the comment
    """

def parse_int_value(value: str) -> Optional[int]:
    """
    Parse integer field values, handling '_' as None.
    
    Args:
        value: String value to parse
        
    Returns:
        Parsed integer or None for '_'
    """

def parse_id_value(value: str) -> Optional[Union[int, Tuple[int, str, int]]]:
    """
    Parse ID field supporting single IDs, ranges, and decimal IDs.
    
    Args:
        value: ID field value
        
    Returns:
        Parsed ID as int, tuple for ranges/decimals, or None
    """

def parse_dict_value(value: str) -> Optional[Dict[str, Optional[str]]]:
    """
    Parse feature dictionaries from pipe-separated key=value pairs.
    
    Args:
        value: Feature string (e.g., "Case=Nom|Number=Sing")
        
    Returns:
        Dictionary of features or None for '_'
    """

def parse_nullable_value(value: str) -> Optional[str]:
    """
    Parse nullable string values, converting '_' to None.
    
    Args:
        value: String value to parse
        
    Returns:
        String value or None for empty/'_' values
    """

def parse_paired_list_value(value: str) -> Union[Optional[str], List[Tuple[str, Optional[Union[int, Tuple[int, str, int]]]]]]:
    """
    Parse dependency relations from dependency field values.
    
    Args:
        value: Dependency field value (e.g., "4:nsubj|5:conj")
        
    Returns:
        List of (relation, head_id) tuples or None for '_'
    """

def parse_pair_value(value: str) -> Tuple[str, Optional[str]]:
    """
    Parse key=value pairs, splitting on the first '=' character.
    
    Args:
        value: String potentially containing key=value pair
        
    Returns:
        Tuple of (key, value) where value is None if no '=' found
    """

Utility Functions

Helper functions for advanced data manipulation and tree traversal.

def traverse_dict(obj: Mapping[str, T], query: str) -> Optional[T]:
    """
    Navigate nested dictionaries using '__' separated query strings.
    
    Args:
        obj: Dictionary-like object to traverse
        query: Query string with '__' separators (e.g., 'feats__Case')
        
    Returns:
        Value at query path or None if path doesn't exist
    """

Types

# Type aliases for function signatures
FieldParserType = Callable[[List[str], int], Any]
MetadataParserType = Callable[[str, Optional[str]], Any]
IdType = Union[int, Tuple[int, str, int]]

# Default field configuration
DEFAULT_FIELDS: Tuple[str, ...] = (
    'id', 'form', 'lemma', 'upos', 'xpos', 'feats', 
    'head', 'deprel', 'deps', 'misc'
)

DEFAULT_FIELD_PARSERS: Dict[str, FieldParserType] = {
    "id": parse_id_value,
    "xpos": parse_nullable_value,
    "feats": parse_dict_value,
    "head": parse_int_value,
    "deps": parse_paired_list_value,
    "misc": parse_dict_value,
}

DEFAULT_METADATA_PARSERS: Dict[str, MetadataParserType] = {
    "newpar": lambda key, value: (key, value),
    "newdoc": lambda key, value: (key, value),
}

DEFAULT_EXCLUDE_FIELDS: Tuple[str, ...] = (
    'id', 'deprel', 'xpos', 'feats', 'head', 'deps', 'misc'
)

Exceptions

class ParseException(Exception):
    """
    Exception raised for parsing errors in CoNLL-U data.
    
    Raised when:
    - Invalid line format (missing tabs/spaces)
    - Invalid field values
    - Tree construction failures
    - Invalid comment format
    """

Advanced Usage Examples

Custom Field Parsing

import conllu

# Define custom parser for a non-standard field
def parse_custom_field(line_parts, field_index):
    value = line_parts[field_index]
    if value == '_':
        return None
    return value.upper()  # Custom transformation

# Use custom parser
custom_parsers = {'misc': parse_custom_field}
sentences = conllu.parse(data, field_parsers=custom_parsers)

Filtering and Analysis

# Filter tokens by part-of-speech
nouns = sentence.filter(upos='NOUN')

# Filter using callable for complex conditions
def is_long_word(form):
    return len(form) > 5

long_words = sentence.filter(form=is_long_word)

# Navigate nested features
adjectives = sentence.filter(feats__Degree='Pos')

Tree Operations

# Convert to tree and traverse
tree = sentence.to_tree()
print(f"Root: {tree.token['form']}")

# Print tree structure
tree.print_tree(indent=2)

# Convert back to flat list
flat_sentence = tree.to_list()

Incremental Processing

# Process large files efficiently
with open('large_corpus.conllu', 'r') as f:
    for sentence in conllu.parse_incr(f):
        # Process each sentence individually
        words = [token['form'] for token in sentence]
        print(' '.join(words))