CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-conllu

CoNLL-U Parser parses a CoNLL-U formatted string into a nested Python dictionary

Pending
Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Security by Snyk

Pending

The risk profile of this skill

Overview
Eval results
Files

CoNLL-U Parser

CoNLL-U Parser parses a CoNLL-U formatted string into a nested Python dictionary. CoNLL-U is often the output of natural language processing tasks. This library provides comprehensive parsing, tree conversion, filtering, and serialization capabilities for CoNLL-U data with zero dependencies and full typing support.

Package Information

  • Package Name: conllu
  • Package Type: pypi
  • Language: Python
  • Installation: pip install conllu
  • Requirements: Python 3.8+
  • Dependencies: None (zero dependencies)

Core Imports

import conllu

Common patterns for parsing:

from conllu import parse, parse_tree, parse_incr, parse_tree_incr

Import data models:

from conllu import Token, TokenList, TokenTree, SentenceList, Metadata

Basic Usage

import conllu

# Parse CoNLL-U data into flat sentence list
data = """# text = The quick brown fox jumps
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det	_	_
2	quick	quick	ADJ	JJ	Degree=Pos	4	amod	_	_
3	brown	brown	ADJ	JJ	Degree=Pos	4	amod	_	_
4	fox	fox	NOUN	NN	Number=Sing	0	root	_	_
"""

# Parse into flat list structure
sentences = conllu.parse(data)
print(f"Parsed {len(sentences)} sentences")
print(f"First sentence has {len(sentences[0])} tokens")

# Parse into tree structure
trees = conllu.parse_tree(data)
print(f"First tree root: {trees[0].token['form']}")

# Incremental parsing from file
with open('data.conllu', 'r') as f:
    for sentence in conllu.parse_incr(f):
        print(f"Sentence: {sentence.metadata.get('text', 'No text')}")
        
# Filter and serialize
filtered = sentences[0].filter(upos='NOUN')
conllu_output = filtered.serialize()

Capabilities

Core Parsing Functions

Primary parsing functions that convert CoNLL-U formatted strings into Python data structures. These functions support custom field definitions and custom parsing logic.

def parse(
    data: str, 
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> SentenceList:
    """
    Parse CoNLL-U formatted string into a SentenceList (flat list parsing).
    
    Args:
        data: CoNLL-U formatted string
        fields: Field names to use (defaults to DEFAULT_FIELDS)
        field_parsers: Custom parsers for specific fields
        metadata_parsers: Custom parsers for metadata lines
        
    Returns:
        SentenceList containing parsed sentences
    """

def parse_incr(
    in_file: TextIO, 
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> SentenceGenerator:
    """
    Incremental parsing from file/stream into SentenceGenerator for memory efficiency.
    
    Args:
        in_file: File-like object to read from
        fields: Field names to use (defaults to DEFAULT_FIELDS)
        field_parsers: Custom parsers for specific fields  
        metadata_parsers: Custom parsers for metadata lines
        
    Returns:
        SentenceGenerator for iterating over parsed sentences
    """

def parse_tree(data: str) -> List[TokenTree]:
    """
    Parse CoNLL-U formatted string into tree structure.
    
    Args:
        data: CoNLL-U formatted string
        
    Returns:
        List of TokenTree objects representing dependency trees
    """

def parse_tree_incr(in_file: TextIO) -> Iterator[TokenTree]:
    """
    Incremental tree parsing from file/stream.
    
    Args:
        in_file: File-like object to read from
        
    Returns:
        Iterator of TokenTree objects
    """

Data Models

Core data structures for representing CoNLL-U data with built-in methods for manipulation, filtering, and conversion.

class SentenceList(List[TokenList]):
    """
    List of sentences (TokenList objects) with metadata support.
    """
    def __init__(
        self, 
        sentences: Optional[Iterable[TokenList]] = None,
        metadata: Optional[Metadata] = None
    ): ...
    
    metadata: Metadata

class TokenList(List[Token]):
    """
    List of tokens representing a sentence with metadata and filtering capabilities.
    """
    def __init__(
        self,
        tokens: Optional[Iterable[Token]] = None,
        metadata: Optional[Metadata] = None,
        default_fields: Optional[Iterable[str]] = None
    ): ...
    
    metadata: Metadata
    default_fields: Optional[Iterable[str]]
    
    def to_tree(self) -> TokenTree:
        """Convert token list to tree structure based on head dependencies."""
        
    def filter(self, **kwargs: Any) -> TokenList:
        """Filter tokens based on field conditions using exact match or callable."""
        
    def serialize(self) -> str:
        """Serialize TokenList back to CoNLL-U format."""
        
    @staticmethod
    def head_to_token(sentence: TokenList) -> Dict[int, List[Token]]:
        """Create head-to-children mapping for tree construction."""

class TokenTree:
    """
    Tree representation of tokens with parent-child relationships.
    """
    def __init__(
        self, 
        token: Token, 
        children: List[TokenTree], 
        metadata: Optional[Metadata] = None
    ): ...
    
    token: Token
    children: List[TokenTree]
    metadata: Optional[Metadata]
    
    def to_list(self) -> TokenList:
        """Flatten tree back to token list."""
        
    def serialize(self) -> str:
        """Serialize tree to CoNLL-U format."""
        
    def print_tree(
        self, 
        depth: int = 0, 
        indent: int = 4,
        exclude_fields: Sequence[str] = DEFAULT_EXCLUDE_FIELDS
    ) -> None:
        """Print tree structure to console."""
        
    def set_metadata(self, metadata: Optional[Metadata]) -> None:
        """Set metadata for the tree."""

class Token(dict):
    """
    Dictionary representing a single token with field mappings and aliases.
    """
    MAPPING: Dict[str, str]  # Field name aliases (upos<->upostag, xpos<->xpostag)
    
    def get(self, key: str, default: Optional[Any] = None) -> Any:
        """Get field value with automatic alias resolution."""

class Metadata(dict):
    """
    Dictionary for storing sentence/document metadata from comment lines.
    """

class SentenceGenerator(Iterable[TokenList]):
    """
    Iterator for incremental sentence processing to handle large files efficiently.
    """
    def __init__(
        self,
        sentences: Iterator[TokenList],
        metadata: Optional[Metadata] = None
    ): ...
    
    sentences: Iterator[TokenList]
    metadata: Metadata

Parsing and Serialization Utilities

Low-level parsing functions and serialization utilities for custom parsing scenarios and advanced usage.

def parse_sentences(in_file: TextIO) -> Iterator[str]:
    """
    Split input stream into individual sentence strings.
    
    Args:
        in_file: File-like object to read from
        
    Returns:
        Iterator of sentence strings (raw CoNLL-U blocks)
    """

def parse_token_and_metadata(
    data: str, 
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> TokenList:
    """
    Parse single sentence data into TokenList with metadata.
    
    Args:
        data: Single sentence CoNLL-U data
        fields: Field names to use
        field_parsers: Custom field parsers
        metadata_parsers: Custom metadata parsers
        
    Returns:
        TokenList representing the sentence
    """

def serialize(tokenlist: TokenList) -> str:
    """
    Serialize TokenList to CoNLL-U format string.
    
    Args:
        tokenlist: TokenList to serialize
        
    Returns:
        CoNLL-U formatted string
    """

def serialize_field(field: Any) -> str:
    """
    Serialize individual field value to string representation.
    
    Args:
        field: Field value to serialize
        
    Returns:
        String representation suitable for CoNLL-U format
    """

Field Parsing Functions

Specialized functions for parsing individual CoNLL-U field types with proper validation and type conversion.

def parse_line(
    line: str,
    fields: Sequence[str], 
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None
) -> Token:
    """
    Parse single token line into Token object.
    
    Args:
        line: Single token line from CoNLL-U data
        fields: Field names for the columns
        field_parsers: Custom parsers for specific fields
        
    Returns:
        Token object representing the parsed line
    """

def parse_comment_line(
    line: str,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> List[Tuple[str, Optional[str]]]:
    """
    Parse metadata comment line into key-value pairs.
    
    Args:
        line: Comment line starting with '#'
        metadata_parsers: Custom metadata parsers
        
    Returns:
        List of (key, value) tuples from the comment
    """

def parse_int_value(value: str) -> Optional[int]:
    """
    Parse integer field values, handling '_' as None.
    
    Args:
        value: String value to parse
        
    Returns:
        Parsed integer or None for '_'
    """

def parse_id_value(value: str) -> Optional[Union[int, Tuple[int, str, int]]]:
    """
    Parse ID field supporting single IDs, ranges, and decimal IDs.
    
    Args:
        value: ID field value
        
    Returns:
        Parsed ID as int, tuple for ranges/decimals, or None
    """

def parse_dict_value(value: str) -> Optional[Dict[str, Optional[str]]]:
    """
    Parse feature dictionaries from pipe-separated key=value pairs.
    
    Args:
        value: Feature string (e.g., "Case=Nom|Number=Sing")
        
    Returns:
        Dictionary of features or None for '_'
    """

def parse_nullable_value(value: str) -> Optional[str]:
    """
    Parse nullable string values, converting '_' to None.
    
    Args:
        value: String value to parse
        
    Returns:
        String value or None for empty/'_' values
    """

def parse_paired_list_value(value: str) -> Union[Optional[str], List[Tuple[str, Optional[Union[int, Tuple[int, str, int]]]]]]:
    """
    Parse dependency relations from dependency field values.
    
    Args:
        value: Dependency field value (e.g., "4:nsubj|5:conj")
        
    Returns:
        List of (relation, head_id) tuples or None for '_'
    """

def parse_pair_value(value: str) -> Tuple[str, Optional[str]]:
    """
    Parse key=value pairs, splitting on the first '=' character.
    
    Args:
        value: String potentially containing key=value pair
        
    Returns:
        Tuple of (key, value) where value is None if no '=' found
    """

Utility Functions

Helper functions for advanced data manipulation and tree traversal.

def traverse_dict(obj: Mapping[str, T], query: str) -> Optional[T]:
    """
    Navigate nested dictionaries using '__' separated query strings.
    
    Args:
        obj: Dictionary-like object to traverse
        query: Query string with '__' separators (e.g., 'feats__Case')
        
    Returns:
        Value at query path or None if path doesn't exist
    """

Types

# Type aliases for function signatures
FieldParserType = Callable[[List[str], int], Any]
MetadataParserType = Callable[[str, Optional[str]], Any]
IdType = Union[int, Tuple[int, str, int]]

# Default field configuration
DEFAULT_FIELDS: Tuple[str, ...] = (
    'id', 'form', 'lemma', 'upos', 'xpos', 'feats', 
    'head', 'deprel', 'deps', 'misc'
)

DEFAULT_FIELD_PARSERS: Dict[str, FieldParserType] = {
    "id": parse_id_value,
    "xpos": parse_nullable_value,
    "feats": parse_dict_value,
    "head": parse_int_value,
    "deps": parse_paired_list_value,
    "misc": parse_dict_value,
}

DEFAULT_METADATA_PARSERS: Dict[str, MetadataParserType] = {
    "newpar": lambda key, value: (key, value),
    "newdoc": lambda key, value: (key, value),
}

DEFAULT_EXCLUDE_FIELDS: Tuple[str, ...] = (
    'id', 'deprel', 'xpos', 'feats', 'head', 'deps', 'misc'
)

Exceptions

class ParseException(Exception):
    """
    Exception raised for parsing errors in CoNLL-U data.
    
    Raised when:
    - Invalid line format (missing tabs/spaces)
    - Invalid field values
    - Tree construction failures
    - Invalid comment format
    """

Advanced Usage Examples

Custom Field Parsing

import conllu

# Define custom parser for a non-standard field
def parse_custom_field(line_parts, field_index):
    value = line_parts[field_index]
    if value == '_':
        return None
    return value.upper()  # Custom transformation

# Use custom parser
custom_parsers = {'misc': parse_custom_field}
sentences = conllu.parse(data, field_parsers=custom_parsers)

Filtering and Analysis

# Filter tokens by part-of-speech
nouns = sentence.filter(upos='NOUN')

# Filter using callable for complex conditions
def is_long_word(form):
    return len(form) > 5

long_words = sentence.filter(form=is_long_word)

# Navigate nested features
adjectives = sentence.filter(feats__Degree='Pos')

Tree Operations

# Convert to tree and traverse
tree = sentence.to_tree()
print(f"Root: {tree.token['form']}")

# Print tree structure
tree.print_tree(indent=2)

# Convert back to flat list
flat_sentence = tree.to_list()

Incremental Processing

# Process large files efficiently
with open('large_corpus.conllu', 'r') as f:
    for sentence in conllu.parse_incr(f):
        # Process each sentence individually
        words = [token['form'] for token in sentence]
        print(' '.join(words))
Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypi — pkg:pypi/conllu@6.0.x
Publish Source
CLI
Badge
tessl/pypi-conllu badge