CoNLL-U Parser parses a CoNLL-U formatted string into a nested python dictionary.

Install the capability:

npx @tessl/cli install tessl/pypi-conllu@6.0.0

CoNLL-U Parser parses a CoNLL-U formatted string into a nested python dictionary. CoNLL-U is often the output of natural language processing tasks. This library provides comprehensive parsing, tree conversion, filtering, and serialization capabilities for CoNLL-U data with zero dependencies and full typing support.

Install the package:

pip install conllu

Import the package:

import conllu

Common patterns for parsing:

from conllu import parse, parse_tree, parse_incr, parse_tree_incr

Import data models:

from conllu import Token, TokenList, TokenTree, SentenceList, Metadata

import conllu
# Parse CoNLL-U data into flat sentence list
data = """# text = The quick brown fox jumps
1 The the DET DT Definite=Def|PronType=Art 4 det _ _
2 quick quick ADJ JJ Degree=Pos 4 amod _ _
3 brown brown ADJ JJ Degree=Pos 4 amod _ _
4 fox fox NOUN NN Number=Sing 0 root _ _
"""
# Parse into flat list structure
sentences = conllu.parse(data)
print(f"Parsed {len(sentences)} sentences")
print(f"First sentence has {len(sentences[0])} tokens")
# Parse into tree structure
trees = conllu.parse_tree(data)
print(f"First tree root: {trees[0].token['form']}")
# Incremental parsing from file
with open('data.conllu', 'r') as f:
for sentence in conllu.parse_incr(f):
print(f"Sentence: {sentence.metadata.get('text', 'No text')}")
# Filter and serialize
filtered = sentences[0].filter(upos='NOUN')
conllu_output = filtered.serialize()Primary parsing functions that convert CoNLL-U formatted strings into Python data structures. These functions support custom field definitions and custom parsing logic.
def parse(
    data: str,
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> SentenceList:
    """
    Parse CoNLL-U formatted string into a SentenceList (flat list parsing).

    Args:
        data: CoNLL-U formatted string
        fields: Field names to use (defaults to DEFAULT_FIELDS)
        field_parsers: Custom parsers for specific fields
        metadata_parsers: Custom parsers for metadata lines

    Returns:
        SentenceList containing parsed sentences
    """
def parse_incr(
    in_file: TextIO,
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> SentenceGenerator:
    """
    Incremental parsing from file/stream into SentenceGenerator for memory efficiency.

    Args:
        in_file: File-like object to read from
        fields: Field names to use (defaults to DEFAULT_FIELDS)
        field_parsers: Custom parsers for specific fields
        metadata_parsers: Custom parsers for metadata lines

    Returns:
        SentenceGenerator for iterating over parsed sentences
    """
def parse_tree(data: str) -> List[TokenTree]:
    """
    Parse CoNLL-U formatted string into tree structure.

    Args:
        data: CoNLL-U formatted string

    Returns:
        List of TokenTree objects representing dependency trees
    """
def parse_tree_incr(in_file: TextIO) -> Iterator[TokenTree]:
"""
Incremental tree parsing from file/stream.
Args:
in_file: File-like object to read from
Returns:
Iterator of TokenTree objects
"""Core data structures for representing CoNLL-U data with built-in methods for manipulation, filtering, and conversion.
class SentenceList(List[TokenList]):
    """
    List of sentences (TokenList objects) with metadata support.
    """

    def __init__(
        self,
        sentences: Optional[Iterable[TokenList]] = None,
        metadata: Optional[Metadata] = None
    ): ...

    # Document-level metadata shared by the sentences in this list.
    metadata: Metadata
class TokenList(List[Token]):
    """
    List of tokens representing a sentence with metadata and filtering capabilities.
    """

    def __init__(
        self,
        tokens: Optional[Iterable[Token]] = None,
        metadata: Optional[Metadata] = None,
        default_fields: Optional[Iterable[str]] = None
    ): ...

    # Sentence-level metadata parsed from '#' comment lines.
    metadata: Metadata
    # Field names used when serializing tokens that lack explicit fields.
    default_fields: Optional[Iterable[str]]

    def to_tree(self) -> TokenTree:
        """Convert token list to tree structure based on head dependencies."""

    def filter(self, **kwargs: Any) -> TokenList:
        """Filter tokens based on field conditions using exact match or callable."""

    def serialize(self) -> str:
        """Serialize TokenList back to CoNLL-U format."""

    @staticmethod
    def head_to_token(sentence: TokenList) -> Dict[int, List[Token]]:
        """Create head-to-children mapping for tree construction."""
class TokenTree:
    """
    Tree representation of tokens with parent-child relationships.
    """

    def __init__(
        self,
        token: Token,
        children: List[TokenTree],
        metadata: Optional[Metadata] = None
    ): ...

    # The token at this tree node.
    token: Token
    # Subtrees whose tokens have this node's token as their head.
    children: List[TokenTree]
    # Sentence metadata; set on the root of the tree.
    metadata: Optional[Metadata]

    def to_list(self) -> TokenList:
        """Flatten tree back to token list."""

    def serialize(self) -> str:
        """Serialize tree to CoNLL-U format."""

    def print_tree(
        self,
        depth: int = 0,
        indent: int = 4,
        exclude_fields: Sequence[str] = DEFAULT_EXCLUDE_FIELDS
    ) -> None:
        """Print tree structure to console."""

    def set_metadata(self, metadata: Optional[Metadata]) -> None:
        """Set metadata for the tree."""
class Token(dict):
    """
    Dictionary representing a single token with field mappings and aliases.
    """

    MAPPING: Dict[str, str]  # Field name aliases (upos<->upostag, xpos<->xpostag)

    def get(self, key: str, default: Optional[Any] = None) -> Any:
        """Get field value with automatic alias resolution."""
class Metadata(dict):
    """
    Dictionary for storing sentence/document metadata from comment lines.
    """
class SentenceGenerator(Iterable[TokenList]):
"""
Iterator for incremental sentence processing to handle large files efficiently.
"""
def __init__(
self,
sentences: Iterator[TokenList],
metadata: Optional[Metadata] = None
): ...
sentences: Iterator[TokenList]
metadata: MetadataLow-level parsing functions and serialization utilities for custom parsing scenarios and advanced usage.
def parse_sentences(in_file: TextIO) -> Iterator[str]:
    """
    Split input stream into individual sentence strings.

    Args:
        in_file: File-like object to read from

    Returns:
        Iterator of sentence strings (raw CoNLL-U blocks)
    """
def parse_token_and_metadata(
    data: str,
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> TokenList:
    """
    Parse single sentence data into TokenList with metadata.

    Args:
        data: Single sentence CoNLL-U data
        fields: Field names to use
        field_parsers: Custom field parsers
        metadata_parsers: Custom metadata parsers

    Returns:
        TokenList representing the sentence
    """
def serialize(tokenlist: TokenList) -> str:
    """
    Serialize TokenList to CoNLL-U format string.

    Args:
        tokenlist: TokenList to serialize

    Returns:
        CoNLL-U formatted string
    """
def serialize_field(field: Any) -> str:
"""
Serialize individual field value to string representation.
Args:
field: Field value to serialize
Returns:
String representation suitable for CoNLL-U format
"""Specialized functions for parsing individual CoNLL-U field types with proper validation and type conversion.
def parse_line(
    line: str,
    fields: Sequence[str],
    field_parsers: Optional[Dict[str, Callable[[List[str], int], Any]]] = None
) -> Token:
    """
    Parse single token line into Token object.

    Args:
        line: Single token line from CoNLL-U data
        fields: Field names for the columns
        field_parsers: Custom parsers for specific fields

    Returns:
        Token object representing the parsed line
    """
def parse_comment_line(
    line: str,
    metadata_parsers: Optional[Dict[str, Callable[[str, Optional[str]], Any]]] = None
) -> List[Tuple[str, Optional[str]]]:
    """
    Parse metadata comment line into key-value pairs.

    Args:
        line: Comment line starting with '#'
        metadata_parsers: Custom metadata parsers

    Returns:
        List of (key, value) tuples from the comment
    """
def parse_int_value(value: str) -> Optional[int]:
    """
    Parse integer field values, handling '_' as None.

    Args:
        value: String value to parse

    Returns:
        Parsed integer or None for '_'
    """
def parse_id_value(value: str) -> Optional[Union[int, Tuple[int, str, int]]]:
    """
    Parse ID field supporting single IDs, ranges, and decimal IDs.

    Args:
        value: ID field value

    Returns:
        Parsed ID as int, tuple for ranges/decimals, or None
    """
def parse_dict_value(value: str) -> Optional[Dict[str, Optional[str]]]:
    """
    Parse feature dictionaries from pipe-separated key=value pairs.

    Args:
        value: Feature string (e.g., "Case=Nom|Number=Sing")

    Returns:
        Dictionary of features or None for '_'
    """
def parse_nullable_value(value: str) -> Optional[str]:
    """
    Parse nullable string values, converting '_' to None.

    Args:
        value: String value to parse

    Returns:
        String value or None for empty/'_' values
    """
def parse_paired_list_value(value: str) -> Union[Optional[str], List[Tuple[str, Optional[Union[int, Tuple[int, str, int]]]]]]:
    """
    Parse dependency relations from dependency field values.

    Args:
        value: Dependency field value (e.g., "4:nsubj|5:conj")

    Returns:
        List of (relation, head_id) tuples or None for '_'
    """
def parse_pair_value(value: str) -> Tuple[str, Optional[str]]:
"""
Parse key=value pairs, splitting on the first '=' character.
Args:
value: String potentially containing key=value pair
Returns:
Tuple of (key, value) where value is None if no '=' found
"""Helper functions for advanced data manipulation and tree traversal.
# Type variable for the mapping's value type (was referenced but never declared).
T = TypeVar("T")

def traverse_dict(obj: Mapping[str, T], query: str) -> Optional[T]:
    """
    Navigate nested dictionaries using '__' separated query strings.

    Args:
        obj: Dictionary-like object to traverse
        query: Query string with '__' separators (e.g., 'feats__Case')

    Returns:
        Value at query path or None if path doesn't exist
    """

# Type aliases for function signatures
FieldParserType = Callable[[List[str], int], Any]
MetadataParserType = Callable[[str, Optional[str]], Any]
IdType = Union[int, Tuple[int, str, int]]

# Default field configuration
DEFAULT_FIELDS: Tuple[str, ...] = (
    'id', 'form', 'lemma', 'upos', 'xpos', 'feats',
    'head', 'deprel', 'deps', 'misc'
)

# Fields without an entry here are kept as plain strings.
DEFAULT_FIELD_PARSERS: Dict[str, FieldParserType] = {
    "id": parse_id_value,
    "xpos": parse_nullable_value,
    "feats": parse_dict_value,
    "head": parse_int_value,
    "deps": parse_paired_list_value,
    "misc": parse_dict_value,
}

DEFAULT_METADATA_PARSERS: Dict[str, MetadataParserType] = {
    "newpar": lambda key, value: (key, value),
    "newdoc": lambda key, value: (key, value),
}

# Fields omitted by TokenTree.print_tree by default.
DEFAULT_EXCLUDE_FIELDS: Tuple[str, ...] = (
    'id', 'deprel', 'xpos', 'feats', 'head', 'deps', 'misc'
)

class ParseException(Exception):
    """
    Exception raised for parsing errors in CoNLL-U data.

    Raised when:
    - Invalid line format (missing tabs/spaces)
    - Invalid field values
    - Tree construction failures
    - Invalid comment format
    """

import conllu
# Define custom parser for a non-standard field
def parse_custom_field(line_parts, field_index):
    value = line_parts[field_index]
    if value == '_':
        return None
    return value.upper()  # Custom transformation

# Use custom parser
custom_parsers = {'misc': parse_custom_field}
sentences = conllu.parse(data, field_parsers=custom_parsers)

# Filter tokens by part-of-speech
nouns = sentence.filter(upos='NOUN')

# Filter using callable for complex conditions
def is_long_word(form):
    return len(form) > 5

long_words = sentence.filter(form=is_long_word)

# Navigate nested features
adjectives = sentence.filter(feats__Degree='Pos')

# Convert to tree and traverse
tree = sentence.to_tree()
print(f"Root: {tree.token['form']}")

# Print tree structure
tree.print_tree(indent=2)

# Convert back to flat list
flat_sentence = tree.to_list()

# Process large files efficiently
with open('large_corpus.conllu', 'r') as f:
    for sentence in conllu.parse_incr(f):
        # Process each sentence individually
        words = [token['form'] for token in sentence]
        print(' '.join(words))