tessl/pypi-natsort

Simple yet flexible natural sorting in Python that enables developers to sort strings containing numbers in a natural, human-expected order rather than lexicographical order.

—

Pending

Overview

Eval results

Files

Utilities and Text Processing

Name: tessl/pypi-natsort
Author: tessl

Utility functions for text processing, decoding, regular expression selection, and command-line interface functionality. These functions provide supporting capabilities for the core natsort functionality.

Capabilities

Text Decoding Functions

Functions for handling text encoding and decoding, particularly useful when working with mixed string and bytes data.

def decoder(encoding: str):
    """
    Return a function that can be used to decode bytes to unicode.

    The returned callable is meant for data that may mix bytes and
    already-decoded strings: as the examples below show, a ``str``
    argument is handed back unchanged.

    Parameters:
    - encoding: str - The codec to use for decoding (must be a valid unicode codec)

    Returns:
    Callable - A single-argument function that decodes bytes using the supplied codec

    Examples:
    >>> decode_utf8 = decoder('utf8')
    >>> decode_utf8(b'hello') == 'hello'
    True
    >>> decode_utf8('already string') == 'already string'
    True
    """

def as_ascii(s: object) -> object:
    """
    Decode an input with the ASCII codec, or return it as-is.

    Useful as a ``key`` function when a collection mixes bytes and
    non-bytes values.

    Parameters:
    - s: object - Input to potentially decode

    Returns:
    object - Decoded string if the input was bytes, otherwise the input unchanged

    Examples:
    >>> as_ascii(b'hello')
    'hello'
    >>> as_ascii('hello')
    'hello'
    >>> as_ascii(123)
    123
    """

def as_utf8(s: object) -> object:
    """
    Decode an input with the UTF-8 codec, or return it as-is.

    Useful as a ``key`` function when a collection mixes bytes and
    non-bytes values.

    Parameters:
    - s: object - Input to potentially decode

    Returns:
    object - Decoded string if the input was bytes, otherwise the input unchanged

    Examples:
    >>> as_utf8(b'hello')
    'hello'
    >>> as_utf8('hello')
    'hello'
    >>> as_utf8(123)
    123
    """

Function Composition

Utility for chaining multiple single-argument functions together.

def chain_functions(functions):
    """
    Chain a list of single-argument functions together and return.

    Functions are applied in list order, with the output of each function
    passed as input to the next function.

    Parameters:
    - functions: Iterable[Callable] - Single-argument functions to chain, in
      the order they should be applied

    Returns:
    Callable - A single-argument function that applies all chained functions

    Examples:
    >>> funcs = [lambda x: x * 4, len, lambda x: x + 5]
    >>> chained = chain_functions(funcs)
    >>> chained('hey')  # 'hey' -> 'heyheyheyhey' -> 12 -> 17
    17
    """

Regular Expression Utilities

Function for selecting appropriate regular expressions for number matching based on algorithm settings.

def numeric_regex_chooser(alg) -> str:
    r"""
    Select an appropriate regex for the type of number of interest.

    Parameters:
    - alg: ns enum - Algorithm flags indicating the desired number type

    Returns:
    str - Regular expression string that matches the desired number type

    Examples:
    NOTE(review): the outputs below look abbreviated -- ``[unicode_digits]``
    and ``[unicode_numeric]`` appear to be placeholders for long character
    classes, and the outer parentheses may not be part of the real return
    value. Confirm against the actual return values before relying on them.

    >>> numeric_regex_chooser(ns.INT)
    r'(\d+|[unicode_digits])'
    >>> numeric_regex_chooser(ns.FLOAT | ns.SIGNED)
    r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|[unicode_numeric])'
    """

Command Line Interface

Main entry point for the natsort command-line tool.

def main(*arguments: str):
    r"""
    Performs a natural sort on entries given on the command-line.

    Entry point for the 'natsort' console script. Provides command-line
    access to natural sorting with various options for number types,
    filtering, and output formatting.

    Parameters:
    - *arguments: str - Command line arguments (uses sys.argv if not provided)

    Command Line Options:
    - -p, --paths: Interpret input as file paths
    - -f, --filter LOW HIGH: Keep entries with numbers in range
    - -F, --reverse-filter LOW HIGH: Exclude entries with numbers in range
    - -e, --exclude NUMBER: Exclude entries containing specific number
    - -r, --reverse: Return results in reversed order
    - -t, --number-type {int,float,real}: Choose number interpretation
    - -s, --sign: Consider +/- as part of numbers
    - --noexp: Don't parse scientific notation
    - -l, --locale: Use locale-aware sorting

    Examples:
    # Sort lines from stdin
    $ echo -e "item10\nitem2\nitem1" | natsort

    # Sort file paths
    $ natsort --paths file10.txt file2.txt file1.txt

    # Sort the lines of a file with real numbers and filtering
    # (positional arguments are the entries to sort, so the file's
    # contents are supplied on stdin)
    $ natsort --number-type real --filter -5 10 < data.txt
    """

Usage Examples

Text Decoding with Mixed Data

from natsort import natsorted, decoder, as_utf8, as_ascii

# Input that mixes bytes and str filenames
mixed_data = [b'file10.txt', 'file2.txt', b'file1.txt', 'file20.txt']

# Method 1: decode everything up front with a decoder, then sort the strings
utf8_decoder = decoder('utf-8')
decoded_data = list(map(utf8_decoder, mixed_data))
sorted_decoded = natsorted(decoded_data)
print(f"Decoded and sorted: {sorted_decoded}")

# Method 2: leave the data untouched and decode only inside the sort key
sorted_mixed = natsorted(mixed_data, key=as_utf8)
print(f"Sorted with UTF-8 key: {sorted_mixed}")

# Method 3: same idea with the ASCII codec, for ASCII-only data
ascii_mixed = [b'fileA.txt', 'fileB.txt', b'file1.txt']
sorted_ascii = natsorted(ascii_mixed, key=as_ascii)
print(f"Sorted with ASCII key: {sorted_ascii}")

Function Chaining for Complex Transformations

from natsort import natsorted, chain_functions
from pathlib import Path

# File paths that need complex preprocessing
file_paths = [
    '/home/user/Documents/Project_v1.10.txt',
    '/home/user/Documents/Project_v1.2.txt',
    '/var/log/system_log_v2.1.txt',
    '/tmp/temp_file_v1.0.txt',
]

# Chain of transformations: Path -> filename -> lowercase -> remove extension
transform_chain = chain_functions([
    lambda x: Path(x).name,           # Get filename only
    lambda x: x.lower(),              # Convert to lowercase
    lambda x: x.rsplit('.', 1)[0],    # Remove extension
])

# Sort using the chained transformation
sorted_files = natsorted(file_paths, key=transform_chain)
print("Sorted by transformed filename:")
# Walk the *sorted* list so the printed order matches the heading; each line
# shows the original path next to the key it was sorted by.
for sorted_path in sorted_files:
    transformed = transform_chain(sorted_path)
    print(f"  {sorted_path} -> '{transformed}'")

Regular Expression Exploration

from natsort import numeric_regex_chooser, ns
import re

# (label, algorithm flags) pairs whose number-matching regexes are compared
algorithms = [
    ('INT (default)', ns.INT),
    ('FLOAT', ns.FLOAT),
    ('SIGNED', ns.SIGNED),
    ('REAL (FLOAT|SIGNED)', ns.REAL),
    ('FLOAT without exponents', ns.FLOAT | ns.NOEXP),
]

# One string containing a signed float with exponent, an integer and a decimal
test_string = "item-1.5e+3_version2.10_beta"

print("Regular expression patterns and matches:")
for label, flags in algorithms:
    regex = numeric_regex_chooser(flags)
    found = re.findall(regex, test_string)
    # Pattern on the first line, its matches aligned underneath
    print(f"{label:25}: {regex}")
    print(f"{'':25}  Matches: {found}")
    print()

Command Line Interface Usage

# Examples of using the natsort command-line interface

# Note: These would be run from the command line, not in Python

"""
# Basic usage - sort lines from a file
$ cat data.txt
item10
item2
item1  
item20

$ natsort < data.txt
item1
item2
item10
item20

# Sort file paths 
$ natsort --paths folder/file10.txt folder/file2.txt folder/file1.txt
folder/file1.txt
folder/file2.txt  
folder/file10.txt

# Sort with real numbers and reverse order
$ echo -e "val-1.5\nval2.3\nval-0.8" | natsort --number-type real --reverse
val2.3
val-0.8
val-1.5

# Filter by numeric range
$ echo -e "item1\nitem25\nitem5\nitem30" | natsort --filter 1 10
item1
item5

# Exclude specific numbers
$ echo -e "test1\ntest2\ntest3\ntest10" | natsort --exclude 2
test1
test3  
test10

# Locale-aware sorting (results depend on system locale)
$ echo -e "café\nnaive\nresume" | natsort --locale
"""

# Programmatic access to CLI functionality
from natsort.__main__ import main
import sys
from io import StringIO

# Capture stdout so the CLI output can be inspected programmatically
old_stdout = sys.stdout
sys.stdout = captured_output = StringIO()

try:
    # Simulate command line arguments
    main('--number-type', 'real', '--reverse')
    # Note: This would normally read from stdin
except SystemExit:
    pass  # CLI exits normally
finally:
    # Restore stdout unconditionally: if main() raised anything other than
    # SystemExit, later output would otherwise keep going to the buffer.
    sys.stdout = old_stdout

output = captured_output.getvalue()

Advanced Text Processing Patterns

from natsort import natsorted, chain_functions, as_utf8
import unicodedata

# Sample filenames that differ only by accents and letter case
international_files = [
    "Résumé_v1.10.pdf",
    "résumé_v1.2.pdf",
    "NAÏVE_algorithm_v2.1.txt",
    "naïve_algorithm_v1.0.txt",
]

# Create a complex processing chain
def normalize_unicode(text):
    """Return *text* in canonical decomposition form (NFD).

    In NFD, an accented letter is split into its base letter followed by
    separate combining marks.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed

def remove_accents(text):
    """Remove accent marks from *text*.

    The text is first decomposed (NFD) so that precomposed characters such
    as U+00E9 are split into a base letter plus combining marks; without
    this step such characters would pass through untouched. Every
    nonspacing mark (Unicode category ``Mn``) is then dropped.

    Parameters:
    - text: str - Text that may contain accented characters

    Returns:
    str - The decomposed text with all nonspacing marks removed
    """
    # NFD is idempotent, so input that is already decomposed is unaffected.
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

# Key pipeline: decode -> decompose -> strip accent marks -> lowercase
text_processor = chain_functions([
    as_utf8,              # bytes become str; str passes through
    normalize_unicode,    # split accented letters into base + marks
    remove_accents,       # drop the marks
    str.lower,            # fold case
])

# Sort the original names by their processed form
sorted_international = natsorted(international_files, key=text_processor)

print("Original -> Processed key:")
for filename in international_files:
    print(f"  {filename} -> {text_processor(filename)}")

print(f"\nSorted order: {sorted_international}")

Integration with Data Processing Pipelines

from natsort import natsorted, chain_functions
import json
from pathlib import Path

# Simulate a data processing pipeline
def process_log_files(directory):
    """Describe the ``*.log`` files directly inside *directory*, in natural order.

    Files are ordered by their lower-cased stem (name without the final
    extension) using natural sorting.

    Parameters:
    - directory: path-like - Directory to search for ``*.log`` files

    Returns:
    list[dict] - One dict per file with keys 'filename', 'size' (bytes, or
    0 when the path does not exist) and 'processed' (always True)
    """

    def _describe(path):
        # Simulated processing step: just report basic facts about the file
        return {
            'filename': path.name,
            'size': path.stat().st_size if path.exists() else 0,
            'processed': True,
        }

    candidates = list(Path(directory).glob('*.log'))

    # Sort key: stem first, then lower-case for case-insensitive ordering
    stem_lowercase = chain_functions([lambda p: p.stem, str.lower])

    return [_describe(path) for path in natsorted(candidates, key=stem_lowercase)]

# Example usage (would work with real directory)
# results = process_log_files('/var/log/')
# print(json.dumps(results, indent=2))

Install with Tessl CLI