Python package for manipulating 2-dimensional tabular data structures, with an emphasis on speed and big-data support.
High-performance reading and writing of various file formats with automatic type detection, memory-efficient processing, and support for large datasets.
High-performance CSV and text file reading with automatic type detection and parallel processing.
def fread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          sep=None, dec='.', max_nrows=None, header=None, na_strings=None,
          verbose=False, fill=False, encoding=None, quotechar=None,
          skip_to_string=None, skip_to_line=None, skip_blank_lines=False,
          strip_whitespace=True, columns=None, nthreads=None, logger=None,
          multiple=None, **kwargs) -> "Frame":
    # NOTE: the return annotation is a string (forward reference) because
    # Frame is declared elsewhere in the datatable package; a bare `Frame`
    # name would be evaluated at import time and raise NameError.
    """
    Read text/CSV files into a datatable Frame with high performance.

    Exactly one of ``anysource``, ``file``, ``text``, ``cmd`` or ``url``
    identifies the input; the remaining keyword-only parameters tune parsing.

    Parameters:
    - anysource: File path, URL, text string, or file-like object
    - file: File path (alternative to anysource)
    - text: Text string to parse (alternative to anysource)
    - cmd: Shell command whose output to read (alternative to anysource)
    - url: URL to read from (alternative to anysource)
    - sep: Field separator character (auto-detected if None)
    - dec: Decimal point character (default '.')
    - max_nrows: Maximum number of rows to read
    - header: Whether the first row contains headers (auto-detected if None)
    - na_strings: Additional strings to treat as missing values
    - verbose: Print progress information
    - fill: Fill incomplete rows with NAs
    - encoding: Text encoding (auto-detected if None)
    - quotechar: Quote character (auto-detected if None)
    - skip_to_string: Skip lines until this string is found
    - skip_to_line: Skip this number of lines at the start
    - skip_blank_lines: Skip blank lines
    - strip_whitespace: Strip whitespace from string fields
    - columns: Select specific columns to read
    - nthreads: Number of threads to use (auto-detected if None)
    - logger: Custom logger for progress messages
    - multiple: How to handle multiple input files
    Returns:
        Frame object containing the parsed data
    """
def iread(anysource=None, *, file=None, text=None, cmd=None, url=None,
**kwargs):
"""
Incremental reader that yields Frame chunks for large files.
Parameters: Same as fread()
Yields:
Frame objects for each chunk of data
"""Write Frame data to various output formats with customizable formatting options.
# Frame method for CSV output
def to_csv(self, file=None, *, sep=',', na_rep='', header=True,
           quotechar='"', encoding='utf-8', verbose=False, **kwargs):
    """
    Write Frame to CSV file or return as string.

    Parameters:
    - file: Output file path (returns a string if None)
    - sep: Field separator character
    - na_rep: String representation of missing values
    - header: Include column headers
    - quotechar: Quote character for strings
    - encoding: Text encoding for the output file
    - verbose: Print progress information
    Returns:
        None if file specified, CSV string otherwise
    """

import datatable as dt
# Read from file path
DT = dt.fread("data.csv")
# Read with specific separator
DT = dt.fread("data.tsv", sep='\t')
# Read from URL
DT = dt.fread("https://example.com/data.csv")
# Read from compressed file
DT = dt.fread("data.csv.gz")
# Read only first 1000 rows
DT = dt.fread("large_data.csv", max_nrows=1000)
# Custom missing value strings
DT = dt.fread("data.csv", na_strings=['NULL', 'missing', ''])
# Skip header rows
DT = dt.fread("data.csv", skip_to_line=3)
# Skip to specific string
DT = dt.fread("data.csv", skip_to_string="START_DATA")
# Select specific columns
DT = dt.fread("data.csv", columns=['col1', 'col3', 'col5'])
# Control threading
DT = dt.fread("data.csv", nthreads=4)
# Verbose output
DT = dt.fread("data.csv", verbose=True)
# Read from string
csv_text = """A,B,C
1,x,1.1
2,y,2.2
3,z,3.3"""
DT = dt.fread(text=csv_text)
# Read from shell command
DT = dt.fread(cmd="curl https://example.com/data.csv")
# Read from file-like object
with open("data.csv", 'r') as f:
    DT = dt.fread(f)
# Read multiple files
DT = dt.fread(["file1.csv", "file2.csv"], multiple='rbind')
# Process large files in chunks
# NOTE(review): `f` here is datatable's f-expression namespace — presumably
# `from datatable import f` elsewhere in the docs; confirm before running.
for chunk in dt.iread("very_large_file.csv", max_nrows=10000):
    # Process each chunk
    processed = chunk[:, dt.sum(f.value)]
    # Save or accumulate results
# Memory-efficient aggregation of large files
total = 0
count = 0
for chunk in dt.iread("huge_data.csv"):
    total += chunk[:, dt.sum(f.amount)][0, 0]
    count += chunk.nrows
# NOTE(review): raises ZeroDivisionError if the file yields no rows.
average = total / count
# Automatic format detection
DT = dt.fread("data.txt")  # Auto-detects separator
DT = dt.fread("data.psv")  # Pipe-separated values
DT = dt.fread("fixed_width.txt")  # Fixed-width format
# Override auto-detection
DT = dt.fread("data.txt", sep='|')
DT = dt.fread("data.csv", header=False)

import datatable as dt
DT = dt.Frame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, 3.3, 4.4, 5.5]
})
# Write to file
DT.to_csv("output.csv")
# Write with custom separator
DT.to_csv("output.tsv", sep='\t')
# Write without header
DT.to_csv("output.csv", header=False)
# Custom missing value representation
DT.to_csv("output.csv", na_rep='NULL')
# Get CSV as string
csv_string = DT.to_csv()
print(csv_string)
# Custom formatting
csv_string = DT.to_csv(sep='|', quotechar="'")
# Write large frames efficiently
large_DT = dt.Frame({'x': range(10000000)})
large_DT.to_csv("large_output.csv", verbose=True)
# Append to existing file (using Python file handling)
# NOTE(review): `chunk` is undefined in this snippet — it presumably comes
# from an enclosing dt.iread() loop; confirm before running.
with open("growing_file.csv", 'a') as f:
    chunk_csv = chunk.to_csv(header=False)
    f.write(chunk_csv)
# Use multiple threads for large files
DT = dt.fread("big_file.csv", nthreads=8)
# Pre-specify column types for faster parsing
DT = dt.fread("data.csv", columns={'A': dt.int32, 'B': dt.str32})
# Limit columns for faster reading
DT = dt.fread("wide_data.csv", columns=['col1', 'col3', 'col7'])
# Use incremental reading for very large files
for chunk in dt.iread("massive_file.csv", max_nrows=100000):
    # Process incrementally to avoid memory issues
    pass
# Memory-mapped reading for out-of-core processing
DT = dt.fread("huge_file.csv")  # Uses memory mapping automatically
# Process data in chunks to control memory usage
def process_large_file(filename):
    """Aggregate a large CSV by category, reading it in 50,000-row chunks.

    Parameters:
    - filename: path of the CSV file to process
    Returns:
        A single Frame obtained by row-binding the per-chunk aggregates.
    """
    results = []
    for chunk in dt.iread(filename, max_nrows=50000):
        result = chunk[:, dt.sum(f.value), dt.by(f.category)]
        results.append(result)
    return dt.rbind(*results)

try:
    DT = dt.fread("might_not_exist.csv")
except dt.exceptions.IOError as e:
    print(f"File reading failed: {e}")
try:
    DT = dt.fread("malformed.csv")
except dt.exceptions.ValueError as e:
    print(f"Parsing error: {e}")
# Graceful handling of missing files
import os
if os.path.exists("data.csv"):
DT = dt.fread("data.csv")
else:
DT = dt.Frame() # Empty frame as fallbackInstall with Tessl CLI
npx tessl i tessl/pypi-datatable