Python package for manipulating 2-dimensional tabular data structures, with an emphasis on speed and big-data support.
High-performance reading and writing of various file formats with automatic type detection, memory-efficient processing, and support for large datasets.
High-performance CSV and text file reading with automatic type detection and parallel processing.
def fread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          sep=None, dec='.', max_nrows=None, header=None, na_strings=None,
          verbose=False, fill=False, encoding=None, quotechar=None,
          skip_to_string=None, skip_to_line=None, skip_blank_lines=False,
          strip_whitespace=True, columns=None, nthreads=None, logger=None,
          multiple=None, **kwargs) -> "Frame":
    # NOTE: the return annotation is a string (forward reference) because
    # Frame is declared elsewhere in the datatable package; a bare `Frame`
    # name would be evaluated at import time and raise NameError.
    """
    Read text/CSV files into a datatable Frame with high performance.

    Exactly one of ``anysource``, ``file``, ``text``, ``cmd`` or ``url``
    identifies the input; the remaining keyword-only parameters tune parsing.

    Parameters:
    - anysource: File path, URL, text string, or file-like object
    - file: File path (alternative to anysource)
    - text: Text string to parse (alternative to anysource)
    - cmd: Shell command whose output to read (alternative to anysource)
    - url: URL to read from (alternative to anysource)
    - sep: Field separator character (auto-detected if None)
    - dec: Decimal point character (default '.')
    - max_nrows: Maximum number of rows to read
    - header: Whether the first row contains headers (auto-detected if None)
    - na_strings: Additional strings to treat as missing values
    - verbose: Print progress information
    - fill: Fill incomplete rows with NAs
    - encoding: Text encoding (auto-detected if None)
    - quotechar: Quote character (auto-detected if None)
    - skip_to_string: Skip lines until this string is found
    - skip_to_line: Skip this number of lines at the start
    - skip_blank_lines: Skip blank lines
    - strip_whitespace: Strip whitespace from string fields
    - columns: Select specific columns to read
    - nthreads: Number of threads to use (auto-detected if None)
    - logger: Custom logger for progress messages
    - multiple: How to handle multiple input files
    Returns:
        Frame object containing the parsed data
    """
def iread(anysource=None, *, file=None, text=None, cmd=None, url=None,
**kwargs):
"""
Incremental reader that yields Frame chunks for large files.
Parameters: Same as fread()
Yields:
Frame objects for each chunk of data
"""Write Frame data to various output formats with customizable formatting options.
# Frame method for CSV output
def to_csv(self, file=None, *, sep=',', na_rep='', header=True,
           quotechar='"', encoding='utf-8', verbose=False, **kwargs):
    """
    Write Frame to CSV file or return as string.

    Parameters:
    - file: Output file path (returns a string if None)
    - sep: Field separator character
    - na_rep: String representation of missing values
    - header: Include column headers
    - quotechar: Quote character for strings
    - encoding: Text encoding for the output file
    - verbose: Print progress information
    Returns:
        None if file specified, CSV string otherwise
    """

import datatable as dt
# Read from file path
DT = dt.fread("data.csv")
# Read with specific separator
DT = dt.fread("data.tsv", sep='\t')
# Read from URL
DT = dt.fread("https://example.com/data.csv")
# Read from compressed file
DT = dt.fread("data.csv.gz")
# Read only first 1000 rows
DT = dt.fread("large_data.csv", max_nrows=1000)
# Custom missing value strings
DT = dt.fread("data.csv", na_strings=['NULL', 'missing', ''])
# Skip header rows
DT = dt.fread("data.csv", skip_to_line=3)
# Skip to specific string
DT = dt.fread("data.csv", skip_to_string="START_DATA")
# Select specific columns
DT = dt.fread("data.csv", columns=['col1', 'col3', 'col5'])
# Control threading
DT = dt.fread("data.csv", nthreads=4)
# Verbose output
DT = dt.fread("data.csv", verbose=True)
# Read from string
csv_text = """A,B,C
1,x,1.1
2,y,2.2
3,z,3.3"""
DT = dt.fread(text=csv_text)
# Read from shell command
DT = dt.fread(cmd="curl https://example.com/data.csv")
# Read from file-like object
with open("data.csv", 'r') as f:
    DT = dt.fread(f)
# Read multiple files
DT = dt.fread(["file1.csv", "file2.csv"], multiple='rbind')
# Process large files in chunks
# NOTE(review): `f` here is datatable's f-expression namespace — presumably
# `from datatable import f` elsewhere in the docs; confirm before running.
for chunk in dt.iread("very_large_file.csv", max_nrows=10000):
    # Process each chunk
    processed = chunk[:, dt.sum(f.value)]
    # Save or accumulate results
# Memory-efficient aggregation of large files
total = 0
count = 0
for chunk in dt.iread("huge_data.csv"):
    total += chunk[:, dt.sum(f.amount)][0, 0]
    count += chunk.nrows
# NOTE(review): raises ZeroDivisionError if the file yields no rows.
average = total / count
# Automatic format detection
DT = dt.fread("data.txt")  # Auto-detects separator
DT = dt.fread("data.psv")  # Pipe-separated values
DT = dt.fread("fixed_width.txt")  # Fixed-width format
# Override auto-detection
DT = dt.fread("data.txt", sep='|')
DT = dt.fread("data.csv", header=False)

import datatable as dt
DT = dt.Frame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, 3.3, 4.4, 5.5]
})
# Write to file
DT.to_csv("output.csv")
# Write with custom separator
DT.to_csv("output.tsv", sep='\t')
# Write without header
DT.to_csv("output.csv", header=False)
# Custom missing value representation
DT.to_csv("output.csv", na_rep='NULL')
# Get CSV as string
csv_string = DT.to_csv()
print(csv_string)
# Custom formatting
csv_string = DT.to_csv(sep='|', quotechar="'")
# Write large frames efficiently
large_DT = dt.Frame({'x': range(10000000)})
large_DT.to_csv("large_output.csv", verbose=True)
# Append to existing file (using Python file handling)
# NOTE(review): `chunk` is undefined in this snippet — it presumably comes
# from an enclosing dt.iread() loop; confirm before running.
with open("growing_file.csv", 'a') as f:
    chunk_csv = chunk.to_csv(header=False)
    f.write(chunk_csv)
# Use multiple threads for large files
DT = dt.fread("big_file.csv", nthreads=8)
# Pre-specify column types for faster parsing
DT = dt.fread("data.csv", columns={'A': dt.int32, 'B': dt.str32})
# Limit columns for faster reading
DT = dt.fread("wide_data.csv", columns=['col1', 'col3', 'col7'])
# Use incremental reading for very large files
for chunk in dt.iread("massive_file.csv", max_nrows=100000):
    # Process incrementally to avoid memory issues
    pass
# Memory-mapped reading for out-of-core processing
DT = dt.fread("huge_file.csv")  # Uses memory mapping automatically
# Process data in chunks to control memory usage
def process_large_file(filename):
    """Aggregate a large CSV by category, reading it in 50,000-row chunks.

    Parameters:
    - filename: path of the CSV file to process
    Returns:
        A single Frame obtained by row-binding the per-chunk aggregates.
    """
    results = []
    for chunk in dt.iread(filename, max_nrows=50000):
        result = chunk[:, dt.sum(f.value), dt.by(f.category)]
        results.append(result)
    return dt.rbind(*results)

try:
    DT = dt.fread("might_not_exist.csv")
except dt.exceptions.IOError as e:
    print(f"File reading failed: {e}")
try:
    DT = dt.fread("malformed.csv")
except dt.exceptions.ValueError as e:
    print(f"Parsing error: {e}")
# Graceful handling of missing files
import os
if os.path.exists("data.csv"):
DT = dt.fread("data.csv")
else:
DT = dt.Frame() # Empty frame as fallbackInstall with Tessl CLI
npx tessl i tessl/pypi-datatable