# File I/O Operations

High-performance reading and writing of various file formats with automatic type detection, memory-efficient processing, and support for large datasets.

## Capabilities

### Fast File Reading

High-performance CSV and text file reading with automatic type detection and parallel processing.

```python { .api }
def fread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          sep=None, dec='.', max_nrows=None, header=None, na_strings=None,
          verbose=False, fill=False, encoding=None, quotechar=None,
          skip_to_string=None, skip_to_line=None, skip_blank_lines=False,
          strip_whitespace=True, columns=None, nthreads=None, logger=None,
          multiple=None, **kwargs) -> Frame:
    """
    Read text/CSV files into a datatable Frame with high performance.

    Parameters:
    - anysource: File path, URL, text string, or file-like object
    - file: File path (alternative to anysource)
    - text: Text string to parse (alternative to anysource)
    - cmd: Shell command whose output is read (alternative to anysource)
    - url: URL to read from (alternative to anysource)
    - sep: Field separator character (auto-detected if None)
    - dec: Decimal point character (default '.')
    - max_nrows: Maximum number of rows to read
    - header: Whether the first row contains column names (auto-detected if None)
    - na_strings: Additional strings to treat as missing values
    - verbose: Print progress information
    - fill: Fill incomplete rows with NAs
    - encoding: Text encoding (auto-detected if None)
    - quotechar: Quote character (auto-detected if None)
    - skip_to_string: Skip lines until this string is found
    - skip_to_line: Number of lines to skip at the start
    - skip_blank_lines: Skip blank lines
    - strip_whitespace: Strip whitespace from string fields
    - columns: Select specific columns to read
    - nthreads: Number of threads to use (auto-detected if None)
    - logger: Custom logger for progress messages
    - multiple: How to handle multiple input files

    Returns:
    Frame object containing the parsed data
    """

def iread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          **kwargs):
    """
    Incremental reader that yields Frame chunks for large files.

    Parameters: Same as fread()

    Yields:
    Frame objects, one per chunk of data
    """
```

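A quick sanity check of the two entry points above, sketched against the signatures as documented (the inline CSV keeps it self-contained):

```python
import datatable as dt

csv_text = "id,score\n1,0.5\n2,0.75\n3,0.9\n"

# fread parses everything at once into a single Frame.
DT = dt.fread(text=csv_text)
print(DT.shape)   # (3, 2)
print(DT.names)   # ('id', 'score')

# iread yields Frames incrementally; a tiny input arrives as a
# single chunk, but the loop is the same for multi-gigabyte files.
for chunk in dt.iread(text=csv_text):
    print(chunk.nrows)
```
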
### File Writing

Write Frame data to various output formats with customizable formatting options.

```python { .api }
# Frame method for CSV output
def to_csv(self, file=None, *, sep=',', na_rep='', header=True,
           quotechar='"', encoding='utf-8', verbose=False, **kwargs):
    """
    Write Frame to CSV file or return as string.

    Parameters:
    - file: Output file path (returns string if None)
    - sep: Field separator character
    - na_rep: String representation of missing values
    - header: Include column headers
    - quotechar: Quote character for strings
    - encoding: Text encoding for output file
    - verbose: Print progress information

    Returns:
    None if file specified, CSV string otherwise
    """
```

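A minimal round trip under the API above: serialize a Frame to a CSV string, then parse it back with fread.

```python
import datatable as dt

DT = dt.Frame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})

# With no file argument, to_csv() returns the CSV text.
csv_text = DT.to_csv()

# Parse the string back into a new Frame.
DT2 = dt.fread(text=csv_text)

# The round trip should preserve the data exactly.
assert DT2.to_csv() == csv_text
```
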
## File Reading Examples

### Basic CSV Reading

```python
import datatable as dt

# Read from file path
DT = dt.fread("data.csv")

# Read with specific separator
DT = dt.fread("data.tsv", sep='\t')

# Read from URL
DT = dt.fread("https://example.com/data.csv")

# Read from compressed file
DT = dt.fread("data.csv.gz")

# Read only first 1000 rows
DT = dt.fread("large_data.csv", max_nrows=1000)
```

### Advanced Reading Options

```python
# Custom missing value strings
DT = dt.fread("data.csv", na_strings=['NULL', 'missing', ''])

# Skip header rows
DT = dt.fread("data.csv", skip_to_line=3)

# Skip to specific string
DT = dt.fread("data.csv", skip_to_string="START_DATA")

# Select specific columns
DT = dt.fread("data.csv", columns=['col1', 'col3', 'col5'])

# Control threading
DT = dt.fread("data.csv", nthreads=4)

# Verbose output
DT = dt.fread("data.csv", verbose=True)
```

### Reading from Different Sources

```python
# Read from string
csv_text = """A,B,C
1,x,1.1
2,y,2.2
3,z,3.3"""
DT = dt.fread(text=csv_text)

# Read from shell command
DT = dt.fread(cmd="curl https://example.com/data.csv")

# Read from file-like object
with open("data.csv", 'r') as f:
    DT = dt.fread(f)

# Read multiple files
DT = dt.fread(["file1.csv", "file2.csv"], multiple='rbind')
```

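When the set of input files is only known at run time, a glob pattern combines naturally with incremental reading. A sketch, assuming iread accepts a list of sources just as fread does above (the file pattern is a placeholder):

```python
import glob
import datatable as dt

# Expand the pattern, read each file, and stack the results.
files = sorted(glob.glob("data_part_*.csv"))
DT = dt.rbind(*dt.iread(files))
```
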
### Incremental Reading

```python
import datatable as dt
from datatable import f

# Process large files in chunks
for chunk in dt.iread("very_large_file.csv", max_nrows=10000):
    # Process each chunk, then save or accumulate the results
    processed = chunk[:, dt.sum(f.value)]

# Memory-efficient aggregation of large files
total = 0
count = 0
for chunk in dt.iread("huge_data.csv"):
    total += chunk[:, dt.sum(f.amount)][0, 0]
    count += chunk.nrows

average = total / count
```

### File Format Detection

```python
# Automatic format detection
DT = dt.fread("data.txt")  # Auto-detects separator
DT = dt.fread("data.psv")  # Pipe-separated values
DT = dt.fread("fixed_width.txt")  # Fixed-width format

# Override auto-detection
DT = dt.fread("data.txt", sep='|')
DT = dt.fread("data.csv", header=False)
```

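After an auto-detected read it is worth confirming what the parser decided. A short sketch, assuming the Frame exposes names and stypes attributes for introspection (verbose=True additionally logs the detection decisions):

```python
import datatable as dt

DT = dt.fread("data.txt", verbose=True)

# Confirm the detected column names and storage types.
print(DT.names)
print(DT.stypes)
```
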
## File Writing Examples

### Basic CSV Writing

```python
import datatable as dt

DT = dt.Frame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, 3.3, 4.4, 5.5]
})

# Write to file
DT.to_csv("output.csv")

# Write with custom separator
DT.to_csv("output.tsv", sep='\t')

# Write without header
DT.to_csv("output.csv", header=False)

# Custom missing value representation
DT.to_csv("output.csv", na_rep='NULL')
```

### String Output

```python
# Get CSV as string
csv_string = DT.to_csv()
print(csv_string)

# Custom formatting
csv_string = DT.to_csv(sep='|', quotechar="'")
```

### Large File Writing

```python
# Write large frames efficiently
large_DT = dt.Frame({'x': range(10000000)})
large_DT.to_csv("large_output.csv", verbose=True)

# Append to an existing file (using Python file handling); here each
# chunk comes from an incremental read of an illustrative source file
with open("growing_file.csv", 'a') as f:
    for chunk in dt.iread("incoming_data.csv", max_nrows=10000):
        f.write(chunk.to_csv(header=False))
```

## Performance Considerations

### Reading Performance Tips

```python
# Use multiple threads for large files
DT = dt.fread("big_file.csv", nthreads=8)

# Pre-specify column types for faster parsing
DT = dt.fread("data.csv", columns={'A': dt.int32, 'B': dt.str32})

# Limit columns for faster reading
DT = dt.fread("wide_data.csv", columns=['col1', 'col3', 'col7'])

# Use incremental reading for very large files
for chunk in dt.iread("massive_file.csv", max_nrows=100000):
    # Process incrementally to avoid memory issues
    pass
```

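To pick a good nthreads value for a particular machine, a quick timing loop over a representative file is usually enough. A sketch using only the standard library; the file name is a placeholder:

```python
import time
import datatable as dt

# Compare wall-clock read times across thread counts.
for n in (1, 2, 4, 8):
    start = time.perf_counter()
    DT = dt.fread("big_file.csv", nthreads=n)
    print(f"nthreads={n}: {time.perf_counter() - start:.2f}s")
```

The first pass also warms the OS page cache, so run the loop twice and trust the second set of timings.
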
### Memory Efficiency

```python
import datatable as dt
from datatable import f

# Memory-mapped reading for out-of-core processing
DT = dt.fread("huge_file.csv")  # Uses memory mapping automatically

# Process data in chunks to control memory usage
def process_large_file(filename):
    partials = []
    for chunk in dt.iread(filename, max_nrows=50000):
        partials.append(chunk[:, dt.sum(f.value), dt.by(f.category)])
    # The same category can appear in several chunks, so re-aggregate
    # the partial sums after combining them
    combined = dt.rbind(*partials)
    return combined[:, dt.sum(f.value), dt.by(f.category)]
```

## Error Handling

```python
import os
import datatable as dt

try:
    DT = dt.fread("might_not_exist.csv")
except dt.exceptions.IOError as e:
    print(f"File reading failed: {e}")

try:
    DT = dt.fread("malformed.csv")
except dt.exceptions.ValueError as e:
    print(f"Parsing error: {e}")

# Graceful handling of missing files
if os.path.exists("data.csv"):
    DT = dt.fread("data.csv")
else:
    DT = dt.Frame()  # Empty frame as fallback
```

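If the fallback pattern above recurs, it can be wrapped once. safe_fread below is a hypothetical convenience helper, not part of the library:

```python
import datatable as dt

def safe_fread(path, **kwargs):
    """Read with fread, returning an empty Frame on failure.

    Hypothetical helper for illustration; swallowing exceptions is
    only appropriate when a missing or malformed file is expected.
    """
    try:
        return dt.fread(path, **kwargs)
    except (dt.exceptions.IOError, dt.exceptions.ValueError):
        return dt.Frame()

DT = safe_fread("maybe_missing.csv")
```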