# File I/O Operations

High-performance reading and writing of various file formats with automatic type detection, memory-efficient processing, and support for large datasets.

## Capabilities

### Fast File Reading

High-performance CSV and text file reading with automatic type detection and parallel processing.

```python { .api }
def fread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          sep=None, dec='.', max_nrows=None, header=None, na_strings=None,
          verbose=False, fill=False, encoding=None, quotechar=None,
          skip_to_string=None, skip_to_line=None, skip_blank_lines=False,
          strip_whitespace=True, columns=None, nthreads=None, logger=None,
          multiple=None, **kwargs) -> Frame:
    """
    Read text/CSV files into a datatable Frame with high performance.

    Parameters:
    - anysource: File path, URL, text string, or file-like object
    - file: File path (alternative to anysource)
    - text: Text string to parse (alternative to anysource)
    - cmd: Shell command whose output is read (alternative to anysource)
    - url: URL to read from (alternative to anysource)
    - sep: Field separator character (auto-detected if None)
    - dec: Decimal point character (default '.')
    - max_nrows: Maximum number of rows to read
    - header: Whether the first row contains column names (auto-detected if None)
    - na_strings: Additional strings to treat as missing values
    - verbose: Print progress information
    - fill: Fill incomplete rows with NAs
    - encoding: Text encoding (auto-detected if None)
    - quotechar: Quote character (auto-detected if None)
    - skip_to_string: Skip lines until this string is found
    - skip_to_line: Number of lines to skip at the start
    - skip_blank_lines: Skip blank lines
    - strip_whitespace: Strip whitespace from string fields
    - columns: Select specific columns to read
    - nthreads: Number of threads to use (auto-detected if None)
    - logger: Custom logger for progress messages
    - multiple: How to handle multiple input files

    Returns:
    Frame object containing the parsed data
    """

def iread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          **kwargs):
    """
    Incremental reader that yields Frame chunks for large files.

    Parameters: Same as fread()

    Yields:
    Frame objects, one per chunk of data
    """
```

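A quick sanity check of the two entry points above, sketched against the signatures as documented (the inline CSV keeps it self-contained):

```python
import datatable as dt

csv_text = "id,score\n1,0.5\n2,0.75\n3,0.9\n"

# fread parses everything at once into a single Frame.
DT = dt.fread(text=csv_text)
print(DT.shape)   # (3, 2)
print(DT.names)   # ('id', 'score')

# iread yields Frames incrementally; a tiny input arrives as a
# single chunk, but the loop is the same for multi-gigabyte files.
for chunk in dt.iread(text=csv_text):
    print(chunk.nrows)
```
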
### File Writing

Write Frame data to various output formats with customizable formatting options.

```python { .api }
# Frame method for CSV output
def to_csv(self, file=None, *, sep=',', na_rep='', header=True,
           quotechar='"', encoding='utf-8', verbose=False, **kwargs):
    """
    Write Frame to CSV file or return as string.

    Parameters:
    - file: Output file path (returns string if None)
    - sep: Field separator character
    - na_rep: String representation of missing values
    - header: Include column headers
    - quotechar: Quote character for strings
    - encoding: Text encoding for output file
    - verbose: Print progress information

    Returns:
    None if file specified, CSV string otherwise
    """
```

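A minimal round trip under the API above: serialize a Frame to a CSV string, then parse it back with fread.

```python
import datatable as dt

DT = dt.Frame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})

# With no file argument, to_csv() returns the CSV text.
csv_text = DT.to_csv()

# Parse the string back into a new Frame.
DT2 = dt.fread(text=csv_text)

# The round trip should preserve the data exactly.
assert DT2.to_csv() == csv_text
```
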
## File Reading Examples

### Basic CSV Reading

```python
import datatable as dt

# Read from file path
DT = dt.fread("data.csv")

# Read with specific separator
DT = dt.fread("data.tsv", sep='\t')

# Read from URL
DT = dt.fread("https://example.com/data.csv")

# Read from compressed file
DT = dt.fread("data.csv.gz")

# Read only first 1000 rows
DT = dt.fread("large_data.csv", max_nrows=1000)
```

### Advanced Reading Options

```python
# Custom missing value strings
DT = dt.fread("data.csv", na_strings=['NULL', 'missing', ''])

# Skip header rows
DT = dt.fread("data.csv", skip_to_line=3)

# Skip to specific string
DT = dt.fread("data.csv", skip_to_string="START_DATA")

# Select specific columns
DT = dt.fread("data.csv", columns=['col1', 'col3', 'col5'])

# Control threading
DT = dt.fread("data.csv", nthreads=4)

# Verbose output
DT = dt.fread("data.csv", verbose=True)
```

### Reading from Different Sources

```python
# Read from string
csv_text = """A,B,C
1,x,1.1
2,y,2.2
3,z,3.3"""
DT = dt.fread(text=csv_text)

# Read from shell command
DT = dt.fread(cmd="curl https://example.com/data.csv")

# Read from file-like object
with open("data.csv", 'r') as f:
    DT = dt.fread(f)

# Read multiple files
DT = dt.fread(["file1.csv", "file2.csv"], multiple='rbind')
```

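When the set of input files is only known at run time, a glob pattern combines naturally with incremental reading. A sketch, assuming iread accepts a list of sources just as fread does above (the file pattern is a placeholder):

```python
import glob
import datatable as dt

# Expand the pattern, read each file, and stack the results.
files = sorted(glob.glob("data_part_*.csv"))
DT = dt.rbind(*dt.iread(files))
```
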
### Incremental Reading

```python
import datatable as dt
from datatable import f

# Process large files in chunks
for chunk in dt.iread("very_large_file.csv", max_nrows=10000):
    # Process each chunk, then save or accumulate the results
    processed = chunk[:, dt.sum(f.value)]

# Memory-efficient aggregation of large files
total = 0
count = 0
for chunk in dt.iread("huge_data.csv"):
    total += chunk[:, dt.sum(f.amount)][0, 0]
    count += chunk.nrows

average = total / count
```

### File Format Detection

```python
# Automatic format detection
DT = dt.fread("data.txt")  # Auto-detects separator
DT = dt.fread("data.psv")  # Pipe-separated values
DT = dt.fread("fixed_width.txt")  # Fixed-width format

# Override auto-detection
DT = dt.fread("data.txt", sep='|')
DT = dt.fread("data.csv", header=False)
```

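After an auto-detected read it is worth confirming what the parser decided. A short sketch, assuming the Frame exposes names and stypes attributes for introspection (verbose=True additionally logs the detection decisions):

```python
import datatable as dt

DT = dt.fread("data.txt", verbose=True)

# Confirm the detected column names and storage types.
print(DT.names)
print(DT.stypes)
```
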
## File Writing Examples

### Basic CSV Writing

```python
import datatable as dt

DT = dt.Frame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, 3.3, 4.4, 5.5]
})

# Write to file
DT.to_csv("output.csv")

# Write with custom separator
DT.to_csv("output.tsv", sep='\t')

# Write without header
DT.to_csv("output.csv", header=False)

# Custom missing value representation
DT.to_csv("output.csv", na_rep='NULL')
```

### String Output

```python
# Get CSV as string
csv_string = DT.to_csv()
print(csv_string)

# Custom formatting
csv_string = DT.to_csv(sep='|', quotechar="'")
```

### Large File Writing

```python
# Write large frames efficiently
large_DT = dt.Frame({'x': range(10000000)})
large_DT.to_csv("large_output.csv", verbose=True)

# Append to an existing file (using Python file handling); here each
# chunk comes from an incremental read of an illustrative source file
with open("growing_file.csv", 'a') as f:
    for chunk in dt.iread("incoming_data.csv", max_nrows=10000):
        f.write(chunk.to_csv(header=False))
```

## Performance Considerations

### Reading Performance Tips

```python
# Use multiple threads for large files
DT = dt.fread("big_file.csv", nthreads=8)

# Pre-specify column types for faster parsing
DT = dt.fread("data.csv", columns={'A': dt.int32, 'B': dt.str32})

# Limit columns for faster reading
DT = dt.fread("wide_data.csv", columns=['col1', 'col3', 'col7'])

# Use incremental reading for very large files
for chunk in dt.iread("massive_file.csv", max_nrows=100000):
    # Process incrementally to avoid memory issues
    pass
```

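To pick a good nthreads value for a particular machine, a quick timing loop over a representative file is usually enough. A sketch using only the standard library; the file name is a placeholder:

```python
import time
import datatable as dt

# Compare wall-clock read times across thread counts.
for n in (1, 2, 4, 8):
    start = time.perf_counter()
    DT = dt.fread("big_file.csv", nthreads=n)
    print(f"nthreads={n}: {time.perf_counter() - start:.2f}s")
```

The first pass also warms the OS page cache, so run the loop twice and trust the second set of timings.
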
### Memory Efficiency

```python
import datatable as dt
from datatable import f

# Memory-mapped reading for out-of-core processing
DT = dt.fread("huge_file.csv")  # Uses memory mapping automatically

# Process data in chunks to control memory usage
def process_large_file(filename):
    partials = []
    for chunk in dt.iread(filename, max_nrows=50000):
        partials.append(chunk[:, dt.sum(f.value), dt.by(f.category)])
    # The same category can appear in several chunks, so re-aggregate
    # the partial sums after combining them
    combined = dt.rbind(*partials)
    return combined[:, dt.sum(f.value), dt.by(f.category)]
```

## Error Handling

```python
import os
import datatable as dt

try:
    DT = dt.fread("might_not_exist.csv")
except dt.exceptions.IOError as e:
    print(f"File reading failed: {e}")

try:
    DT = dt.fread("malformed.csv")
except dt.exceptions.ValueError as e:
    print(f"Parsing error: {e}")

# Graceful handling of missing files
if os.path.exists("data.csv"):
    DT = dt.fread("data.csv")
else:
    DT = dt.Frame()  # Empty frame as fallback
```

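If the fallback pattern above recurs, it can be wrapped once. safe_fread below is a hypothetical convenience helper, not part of the library:

```python
import datatable as dt

def safe_fread(path, **kwargs):
    """Read with fread, returning an empty Frame on failure.

    Hypothetical helper for illustration; swallowing exceptions is
    only appropriate when a missing or malformed file is expected.
    """
    try:
        return dt.fread(path, **kwargs)
    except (dt.exceptions.IOError, dt.exceptions.ValueError):
        return dt.Frame()

DT = safe_fread("maybe_missing.csv")
```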