# File I/O Operations

High-performance reading and writing of various file formats with automatic type detection, memory-efficient processing, and support for large datasets.

## Capabilities

### Fast File Reading

High-performance CSV and text file reading with automatic type detection and parallel processing.

```python { .api }
def fread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          sep=None, dec='.', max_nrows=None, header=None, na_strings=None,
          verbose=False, fill=False, encoding=None, quotechar=None,
          skip_to_string=None, skip_to_line=None, skip_blank_lines=False,
          strip_whitespace=True, columns=None, nthreads=None, logger=None,
          multiple=None, **kwargs) -> Frame:
    """
    Read text/CSV files into a datatable Frame with high performance.

    Parameters:
    - anysource: File path, URL, text string, or file-like object
    - file: File path (alternative to anysource)
    - text: Text string to parse (alternative to anysource)
    - cmd: Shell command whose output to read (alternative to anysource)
    - url: URL to read from (alternative to anysource)
    - sep: Field separator character (auto-detected if None)
    - dec: Decimal point character (default '.')
    - max_nrows: Maximum number of rows to read
    - header: Whether the first row contains headers (auto-detected if None)
    - na_strings: Additional strings to treat as missing values
    - verbose: Print progress information
    - fill: Fill incomplete rows with NAs
    - encoding: Text encoding (auto-detected if None)
    - quotechar: Quote character (auto-detected if None)
    - skip_to_string: Skip lines until this string is found
    - skip_to_line: Skip this number of lines at the start
    - skip_blank_lines: Skip blank lines
    - strip_whitespace: Strip whitespace from string fields
    - columns: Select specific columns to read
    - nthreads: Number of threads to use (auto-detected if None)
    - logger: Custom logger for progress messages
    - multiple: How to handle multiple input files

    Returns:
        Frame object containing the parsed data
    """

def iread(anysource=None, *, file=None, text=None, cmd=None, url=None,
          **kwargs):
    """
    Incremental reader that yields Frame chunks for large files.

    Parameters: Same as fread()

    Yields:
        Frame objects for each chunk of data
    """
```
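
For orientation, a minimal sketch of `fread` on inline text (the CSV content is made up for illustration): it parses the string exactly like a file, auto-detects column types, and returns a Frame:

```python
import datatable as dt

# Inline text is parsed just like a file on disk
DT = dt.fread(text="a,b\n1,x\n2,y\n3,z")
print(DT.shape)  # (3, 2)
print(DT.names)  # ('a', 'b')
```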

### File Writing

Write Frame data to various output formats with customizable formatting options.

```python { .api }
# Frame method for CSV output
def to_csv(self, file=None, *, sep=',', na_rep='', header=True,
           quotechar='"', encoding='utf-8', verbose=False, **kwargs):
    """
    Write Frame to CSV file or return as string.

    Parameters:
    - file: Output file path (returns string if None)
    - sep: Field separator character
    - na_rep: String representation of missing values
    - header: Include column headers
    - quotechar: Quote character for strings
    - encoding: Text encoding for output file
    - verbose: Print progress information

    Returns:
        None if file specified, CSV string otherwise
    """
```
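
As a quick check on the write path, a minimal round-trip sketch (the file name `roundtrip.csv` is arbitrary): write a small Frame to CSV, read it back, and compare shapes and names:

```python
import datatable as dt

DT = dt.Frame({'id': [1, 2, 3], 'name': ['a', 'b', None]})
DT.to_csv("roundtrip.csv")       # write to disk
DT2 = dt.fread("roundtrip.csv")  # read it back

assert DT2.shape == DT.shape     # same dimensions
assert DT2.names == DT.names     # same column names
```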

## File Reading Examples

### Basic CSV Reading

```python
import datatable as dt

# Read from file path
DT = dt.fread("data.csv")

# Read with specific separator
DT = dt.fread("data.tsv", sep='\t')

# Read from URL
DT = dt.fread("https://example.com/data.csv")

# Read from compressed file
DT = dt.fread("data.csv.gz")

# Read only the first 1000 rows
DT = dt.fread("large_data.csv", max_nrows=1000)
```

### Advanced Reading Options

```python
# Custom missing value strings
DT = dt.fread("data.csv", na_strings=['NULL', 'missing', ''])

# Skip header rows
DT = dt.fread("data.csv", skip_to_line=3)

# Skip to a specific string
DT = dt.fread("data.csv", skip_to_string="START_DATA")

# Select specific columns
DT = dt.fread("data.csv", columns=['col1', 'col3', 'col5'])

# Control threading
DT = dt.fread("data.csv", nthreads=4)

# Verbose output
DT = dt.fread("data.csv", verbose=True)
```

### Reading from Different Sources

```python
# Read from string
csv_text = """A,B,C
1,x,1.1
2,y,2.2
3,z,3.3"""
DT = dt.fread(text=csv_text)

# Read from shell command
DT = dt.fread(cmd="curl https://example.com/data.csv")

# Read from file-like object
with open("data.csv", 'r') as f:
    DT = dt.fread(f)

# Read multiple files
DT = dt.fread(["file1.csv", "file2.csv"], multiple='rbind')
```
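
When the files should stay separate rather than being combined, `iread` can iterate them one Frame at a time. A small sketch, assuming `file1.csv` and `file2.csv` exist; the `frame.source` attribute reporting the origin path is our assumption, not guaranteed by the API above:

```python
# One Frame per source file, processed independently
for frame in dt.iread(["file1.csv", "file2.csv"]):
    print(frame.source, frame.shape)  # origin path and dimensions (assumed)
```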

### Incremental Reading

```python
from datatable import f

# Process large files in chunks
for chunk in dt.iread("very_large_file.csv", max_nrows=10000):
    # Process each chunk
    processed = chunk[:, dt.sum(f.value)]
    # Save or accumulate results

# Memory-efficient aggregation of large files
total = 0
count = 0
for chunk in dt.iread("huge_data.csv"):
    total += chunk[:, dt.sum(f.amount)][0, 0]
    count += chunk.nrows

average = total / count
```

### File Format Detection

```python
# Automatic format detection
DT = dt.fread("data.txt")         # Auto-detects separator
DT = dt.fread("data.psv")         # Pipe-separated values
DT = dt.fread("fixed_width.txt")  # Fixed-width format

# Override auto-detection
DT = dt.fread("data.txt", sep='|')
DT = dt.fread("data.csv", header=False)
```
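
Auto-detection can be verified directly on inline text; a minimal sketch using made-up pipe-separated data:

```python
# No sep given: fread infers '|' from the text itself
DT = dt.fread(text="a|b|c\n1|2|3\n4|5|6")
print(DT.names)  # ('a', 'b', 'c')
print(DT.shape)  # (2, 3)
```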

## File Writing Examples

### Basic CSV Writing

```python
import datatable as dt

DT = dt.Frame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.2, 3.3, 4.4, 5.5]
})

# Write to file
DT.to_csv("output.csv")

# Write with custom separator
DT.to_csv("output.tsv", sep='\t')

# Write without header
DT.to_csv("output.csv", header=False)

# Custom missing value representation
DT.to_csv("output.csv", na_rep='NULL')
```

### String Output

```python
# Get CSV as string
csv_string = DT.to_csv()
print(csv_string)

# Custom formatting
csv_string = DT.to_csv(sep='|', quotechar="'")
```

### Large File Writing

```python
# Write large frames efficiently
large_DT = dt.Frame({'x': range(10000000)})
large_DT.to_csv("large_output.csv", verbose=True)

# Append to an existing file (using Python file handling)
with open("growing_file.csv", 'a') as out:
    for chunk in dt.iread("incoming_data.csv"):  # hypothetical chunked source
        chunk_csv = chunk.to_csv(header=False)   # skip header when appending
        out.write(chunk_csv)
```

## Performance Considerations

### Reading Performance Tips

```python
# Use multiple threads for large files
DT = dt.fread("big_file.csv", nthreads=8)

# Pre-specify column types for faster parsing
DT = dt.fread("data.csv", columns={'A': dt.int32, 'B': dt.str32})

# Limit columns for faster reading
DT = dt.fread("wide_data.csv", columns=['col1', 'col3', 'col7'])

# Use incremental reading for very large files
for chunk in dt.iread("massive_file.csv", max_nrows=100000):
    # Process incrementally to avoid memory issues
    pass
```
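
To see what threading buys on a particular machine, a quick, unscientific timing sketch; `big_file.csv` is a stand-in for any large local file, and absolute numbers will vary with hardware and file size:

```python
import time
import datatable as dt

for n in (1, 4, None):  # None lets datatable pick the thread count
    start = time.perf_counter()
    dt.fread("big_file.csv", nthreads=n)
    print(f"nthreads={n}: {time.perf_counter() - start:.2f}s")
```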

### Memory Efficiency

```python
from datatable import f

# Memory-mapped reading for out-of-core processing
DT = dt.fread("huge_file.csv")  # Uses memory mapping automatically

# Process data in chunks to control memory usage
def process_large_file(filename):
    results = []
    for chunk in dt.iread(filename, max_nrows=50000):
        result = chunk[:, dt.sum(f.value), dt.by(f.category)]
        results.append(result)
    return dt.rbind(*results)
```
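
A hypothetical call, assuming a `sales.csv` whose columns include `value` and `category`:

```python
summary = process_large_file("sales.csv")  # one aggregated Frame, built chunk by chunk
print(summary)
```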

## Error Handling

```python
try:
    DT = dt.fread("might_not_exist.csv")
except dt.exceptions.IOError as e:
    print(f"File reading failed: {e}")

try:
    DT = dt.fread("malformed.csv")
except dt.exceptions.ValueError as e:
    print(f"Parsing error: {e}")

# Graceful handling of missing files
import os
if os.path.exists("data.csv"):
    DT = dt.fread("data.csv")
else:
    DT = dt.Frame()  # Empty frame as fallback
```
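
The same pattern can be folded into a small helper; a sketch, where `safe_fread` is our name and not part of the library:

```python
import datatable as dt

def safe_fread(path, **kwargs):
    """Return a Frame read from `path`, or an empty Frame if reading fails."""
    try:
        return dt.fread(path, **kwargs)
    except Exception as e:  # broad catch for illustration only
        print(f"Could not read {path}: {e}")
        return dt.Frame()

DT = safe_fread("maybe_missing.csv")
```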