or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-reading-writing.md · data-reading.md · data-writing.md · dialect-detection.md · dialects-configuration.md · dictionary-operations.md · index.md

dialects-configuration.md · docs/

0

# Dialects and Configuration

1

2

Dialect classes and configuration utilities for managing CSV parsing parameters. CleverCSV provides enhanced dialect support with the SimpleDialect class and utilities for working with various CSV formats and configurations.

3

4

## Capabilities

5

6

### SimpleDialect Class

7

8

`SimpleDialect` is an enhanced dialect class that provides a simpler, more flexible alternative to Python's `csv.Dialect`, with better support for CleverCSV's detection algorithms.

9

10

```python { .api }

11

class SimpleDialect:

12

"""

13

Simplified dialect object for CSV parsing configuration.

14

15

For delimiter, quotechar, and escapechar:

16

- Empty string ('') means no delimiter/quotechar/escapechar in the file

17

- None is used to mark it as undefined

18

"""

19

20

def __init__(

21

self,

22

delimiter: Optional[str],

23

quotechar: Optional[str],

24

escapechar: Optional[str],

25

strict: bool = False

26

):

27

"""

28

Create a new SimpleDialect.

29

30

Parameters:

31

- delimiter: Field delimiter character

32

- quotechar: Quote character for fields containing special characters

33

- escapechar: Escape character for escaping delimiters/quotes

34

- strict: Whether to enforce strict parsing

35

"""

36

37

def validate(self) -> None:

38

"""

39

Validate dialect parameters.

40

41

Raises:

42

ValueError: If any parameter is invalid

43

"""

44

45

def to_csv_dialect(self) -> csv.Dialect:

46

"""

47

Convert to standard csv.Dialect object.

48

49

Returns:

50

csv.Dialect compatible object

51

"""

52

53

def to_dict(self) -> Dict[str, Union[str, bool, None]]:

54

"""

55

Convert dialect to dictionary representation.

56

57

Returns:

58

Dictionary with dialect parameters

59

"""

60

61

def serialize(self) -> str:

62

"""

63

Serialize dialect to JSON string.

64

65

Returns:

66

JSON string representation of dialect

67

"""

68

69

@classmethod

70

def deserialize(cls, obj: str) -> 'SimpleDialect':

71

"""

72

Deserialize dialect from JSON string.

73

74

Parameters:

75

- obj: JSON string representation

76

77

Returns:

78

SimpleDialect instance

79

"""

80

81

@classmethod

82

def from_dict(cls, d: Dict[str, Any]) -> 'SimpleDialect':

83

"""

84

Create SimpleDialect from dictionary.

85

86

Parameters:

87

- d: Dictionary with dialect parameters

88

89

Returns:

90

SimpleDialect instance

91

"""

92

93

@classmethod

94

def from_csv_dialect(cls, d: csv.Dialect) -> 'SimpleDialect':

95

"""

96

Create SimpleDialect from csv.Dialect.

97

98

Parameters:

99

- d: csv.Dialect instance

100

101

Returns:

102

SimpleDialect instance

103

"""

104

```

105

106

#### Usage Examples

107

108

```python

109

import clevercsv

110

import json

111

112

# Create custom dialect

113

dialect = clevercsv.SimpleDialect(',', '"', '\\', strict=True)

114

print(f"Delimiter: '{dialect.delimiter}'")

115

print(f"Quote char: '{dialect.quotechar}'")

116

print(f"Escape char: '{dialect.escapechar}'")

117

118

# Validate dialect

119

try:

120

dialect.validate()

121

print("Dialect is valid")

122

except ValueError as e:

123

print(f"Invalid dialect: {e}")

124

125

# Convert to csv.Dialect for use with standard library

126

csv_dialect = dialect.to_csv_dialect()

127

with open('data.csv', 'r', newline='') as f:

128

reader = csv.reader(f, dialect=csv_dialect)

129

data = list(reader)

130

131

# Serialize dialect for storage

132

serialized = dialect.serialize()

133

print(f"Serialized: {serialized}")

134

135

# Deserialize dialect

136

restored_dialect = clevercsv.SimpleDialect.deserialize(serialized)

137

print(f"Restored: {restored_dialect}")

138

139

# Create from dictionary

140

dialect_dict = {'delimiter': ';', 'quotechar': "'", 'escapechar': '', 'strict': False}

141

dialect_from_dict = clevercsv.SimpleDialect.from_dict(dialect_dict)

142

143

# Create from csv.Dialect

144

csv_excel = csv.excel

145

simple_from_csv = clevercsv.SimpleDialect.from_csv_dialect(csv_excel)

146

```

147

148

### Predefined Dialects

149

150

CleverCSV provides access to standard CSV dialects for common formats.

151

152

```python { .api }

153

# Standard CSV dialects

154

excel: csv.Dialect # Excel-compatible format (comma-separated, quoted fields)

155

excel_tab: csv.Dialect # Excel tab-separated format

156

unix_dialect: csv.Dialect # Unix-style format (comma-separated, quoted fields, escaped quotes)

157

```

158

159

#### Usage Examples

160

161

```python

162

import clevercsv

163

164

# Use predefined dialects

165

with open('data.csv', 'r', newline='') as f:

166

reader = clevercsv.reader(f, dialect=clevercsv.excel)

167

data = list(reader)

168

169

# Compare dialects

170

print("Excel dialect:")

171

excel_simple = clevercsv.SimpleDialect.from_csv_dialect(clevercsv.excel)

172

print(f" Delimiter: '{excel_simple.delimiter}'")

173

print(f" Quote char: '{excel_simple.quotechar}'")

174

175

print("Unix dialect:")

176

unix_simple = clevercsv.SimpleDialect.from_csv_dialect(clevercsv.unix_dialect)

177

print(f" Delimiter: '{unix_simple.delimiter}'")

178

print(f" Quote char: '{unix_simple.quotechar}'")

179

print(f" Escape char: '{unix_simple.escapechar}'")

180

```

181

182

### Configuration Utilities

183

184

Utility functions for managing CSV parsing configuration and field size limits.

185

186

```python { .api }

187

def field_size_limit(*args, **kwargs) -> int:

188

"""

189

Get or set the field size limit for CSV parsing.

190

191

Parameters:

192

- limit (optional): New field size limit in characters

193

194

Returns:

195

Previous field size limit

196

197

Raises:

198

TypeError: If limit is not an integer or too many arguments provided

199

200

Notes:

201

- Default limit is 128KB (131,072 characters)

202

- Setting limit to 0 removes the limit (use with caution)

203

- Large limits may impact performance and memory usage

204

"""

205

```

206

207

#### Usage Examples

208

209

```python

210

import clevercsv

211

212

# Get current field size limit

213

current_limit = clevercsv.field_size_limit()

214

print(f"Current field size limit: {current_limit} characters")

215

216

# Set new field size limit

217

old_limit = clevercsv.field_size_limit(256 * 1024) # 256KB

218

print(f"Previous limit: {old_limit}, New limit: {clevercsv.field_size_limit()}")

219

220

# Remove field size limit (use with caution)

221

clevercsv.field_size_limit(0)

222

print("Field size limit removed")

223

224

# Restore reasonable limit

225

clevercsv.field_size_limit(128 * 1024) # 128KB default

226

```

227

228

## Advanced Dialect Management

229

230

### Custom Dialect Creation

231

232

Create specialized dialects for unique CSV formats:

233

234

```python

235

import clevercsv

236

237

def create_pipe_separated_dialect():

238

"""Create dialect for pipe-separated values."""

239

return clevercsv.SimpleDialect('|', '"', '\\')

240

241

def create_tab_separated_no_quotes():

242

"""Create dialect for tab-separated without quotes."""

243

return clevercsv.SimpleDialect('\t', '', '')

244

245

def create_semicolon_single_quotes():

246

"""Create dialect for semicolon-separated with single quotes."""

247

return clevercsv.SimpleDialect(';', "'", '')

248

249

# Usage

250

pipe_dialect = create_pipe_separated_dialect()

251

with open('pipe_data.csv', 'r', newline='') as f:

252

reader = clevercsv.reader(f, dialect=pipe_dialect)

253

data = list(reader)

254

```

255

256

### Dialect Comparison and Analysis

257

258

Compare and analyze different dialects:

259

260

```python

261

import clevercsv

262

263

def compare_dialects(file_path, dialects):

264

"""Compare how different dialects parse the same file."""

265

266

results = {}

267

268

with open(file_path, 'r', newline='') as f:

269

sample = f.read(1000) # First 1000 characters

270

271

for name, dialect in dialects.items():

272

try:

273

# Parse sample with this dialect

274

rows = list(clevercsv.parse_string(sample, dialect))

275

results[name] = {

276

'rows': len(rows),

277

'columns': len(rows[0]) if rows else 0,

278

'sample_row': rows[0] if rows else []

279

}

280

except Exception as e:

281

results[name] = {'error': str(e)}

282

283

return results

284

285

# Usage

286

dialects = {

287

'comma': clevercsv.SimpleDialect(',', '"', ''),

288

'semicolon': clevercsv.SimpleDialect(';', '"', ''),

289

'pipe': clevercsv.SimpleDialect('|', '"', ''),

290

'tab': clevercsv.SimpleDialect('\t', '"', '')

291

}

292

293

comparison = compare_dialects('ambiguous.csv', dialects)

294

for name, result in comparison.items():

295

print(f"{name}: {result}")

296

```

297

298

### Dialect Persistence

299

300

Save and load dialect configurations:

301

302

```python

303

import clevercsv

304

import json

305

306

class DialectManager:

307

"""Manage dialect configurations with persistence."""

308

309

def __init__(self, config_file='dialects.json'):

310

self.config_file = config_file

311

self.dialects = {}

312

self.load_dialects()

313

314

def save_dialect(self, name, dialect):

315

"""Save a dialect configuration."""

316

self.dialects[name] = dialect.to_dict()

317

self._save_to_file()

318

319

def load_dialect(self, name):

320

"""Load a dialect configuration."""

321

if name in self.dialects:

322

return clevercsv.SimpleDialect.from_dict(self.dialects[name])

323

return None

324

325

def list_dialects(self):

326

"""List all saved dialects."""

327

return list(self.dialects.keys())

328

329

def delete_dialect(self, name):

330

"""Delete a dialect configuration."""

331

if name in self.dialects:

332

del self.dialects[name]

333

self._save_to_file()

334

335

def load_dialects(self):

336

"""Load dialects from file."""

337

try:

338

with open(self.config_file, 'r') as f:

339

self.dialects = json.load(f)

340

except FileNotFoundError:

341

self.dialects = {}

342

343

def _save_to_file(self):

344

"""Save dialects to file."""

345

with open(self.config_file, 'w') as f:

346

json.dump(self.dialects, f, indent=2)

347

348

# Usage

349

manager = DialectManager()

350

351

# Save custom dialects

352

custom_dialect = clevercsv.SimpleDialect('|', "'", '\\')

353

manager.save_dialect('pipe_single_quote', custom_dialect)

354

355

# Load and use saved dialect

356

loaded_dialect = manager.load_dialect('pipe_single_quote')

357

if loaded_dialect:

358

with open('data.csv', 'r', newline='') as f:

359

reader = clevercsv.reader(f, dialect=loaded_dialect)

360

data = list(reader)

361

```

362

363

## Dialect Detection Integration

364

365

### Combining Detection and Configuration

366

367

Try automatic dialect detection first, and fall back to a list of common dialects when detection fails:

368

369

```python

370

import clevercsv

371

372

def smart_csv_processing(file_path):

373

"""Process CSV with detection fallback to configuration."""

374

375

# Try automatic detection first

376

detected_dialect = clevercsv.detect_dialect(file_path)

377

378

if detected_dialect:

379

print(f"Using detected dialect: {detected_dialect}")

380

dialect = detected_dialect

381

else:

382

# Fallback to common dialects

383

print("Detection failed, trying common dialects...")

384

385

common_dialects = [

386

clevercsv.SimpleDialect(',', '"', ''), # Standard CSV

387

clevercsv.SimpleDialect(';', '"', ''), # European CSV

388

clevercsv.SimpleDialect('\t', '"', ''), # Tab-separated

389

clevercsv.SimpleDialect('|', '"', ''), # Pipe-separated

390

]

391

392

dialect = None

393

for test_dialect in common_dialects:

394

try:

395

with open(file_path, 'r', newline='') as f:

396

reader = clevercsv.reader(f, dialect=test_dialect)

397

first_row = next(reader)

398

if len(first_row) > 1: # Reasonable number of columns

399

dialect = test_dialect

400

print(f"Using fallback dialect: {dialect}")

401

break

402

except:

403

continue

404

405

if not dialect:

406

raise ValueError("Could not determine appropriate dialect")

407

408

# Process file with determined dialect

409

with open(file_path, 'r', newline='') as f:

410

reader = clevercsv.reader(f, dialect=dialect)

411

return list(reader)

412

413

# Usage

414

try:

415

data = smart_csv_processing('difficult_file.csv')

416

print(f"Successfully processed {len(data)} rows")

417

except ValueError as e:

418

print(f"Processing failed: {e}")

419

```

420

421

### Dialect Validation and Testing

422

423

Validate dialects against actual CSV files:

424

425

```python

426

import clevercsv

427

428

def validate_dialect_for_file(file_path, dialect):

429

"""Validate that a dialect works correctly for a file."""

430

431

validation_results = {

432

'valid': True,

433

'issues': [],

434

'statistics': {}

435

}

436

437

try:

438

with open(file_path, 'r', newline='') as f:

439

reader = clevercsv.reader(f, dialect=dialect)

440

rows = list(reader)

441

442

if not rows:

443

validation_results['valid'] = False

444

validation_results['issues'].append('No rows parsed')

445

return validation_results

446

447

# Check for consistent column count

448

column_counts = [len(row) for row in rows]

449

unique_counts = set(column_counts)

450

451

if len(unique_counts) > 1:

452

validation_results['issues'].append(

453

f'Inconsistent column counts: {sorted(unique_counts)}'

454

)

455

456

# Gather statistics

457

validation_results['statistics'] = {

458

'total_rows': len(rows),

459

'column_counts': dict(zip(*zip(*[(c, column_counts.count(c)) for c in unique_counts]))),

460

'average_columns': sum(column_counts) / len(column_counts),

461

'max_field_length': max(len(field) for row in rows for field in row) if rows else 0

462

}

463

464

except Exception as e:

465

validation_results['valid'] = False

466

validation_results['issues'].append(f'Parsing error: {str(e)}')

467

468

return validation_results

469

470

# Usage

471

test_dialect = clevercsv.SimpleDialect(',', '"', '')

472

results = validate_dialect_for_file('test.csv', test_dialect)

473

474

if results['valid']:

475

print("Dialect validation passed")

476

print(f"Statistics: {results['statistics']}")

477

else:

478

print("Dialect validation failed")

479

print(f"Issues: {results['issues']}")

480

```