or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

algorithm-configuration.md, core-sorting.md, index-sorting.md, index.md, key-generation.md, utilities.md

utilities.mddocs/

0

# Utilities and Text Processing

1

2

Utility functions for text processing, decoding, regular expression selection, and command-line interface functionality. These functions provide supporting capabilities for the core natsort functionality.

3

4

## Capabilities

5

6

### Text Decoding Functions

7

8

Functions for handling text encoding and decoding, particularly useful when working with mixed string and bytes data.

9

10

```python { .api }

11

def decoder(encoding):

12

"""

13

Return a function that can be used to decode bytes to unicode.

14

15

Parameters:

16

- encoding: str - The codec to use for decoding (must be a valid Unicode codec name)

17

18

Returns:

19

Callable - A function that decodes bytes using the supplied codec

20

21

Examples:

22

>>> decode_utf8 = decoder('utf8')

23

>>> decode_utf8(b'hello') == 'hello'

24

True

25

>>> decode_utf8('already string') == 'already string'

26

True

27

"""

28

```

29

30

```python { .api }

31

def as_ascii(s):

32

"""

33

Function to decode an input with the ASCII codec, or return as-is.

34

35

Parameters:

36

- s: object - Input to potentially decode

37

38

Returns:

39

object - Decoded string if input was bytes, otherwise input unchanged

40

41

Examples:

42

>>> as_ascii(b'hello')

43

'hello'

44

>>> as_ascii('hello')

45

'hello'

46

>>> as_ascii(123)

47

123

48

"""

49

```

50

51

```python { .api }

52

def as_utf8(s):

53

"""

54

Function to decode an input with the UTF-8 codec, or return as-is.

55

56

Parameters:

57

- s: object - Input to potentially decode

58

59

Returns:

60

object - Decoded string if input was bytes, otherwise input unchanged

61

62

Examples:

63

>>> as_utf8(b'hello')

64

'hello'

65

>>> as_utf8('hello')

66

'hello'

67

>>> as_utf8(123)

68

123

69

"""

70

```

71

72

### Function Composition

73

74

Utility for chaining multiple single-argument functions together.

75

76

```python { .api }

77

def chain_functions(functions):

78

"""

79

Chain a list of single-argument functions together and return the composed function.

80

81

Functions are applied in list order, with the output of each function

82

passed as input to the next function.

83

84

Parameters:

85

- functions: Iterable[Callable] - List of single-argument functions to chain

86

87

Returns:

88

Callable - A single argument function that applies all chained functions

89

90

Examples:

91

>>> funcs = [lambda x: x * 4, len, lambda x: x + 5]

92

>>> chained = chain_functions(funcs)

93

>>> chained('hey') # 'hey' -> 'heyheyheyhey' -> 12 -> 17

94

17

95

"""

96

```

97

98

### Regular Expression Utilities

99

100

Function for selecting appropriate regular expressions for number matching based on algorithm settings.

101

102

```python { .api }

103

def numeric_regex_chooser(alg):

104

"""

105

Select an appropriate regex for the type of number of interest.

106

107

Parameters:

108

- alg: ns enum - Algorithm flags indicating the desired number type

109

110

Returns:

111

str - Regular expression string that matches the desired number type

112

113

Examples:

114

>>> numeric_regex_chooser(ns.INT)

115

r'(\d+|[unicode_digits])'

116

>>> numeric_regex_chooser(ns.FLOAT | ns.SIGNED)

117

r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|[unicode_numeric])'

118

"""

119

```

120

121

### Command Line Interface

122

123

Main entry point for the natsort command-line tool.

124

125

```python { .api }

126

def main(*arguments):

127

"""

128

Performs a natural sort on entries given on the command-line.

129

130

Entry point for the 'natsort' console script. Provides command-line

131

access to natural sorting with various options for number types,

132

filtering, and output formatting.

133

134

Parameters:

135

- *arguments: str - Command line arguments (uses sys.argv if not provided)

136

137

Command Line Options:

138

- -p, --paths: Interpret input as file paths

139

- -f, --filter LOW HIGH: Keep entries with numbers in range

140

- -F, --reverse-filter LOW HIGH: Exclude entries with numbers in range

141

- -e, --exclude NUMBER: Exclude entries containing specific number

142

- -r, --reverse: Return results in reversed order

143

- -t, --number-type {int,float,real}: Choose number interpretation

144

- -s, --sign: Consider +/- as part of numbers

145

- --noexp: Don't parse scientific notation

146

- -l, --locale: Use locale-aware sorting

147

148

Examples:

149

# Sort lines from stdin

150

$ echo -e "item10\nitem2\nitem1" | natsort

151

152

# Sort file paths

153

$ natsort --paths file10.txt file2.txt file1.txt

154

155

# Sort with real numbers and filtering

156

$ natsort --number-type real --filter -5 10 data.txt

157

"""

158

```

159

160

## Usage Examples

161

162

### Text Decoding with Mixed Data

163

164

```python

165

from natsort import natsorted, decoder, as_utf8, as_ascii

166

167

# Mixed bytes and string data

168

mixed_data = [b'file10.txt', 'file2.txt', b'file1.txt', 'file20.txt']

169

170

# Method 1: Using decoder function

171

utf8_decoder = decoder('utf-8')

172

decoded_data = [utf8_decoder(item) for item in mixed_data]

173

sorted_decoded = natsorted(decoded_data)

174

print(f"Decoded and sorted: {sorted_decoded}")

175

176

# Method 2: Using as_utf8 directly in key function

177

sorted_mixed = natsorted(mixed_data, key=as_utf8)

178

print(f"Sorted with UTF-8 key: {sorted_mixed}")

179

180

# Method 3: Using as_ascii for ASCII-only data

181

ascii_mixed = [b'fileA.txt', 'fileB.txt', b'file1.txt']

182

sorted_ascii = natsorted(ascii_mixed, key=as_ascii)

183

print(f"Sorted with ASCII key: {sorted_ascii}")

184

```

185

186

### Function Chaining for Complex Transformations

187

188

```python

189

from natsort import natsorted, chain_functions

190

from pathlib import Path

191

192

# File paths that need complex preprocessing

193

file_paths = [

194

'/home/user/Documents/Project_v1.10.txt',

195

'/home/user/Documents/Project_v1.2.txt',

196

'/var/log/system_log_v2.1.txt',

197

'/tmp/temp_file_v1.0.txt'

198

]

199

200

# Chain of transformations: Path -> filename -> lowercase -> remove extension

201

transform_chain = chain_functions([

202

lambda x: Path(x).name, # Get filename only

203

lambda x: x.lower(), # Convert to lowercase

204

lambda x: x.rsplit('.', 1)[0] # Remove extension

205

])

206

207

# Sort using the chained transformation

208

sorted_files = natsorted(file_paths, key=transform_chain)

209

print("Sorted by transformed filename:")

210

for original, sorted_path in zip(file_paths, sorted_files):

211

transformed = transform_chain(original)

212

print(f" {original} -> '{transformed}'")

213

```

214

215

### Regular Expression Exploration

216

217

```python

218

from natsort import numeric_regex_chooser, ns

219

import re

220

221

# Explore different regex patterns for number matching

222

algorithms = [

223

('INT (default)', ns.INT),

224

('FLOAT', ns.FLOAT),

225

('SIGNED', ns.SIGNED),

226

('REAL (FLOAT|SIGNED)', ns.REAL),

227

('FLOAT without exponents', ns.FLOAT | ns.NOEXP)

228

]

229

230

test_string = "item-1.5e+3_version2.10_beta"

231

232

print("Regular expression patterns and matches:")

233

for name, alg in algorithms:

234

pattern = numeric_regex_chooser(alg)

235

matches = re.findall(pattern, test_string)

236

print(f"{name:25}: {pattern}")

237

print(f"{'':25} Matches: {matches}")

238

print()

239

```

240

241

### Command Line Interface Usage

242

243

```python

244

# Examples of using the natsort command-line interface

245

246

# Note: These would be run from the command line, not in Python

247

248

"""

249

# Basic usage - sort lines from a file

250

$ cat data.txt

251

item10

252

item2

253

item1

254

item20

255

256

$ natsort data.txt

257

item1

258

item2

259

item10

260

item20

261

262

# Sort file paths

263

$ natsort --paths folder/file10.txt folder/file2.txt folder/file1.txt

264

folder/file1.txt

265

folder/file2.txt

266

folder/file10.txt

267

268

# Sort with real numbers and reverse order

269

$ echo -e "val-1.5\nval2.3\nval-0.8" | natsort --number-type real --reverse

270

val2.3

271

val-0.8

272

val-1.5

273

274

# Filter by numeric range

275

$ echo -e "item1\nitem25\nitem5\nitem30" | natsort --filter 1 10

276

item1

277

item5

278

279

# Exclude specific numbers

280

$ echo -e "test1\ntest2\ntest3\ntest10" | natsort --exclude 2

281

test1

282

test3

283

test10

284

285

# Locale-aware sorting (results depend on system locale)

286

$ echo -e "café\nnaive\nresume" | natsort --locale

287

"""

288

289

# Programmatic access to CLI functionality

290

from natsort.__main__ import main

291

import sys

292

from io import StringIO

293

294

# Capture stdout to test CLI functionality

295

old_stdout = sys.stdout

296

sys.stdout = captured_output = StringIO()

297

298

try:

299

# Simulate command line arguments

300

main('--number-type', 'real', '--reverse')

301

# Note: This would normally read from stdin

302

except SystemExit:

303

pass # CLI exits normally

304

305

# Restore stdout

306

sys.stdout = old_stdout

307

output = captured_output.getvalue()

308

```

309

310

### Advanced Text Processing Patterns

311

312

```python

313

from natsort import natsorted, chain_functions, as_utf8

314

import unicodedata

315

316

# Complex text processing for international data

317

international_files = [

318

'Résumé_v1.10.pdf',

319

'résumé_v1.2.pdf',

320

'NAÏVE_algorithm_v2.1.txt',

321

'naïve_algorithm_v1.0.txt'

322

]

323

324

# Create a complex processing chain

325

def normalize_unicode(text):

326

"""Decompose text to Unicode NFD form, splitting accents into separate combining marks."""

327

return unicodedata.normalize('NFD', text)

328

329

def remove_accents(text):

330

"""Remove combining accent marks (characters in Unicode category 'Mn')."""

331

return ''.join(c for c in text if unicodedata.category(c) != 'Mn')

332

333

# Chain transformations: decode -> normalize -> remove accents -> lowercase

334

text_processor = chain_functions([

335

as_utf8, # Ensure proper string type

336

normalize_unicode, # Normalize unicode representation

337

remove_accents, # Remove accent marks

338

lambda x: x.lower() # Convert to lowercase

339

])

340

341

# Sort using processed text as key

342

sorted_international = natsorted(international_files, key=text_processor)

343

344

print("Original -> Processed key:")

345

for filename in international_files:

346

processed = text_processor(filename)

347

print(f" {filename} -> {processed}")

348

349

print(f"\nSorted order: {sorted_international}")

350

```

351

352

### Integration with Data Processing Pipelines

353

354

```python

355

from natsort import natsorted, chain_functions

356

import json

357

from pathlib import Path

358

359

# Simulate a data processing pipeline

360

def process_log_files(directory):

361

"""Process log files in natural order."""

362

363

# Get all log files

364

log_files = list(Path(directory).glob('*.log'))

365

366

# Create key function for sorting: filename without extension, naturally

367

filename_key = chain_functions([

368

lambda x: x.stem, # Get filename without extension

369

str.lower # Case-insensitive

370

])

371

372

# Sort files naturally

373

sorted_files = natsorted(log_files, key=filename_key)

374

375

results = []

376

for log_file in sorted_files:

377

# Process each file (simulated)

378

file_info = {

379

'filename': log_file.name,

380

'size': log_file.stat().st_size if log_file.exists() else 0,

381

'processed': True

382

}

383

results.append(file_info)

384

385

return results

386

387

# Example usage (would work with real directory)

388

# results = process_log_files('/var/log/')

389

# print(json.dumps(results, indent=2))

390

```