or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli-commands.md comparison.md consensus.md format-conversion.md genbank-tbl.md gff-processing.md index.md sequence-operations.md utilities.md

docs/utilities.md

0

# Utilities and Validation

1

2

Comprehensive file-handling utilities, data-validation functions, and annotation-statistics calculation. Includes support for compressed formats, flexible I/O operations, and robust error handling.

3

4

## Capabilities

5

6

### File I/O Operations

7

8

Advanced file handling with automatic compression detection and support for gzip (.gz), bzip2 (.bz2), and xz (.xz) compressed files.

9

10

```python { .api }

11

def zopen(filename, mode="r", buff=1024*1024, external=PARALLEL):

12

"""

13

Open files with automatic compression support.

14

15

Parameters:

16

- filename (str): Path to file (supports .gz, .bz2, .xz)

17

- mode (str): File opening mode ("r", "w", "a")

18

- buff (int): Buffer size for reading

19

- external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)

20

21

Returns:

22

file-like: File handle for reading/writing

23

"""

24

25

def open_pipe(command, mode="r", buff=1024*1024):

26

"""

27

Open command as pipe for reading/writing.

28

29

Parameters:

30

- command (str): Shell command to execute

31

- mode (str): Pipe mode ("r" or "w")

32

- buff (int): Buffer size

33

34

Returns:

35

file-like: Pipe handle

36

"""

37

38

def open_gz(filename, mode="r", buff=1024*1024, external=PARALLEL):

39

"""

40

Open gzipped files with optional external tool support.

41

42

Parameters:

43

- filename (str): Path to .gz file

44

- mode (str): File opening mode

45

- buff (int): Buffer size

46

- external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)

47

48

Returns:

49

file-like: Gzipped file handle

50

"""

51

52

def open_bz2(filename, mode="r", buff=1024*1024, external=PARALLEL):

53

"""

54

Open bz2 compressed files.

55

56

Parameters:

57

- filename (str): Path to .bz2 file

58

- mode (str): File opening mode

59

- buff (int): Buffer size

60

- external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)

61

62

Returns:

63

file-like: Bz2 file handle

64

"""

65

66

def open_xz(filename, mode="r", buff=1024*1024, external=PARALLEL):

67

"""

68

Open xz compressed files.

69

70

Parameters:

71

- filename (str): Path to .xz file

72

- mode (str): File opening mode

73

- buff (int): Buffer size

74

- external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)

75

76

Returns:

77

file-like: Xz file handle

78

"""

79

```

80

81

### File Validation

82

83

Validate file existence, format, and properties before processing.

84

85

```python { .api }

86

def check_inputs(inputs):

87

"""

88

Validate that input files exist and are accessible.

89

90

Parameters:

91

- inputs (list): List of file paths to check

92

93

Returns:

94

bool: True if all files exist, raises exception otherwise

95

"""

96

97

def is_file(f):

98

"""

99

Check if file exists and is readable.

100

101

Parameters:

102

- f (str): File path to check

103

104

Returns:

105

bool: True if file exists and is readable

106

"""

107

108

def is_gzipped(filepath):

109

"""

110

Check if file is gzipped by examining magic bytes.

111

112

Parameters:

113

- filepath (str): Path to file

114

115

Returns:

116

bool: True if file is gzipped

117

"""

118

119

def is_text_file(filepath):

120

"""

121

Check if file contains text data.

122

123

Parameters:

124

- filepath (str): Path to file

125

126

Returns:

127

bool: True if file appears to be text

128

"""

129

130

def check_file_type(filepath):

131

"""

132

Determine file type (text/gzipped/binary).

133

134

Parameters:

135

- filepath (str): Path to file

136

137

Returns:

138

str: File type ("text", "gzipped", "binary")

139

"""

140

```

141

142

### System Utilities

143

144

System-level utilities for program discovery and path resolution.

145

146

```python { .api }

147

def which2(program):

148

"""

149

Find program executable in system PATH.

150

151

Parameters:

152

- program (str): Program name to search for

153

154

Returns:

155

str|None: Full path to executable or None if not found

156

"""

157

```

158

159

### Data Processing

160

161

Process and filter annotation data using flexible patterns.

162

163

```python { .api }

164

def filter_annotations(annotations, grep=None, grepv=None):

165

"""

166

Filter annotation dictionary using regex patterns.

167

168

Parameters:

169

- annotations (dict): Annotation dictionary to filter

170

- grep (list): Patterns to keep (include matches)

171

- grepv (list): Patterns to exclude (remove matches)

172

173

Returns:

174

dict: Filtered annotation dictionary

175

"""

176

177

def readBlocks(source, pattern):

178

"""

179

Read file in blocks separated by pattern.

180

181

Parameters:

182

- source (str): File path or file handle

183

- pattern (str): Regex pattern for block separation

184

185

Yields:

186

str: Text blocks between pattern matches

187

"""

188

189

def readBlocks2(source, startpattern, endpattern):

190

"""

191

Read file in blocks defined by start and end patterns.

192

193

Parameters:

194

- source (str): File path or file handle

195

- startpattern (str): Regex pattern for block start

196

- endpattern (str): Regex pattern for block end

197

198

Yields:

199

str: Text blocks between start and end patterns

200

"""

201

```

202

203

### Annotation Statistics

204

205

Calculate comprehensive statistics from annotation data.

206

207

```python { .api }

208

def annotation_stats(Genes):

209

"""

210

Calculate comprehensive annotation statistics.

211

212

Parameters:

213

- Genes (dict): Annotation dictionary to analyze

214

215

Returns:

216

dict: Statistics including:

217

- gene_count: Total number of genes

218

- transcript_count: Total number of transcripts

219

- avg_transcripts_per_gene: Average transcripts per gene

220

- protein_coding_genes: Number of protein-coding genes

221

- functional_annotation_counts: GO terms, EC numbers, etc.

222

- exon_statistics: Average exon counts and lengths

223

- intron_statistics: Average intron counts and lengths

224

- strand_distribution: Plus/minus strand counts

225

- contig_distribution: Genes per chromosome/contig

226

"""

227

```

228

229

### Constants

230

231

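File-opening mode constants. These are the values accepted by the `external` parameter of `zopen`, `open_gz`, `open_bz2`, and `open_xz`, and they select how compressed files are opened.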

232

233

```python { .api }

234

NORMAL = 0 # Standard file opening

235

PROCESS = 1 # Process-based file opening

236

PARALLEL = 2 # Parallel file processing mode

237

```

238

239

## Usage Examples

240

241

### File Operations

242

243

```python

244

from gfftk.utils import zopen, check_inputs, is_gzipped

245

246

# Check files before processing

247

input_files = ["annotation.gff3", "genome.fasta.gz", "proteins.faa"]

248

if check_inputs(input_files):

249

print("All input files found")

250

251

# Open files with automatic compression handling

252

with zopen("large_annotation.gff3.gz", "r") as f:

253

for line in f:

254

if line.startswith("##"):

255

continue

256

# Process GFF3 lines

257

258

# Check file properties

259

if is_gzipped("genome.fasta.gz"):

260

print("Genome file is compressed")

261

```

262

263

### Annotation Filtering

264

265

```python

266

from gfftk.utils import filter_annotations

267

from gfftk.gff import gff2dict

268

269

# Load annotation

270

annotation = gff2dict("annotation.gff3", "genome.fasta")

271

272

# Filter for kinase genes (case-insensitive)

273

kinases = filter_annotations(

274

annotation,

275

grep=["product:kinase:i"]

276

)

277

278

# Remove pseudogenes and keep only protein-coding

279

filtered = filter_annotations(

280

annotation,

281

grep=["type:mRNA"],

282

grepv=["product:pseudogene", "note:partial"]

283

)

284

285

print(f"Found {len(kinases)} kinase genes")

286

print(f"Filtered to {len(filtered)} protein-coding genes")

287

```

288

289

### Statistics Calculation

290

291

```python

292

from gfftk.utils import annotation_stats

293

from gfftk.gff import gff2dict

294

295

# Load annotation data

296

annotation = gff2dict("annotation.gff3", "genome.fasta")

297

298

# Calculate comprehensive statistics

299

stats = annotation_stats(annotation)

300

301

print(f"Genome Annotation Statistics:")

302

print(f"Total genes: {stats['gene_count']}")

303

print(f"Total transcripts: {stats['transcript_count']}")

304

print(f"Avg transcripts per gene: {stats['avg_transcripts_per_gene']:.2f}")

305

print(f"Protein-coding genes: {stats['protein_coding_genes']}")

306

307

if 'functional_annotation_counts' in stats:

308

func_stats = stats['functional_annotation_counts']

309

print(f"Genes with GO terms: {func_stats.get('go_terms', 0)}")

310

print(f"Genes with EC numbers: {func_stats.get('ec_numbers', 0)}")

311

312

if 'strand_distribution' in stats:

313

strand_stats = stats['strand_distribution']

314

print(f"Plus strand genes: {strand_stats.get('+', 0)}")

315

print(f"Minus strand genes: {strand_stats.get('-', 0)}")

316

```

317

318

### Block Reading

319

320

```python

321

from gfftk.utils import readBlocks, readBlocks2

322

323

# Read FASTA file by sequences

324

for sequence_block in readBlocks("genome.fasta", r"^>"):

325

lines = sequence_block.strip().split('\n')

326

if lines:

327

header = lines[0]

328

sequence = ''.join(lines[1:])

329

print(f"Sequence: {header}, Length: {len(sequence)}")

330

331

# Read structured file with start/end markers

332

for block in readBlocks2("structured_data.txt", r"^START", r"^END"):

333

# Process data between START and END markers

334

process_data_block(block)

335

```

336

337

### System Integration

338

339

```python

340

from gfftk.utils import which2, open_pipe

341

342

# Check for external tools

343

if which2("blastp"):

344

print("BLAST+ is available")

345

346

if which2("diamond"):

347

print("Diamond is available for faster searches")

348

349

# Use external tools via pipes

350

with open_pipe("grep '^>' genome.fasta | wc -l", "r") as p:

351

sequence_count = int(p.read().strip())

352

print(f"Genome has {sequence_count} sequences")

353

```

354

355

### Comprehensive File Processing Pipeline

356

357

```python

358

from gfftk.utils import zopen, filter_annotations, annotation_stats

359

from gfftk.gff import gff2dict

360

import os

361

362

def process_annotation_files(input_dir, output_dir, filters=None):

363

"""Process multiple annotation files with filtering and statistics."""

364

365

os.makedirs(output_dir, exist_ok=True)

366

results = {}

367

368

for filename in os.listdir(input_dir):

369

if filename.endswith(('.gff3', '.gff3.gz')):

370

print(f"Processing {filename}...")

371

372

# Load annotation

373

input_path = os.path.join(input_dir, filename)

374

genome_path = os.path.join(input_dir, "genome.fasta")

375

376

annotation = gff2dict(input_path, genome_path)

377

378

# Apply filters if provided

379

if filters:

380

annotation = filter_annotations(

381

annotation,

382

grep=filters.get('grep'),

383

grepv=filters.get('grepv')

384

)

385

386

# Calculate statistics

387

stats = annotation_stats(annotation)

388

389

# Write filtered annotation

390

base_name = filename.replace('.gz', '').replace('.gff3', '')

391

output_path = os.path.join(output_dir, f"{base_name}_filtered.gff3")

392

393

from gfftk.gff import dict2gff3

394

dict2gff3(annotation, output=output_path)

395

396

results[filename] = {

397

'stats': stats,

398

'output_file': output_path

399

}

400

401

return results

402

403

# Example usage

404

filters = {

405

'grep': ['type:mRNA'], # Keep only mRNA features

406

'grepv': ['product:hypothetical'] # Remove hypothetical proteins

407

}

408

409

results = process_annotation_files(

410

input_dir="raw_annotations/",

411

output_dir="filtered_annotations/",

412

filters=filters

413

)

414

```

415

416

## Types

417

418

```python { .api }

419

# File opening modes

420

FileOpeningMode = int # NORMAL, PROCESS, or PARALLEL

421

422

# File type detection result

423

FileType = str # "text", "gzipped", "binary"

424

425

# Filter pattern for annotations

426

FilterPattern = str # Format: "key:pattern" or "key:pattern:flags"

427

428

# Annotation statistics structure

429

AnnotationStats = {

430

"gene_count": int,

431

"transcript_count": int,

432

"avg_transcripts_per_gene": float,

433

"protein_coding_genes": int,

434

"pseudogenes": int,

435

"functional_annotation_counts": dict,

436

"exon_statistics": dict,

437

"intron_statistics": dict,

438

"strand_distribution": dict,

439

"contig_distribution": dict,

440

"length_statistics": dict

441

}

442

443

# Functional annotation counts

444

FunctionalStats = {

445

"go_terms": int,

446

"ec_numbers": int,

447

"db_xrefs": int,

448

"product_descriptions": int,

449

"gene_names": int

450

}

451

452

# Structural statistics

453

StructuralStats = {

454

"avg_exons_per_transcript": float,

455

"avg_exon_length": float,

456

"avg_introns_per_transcript": float,

457

"avg_intron_length": float,

458

"avg_cds_length": float,

459

"avg_protein_length": float

460

}

461

462

# Block reading generator type

463

BlockGenerator = Iterator[str] # Generator yielding text blocks

464

```