Tessl Tile for pypi/gfftk@25.6.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cli-commands.md comparison.md consensus.md format-conversion.md genbank-tbl.md gff-processing.md index.md sequence-operations.md utilities.md

utilities.md docs/

0
# Utilities and Validation
1

2
This module provides file-handling utilities, data-validation functions, and annotation-statistics calculation, with support for compressed formats, flexible I/O operations, and robust error handling.
3

4
## Capabilities
5

6
### File I/O Operations
7

8
Advanced file handling with automatic compression detection and support for various formats.
9

10
```python { .api }
11
def zopen(filename, mode="r", buff=1024*1024, external=PARALLEL):
12
    """
13
    Open files with automatic compression support.
14

15
    Parameters:
16
    - filename (str): Path to file (supports .gz, .bz2, .xz)
17
    - mode (str): File opening mode ("r", "w", "a")
18
    - buff (int): Buffer size for reading
19
    - external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)
20

21
    Returns:
22
    file-like: File handle for reading/writing
23
    """
24

25
def open_pipe(command, mode="r", buff=1024*1024):
26
    """
27
    Open command as pipe for reading/writing.
28

29
    Parameters:
30
    - command (str): Shell command to execute
31
    - mode (str): Pipe mode ("r" or "w")
32
    - buff (int): Buffer size
33

34
    Returns:
35
    file-like: Pipe handle
36
    """
37

38
def open_gz(filename, mode="r", buff=1024*1024, external=PARALLEL):
39
    """
40
    Open gzipped files with optional external tool support.
41

42
    Parameters:
43
    - filename (str): Path to .gz file
44
    - mode (str): File opening mode
45
    - buff (int): Buffer size
46
    - external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)
47

48
    Returns:
49
    file-like: Gzipped file handle
50
    """
51

52
def open_bz2(filename, mode="r", buff=1024*1024, external=PARALLEL):
53
    """
54
    Open bz2 compressed files.
55

56
    Parameters:
57
    - filename (str): Path to .bz2 file
58
    - mode (str): File opening mode
59
    - buff (int): Buffer size
60
    - external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)
61

62
    Returns:
63
    file-like: Bz2 file handle
64
    """
65

66
def open_xz(filename, mode="r", buff=1024*1024, external=PARALLEL):
67
    """
68
    Open xz compressed files.
69

70
    Parameters:
71
    - filename (str): Path to .xz file
72
    - mode (str): File opening mode
73
    - buff (int): Buffer size
74
    - external (int): Compression tool selection (NORMAL=0, PROCESS=1, PARALLEL=2)
75

76
    Returns:
77
    file-like: Xz file handle
78
    """
79
```
80

81
### File Validation
82

83
Validate file existence, format, and properties before processing.
84

85
```python { .api }
86
def check_inputs(inputs):
87
    """
88
    Validate that input files exist and are accessible.
89

90
    Parameters:
91
    - inputs (list): List of file paths to check
92

93
    Returns:
94
    bool: True if all files exist; raises an exception otherwise
95
    """
96

97
def is_file(f):
98
    """
99
    Check if file exists and is readable.
100

101
    Parameters:
102
    - f (str): File path to check
103

104
    Returns:
105
    bool: True if file exists and is readable
106
    """
107

108
def is_gzipped(filepath):
109
    """
110
    Check if file is gzipped by examining magic bytes.
111

112
    Parameters:
113
    - filepath (str): Path to file
114

115
    Returns:
116
    bool: True if file is gzipped
117
    """
118

119
def is_text_file(filepath):
120
    """
121
    Check if file contains text data.
122

123
    Parameters:
124
    - filepath (str): Path to file
125

126
    Returns:
127
    bool: True if file appears to be text
128
    """
129

130
def check_file_type(filepath):
131
    """
132
    Determine file type (text/gzipped/binary).
133

134
    Parameters:
135
    - filepath (str): Path to file
136

137
    Returns:
138
    str: File type ("text", "gzipped", "binary")
139
    """
140
```
141

142
### System Utilities
143

144
System-level utilities for program discovery and path resolution.
145

146
```python { .api }
147
def which2(program):
148
    """
149
    Find program executable in system PATH.
150

151
    Parameters:
152
    - program (str): Program name to search for
153

154
    Returns:
155
    str|None: Full path to executable or None if not found
156
    """
157
```
158

159
### Data Processing
160

161
Process and filter annotation data using flexible patterns.
162

163
```python { .api }
164
def filter_annotations(annotations, grep=None, grepv=None):
165
    """
166
    Filter annotation dictionary using regex patterns.
167

168
    Parameters:
169
    - annotations (dict): Annotation dictionary to filter
170
    - grep (list): Patterns to keep (include matches)
171
    - grepv (list): Patterns to exclude (remove matches)
172

173
    Returns:
174
    dict: Filtered annotation dictionary
175
    """
176

177
def readBlocks(source, pattern):
178
    """
179
    Read file in blocks separated by pattern.
180

181
    Parameters:
182
    - source (str or file-like): File path or open file handle
183
    - pattern (str): Regex pattern for block separation
184

185
    Yields:
186
    str: Text blocks between pattern matches
187
    """
188

189
def readBlocks2(source, startpattern, endpattern):
190
    """
191
    Read file in blocks defined by start and end patterns.
192

193
    Parameters:
194
    - source (str or file-like): File path or open file handle
195
    - startpattern (str): Regex pattern for block start
196
    - endpattern (str): Regex pattern for block end
197

198
    Yields:
199
    str: Text blocks between start and end patterns
200
    """
201
```
202

203
### Annotation Statistics
204

205
Calculate comprehensive statistics from annotation data.
206

207
```python { .api }
208
def annotation_stats(Genes):
209
    """
210
    Calculate comprehensive annotation statistics.
211

212
    Parameters:
213
    - Genes (dict): Annotation dictionary to analyze
214

215
    Returns:
216
    dict: Statistics including:
217
        - gene_count: Total number of genes
218
        - transcript_count: Total number of transcripts
219
        - avg_transcripts_per_gene: Average transcripts per gene
220
        - protein_coding_genes: Number of protein-coding genes
221
        - functional_annotation_counts: GO terms, EC numbers, etc.
222
        - exon_statistics: Average exon counts and lengths
223
        - intron_statistics: Average intron counts and lengths
224
        - strand_distribution: Plus/minus strand counts
225
        - contig_distribution: Genes per chromosome/contig
226
    """
227
```
228

229
### Constants
230

231
File opening mode constants for different compression handling approaches.
232

233
```python { .api }
234
NORMAL = 0      # Standard file opening
235
PROCESS = 1     # Process-based file opening
236
PARALLEL = 2    # Parallel file processing mode
237
```
238

239
## Usage Examples
240

241
### File Operations
242

243
```python
244
from gfftk.utils import zopen, check_inputs, is_gzipped
245

246
# Check files before processing
247
input_files = ["annotation.gff3", "genome.fasta.gz", "proteins.faa"]
248
if check_inputs(input_files):
249
    print("All input files found")
250

251
# Open files with automatic compression handling
252
with zopen("large_annotation.gff3.gz", "r") as f:
253
    for line in f:
254
        if line.startswith("##"):
255
            continue
256
        # Process GFF3 lines
257

258
# Check file properties
259
if is_gzipped("genome.fasta.gz"):
260
    print("Genome file is compressed")
261
```
262

263
### Annotation Filtering
264

265
```python
266
from gfftk.utils import filter_annotations
267
from gfftk.gff import gff2dict
268

269
# Load annotation
270
annotation = gff2dict("annotation.gff3", "genome.fasta")
271

272
# Filter for kinase genes (case-insensitive)
273
kinases = filter_annotations(
274
    annotation,
275
    grep=["product:kinase:i"]
276
)
277

278
# Remove pseudogenes and keep only protein-coding
279
filtered = filter_annotations(
280
    annotation,
281
    grep=["type:mRNA"],
282
    grepv=["product:pseudogene", "note:partial"]
283
)
284

285
print(f"Found {len(kinases)} kinase genes")
286
print(f"Filtered to {len(filtered)} protein-coding genes")
287
```
288

289
### Statistics Calculation
290

291
```python
292
from gfftk.utils import annotation_stats
293
from gfftk.gff import gff2dict
294

295
# Load annotation data
296
annotation = gff2dict("annotation.gff3", "genome.fasta")
297

298
# Calculate comprehensive statistics
299
stats = annotation_stats(annotation)
300

301
print(f"Genome Annotation Statistics:")
302
print(f"Total genes: {stats['gene_count']}")
303
print(f"Total transcripts: {stats['transcript_count']}")
304
print(f"Avg transcripts per gene: {stats['avg_transcripts_per_gene']:.2f}")
305
print(f"Protein-coding genes: {stats['protein_coding_genes']}")
306

307
if 'functional_annotation_counts' in stats:
308
    func_stats = stats['functional_annotation_counts']
309
    print(f"Genes with GO terms: {func_stats.get('go_terms', 0)}")
310
    print(f"Genes with EC numbers: {func_stats.get('ec_numbers', 0)}")
311

312
if 'strand_distribution' in stats:
313
    strand_stats = stats['strand_distribution']
314
    print(f"Plus strand genes: {strand_stats.get('+', 0)}")
315
    print(f"Minus strand genes: {strand_stats.get('-', 0)}")
316
```
317

318
### Block Reading
319

320
```python
321
from gfftk.utils import readBlocks, readBlocks2
322

323
# Read FASTA file by sequences
324
for sequence_block in readBlocks("genome.fasta", r"^>"):
325
    lines = sequence_block.strip().split('\n')
326
    if lines:
327
        header = lines[0]
328
        sequence = ''.join(lines[1:])
329
        print(f"Sequence: {header}, Length: {len(sequence)}")
330

331
# Read structured file with start/end markers
332
for block in readBlocks2("structured_data.txt", r"^START", r"^END"):
333
    # Process data between START and END markers
334
    process_data_block(block)
335
```
336

337
### System Integration
338

339
```python
340
from gfftk.utils import which2, open_pipe
341

342
# Check for external tools
343
if which2("blastp"):
344
    print("BLAST+ is available")
345

346
if which2("diamond"):
347
    print("Diamond is available for faster searches")
348

349
# Use external tools via pipes
350
with open_pipe("grep '^>' genome.fasta | wc -l", "r") as p:
351
    sequence_count = int(p.read().strip())
352
    print(f"Genome has {sequence_count} sequences")
353
```
354

355
### Comprehensive File Processing Pipeline
356

357
```python
358
from gfftk.utils import zopen, filter_annotations, annotation_stats
359
from gfftk.gff import gff2dict
360
import os
361

362
def process_annotation_files(input_dir, output_dir, filters=None):
363
    """Process multiple annotation files with filtering and statistics."""
364

365
    os.makedirs(output_dir, exist_ok=True)
366
    results = {}
367

368
    for filename in os.listdir(input_dir):
369
        if filename.endswith(('.gff3', '.gff3.gz')):
370
            print(f"Processing {filename}...")
371

372
            # Load annotation
373
            input_path = os.path.join(input_dir, filename)
374
            genome_path = os.path.join(input_dir, "genome.fasta")
375

376
            annotation = gff2dict(input_path, genome_path)
377

378
            # Apply filters if provided
379
            if filters:
380
                annotation = filter_annotations(
381
                    annotation,
382
                    grep=filters.get('grep'),
383
                    grepv=filters.get('grepv')
384
                )
385

386
            # Calculate statistics
387
            stats = annotation_stats(annotation)
388

389
            # Write filtered annotation
390
            base_name = filename.replace('.gz', '').replace('.gff3', '')
391
            output_path = os.path.join(output_dir, f"{base_name}_filtered.gff3")
392

393
            from gfftk.gff import dict2gff3
394
            dict2gff3(annotation, output=output_path)
395

396
            results[filename] = {
397
                'stats': stats,
398
                'output_file': output_path
399
            }
400

401
    return results
402

403
# Example usage
404
filters = {
405
    'grep': ['type:mRNA'],           # Keep only mRNA features
406
    'grepv': ['product:hypothetical'] # Remove hypothetical proteins
407
}
408

409
results = process_annotation_files(
410
    input_dir="raw_annotations/",
411
    output_dir="filtered_annotations/",
412
    filters=filters
413
)
414
```
415

416
## Types
417

418
```python { .api }
419
# File opening modes
420
FileOpeningMode = int  # NORMAL, PROCESS, or PARALLEL
421

422
# File type detection result
423
FileType = str  # "text", "gzipped", "binary"
424

425
# Filter pattern for annotations
426
FilterPattern = str  # Format: "key:pattern" or "key:pattern:flags"
427

428
# Annotation statistics structure
429
AnnotationStats = {
430
    "gene_count": int,
431
    "transcript_count": int,
432
    "avg_transcripts_per_gene": float,
433
    "protein_coding_genes": int,
434
    "pseudogenes": int,
435
    "functional_annotation_counts": dict,
436
    "exon_statistics": dict,
437
    "intron_statistics": dict,
438
    "strand_distribution": dict,
439
    "contig_distribution": dict,
440
    "length_statistics": dict
441
}
442

443
# Functional annotation counts
444
FunctionalStats = {
445
    "go_terms": int,
446
    "ec_numbers": int,
447
    "db_xrefs": int,
448
    "product_descriptions": int,
449
    "gene_names": int
450
}
451

452
# Structural statistics
453
StructuralStats = {
454
    "avg_exons_per_transcript": float,
455
    "avg_exon_length": float,
456
    "avg_introns_per_transcript": float,
457
    "avg_intron_length": float,
458
    "avg_cds_length": float,
459
    "avg_protein_length": float
460
}
461

462
# Block reading generator type
463
BlockGenerator = Iterator[str]  # Generator yielding text blocks
464
```

Version

Tile

Files

utilities.md docs/

utilities.md docs/