or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

data-structures.mdindex.mdio.mdstatistics.mdutilities.md

utilities.mddocs/

0

# Utilities

1

2

Scikit-allel provides essential utilities for data validation, array manipulation, and caching to support genetic analysis workflows.

3

4

## Caching System

5

6

### HDF5 Caching

7

8

Efficiently cache computational results using HDF5 storage.

9

10

```python

11

import allel

12

import numpy as np

13

14

# Use HDF5 cache decorator for functions

15

@allel.hdf5_cache(filepath='analysis_cache.h5', group='/results')

16

def expensive_calculation(genotype_data):

17

g = allel.GenotypeArray(genotype_data)

18

ac = g.count_alleles()

19

return allel.sequence_diversity(positions, ac)

20

21

# Function results are automatically cached

22

result = expensive_calculation(my_genotype_data)

23

```

24

{ .api }

25

26

```python

27

def hdf5_cache(filepath=None, parent=None, group=None, names=None, typed=False,

28

hashed_key=False, **h5dcreate_kwargs):

29

"""

30

HDF5 cache decorator for function results.

31

32

Args:

33

filepath (str, optional): Path to HDF5 file for caching

34

parent (str, optional): Parent group path within HDF5 file

35

group (str, optional): Specific group name for cache

36

names (list, optional): Names for cached results

37

typed (bool): Whether to consider argument types in cache key

38

hashed_key (bool): Whether to hash the cache key

39

**h5dcreate_kwargs: Additional HDF5 dataset creation arguments

40

41

Returns:

42

decorator: Function decorator that caches results

43

"""

44

```

45

{ .api }

46

47

## Array Validation and Conversion

48

49

### Core Validation Functions

50

51

Utilities for validating and converting genetic data arrays.

52

53

```python

54

import allel

55

import numpy as np

56

57

# Ensure array has correct dimensions

58

array_2d = allel.asarray_ndim(data, 2)

59

array_3d = allel.asarray_ndim(genotypes, 3)

60

61

# Validate array properties

62

allel.check_ndim(array, 2) # Raises error if wrong dimensions

63

allel.check_shape(array, (100, 4)) # Check specific shape

64

allel.check_dtype(array, int) # Validate data type

65

allel.check_integer_dtype(array) # Check if integer type

66

allel.check_ploidy(2, 'genotypes') # Validate ploidy parameter

67

68

# Check array alignment

69

allel.check_dim0_aligned(variants_array, genotypes_array)

70

allel.check_dim1_aligned(samples_array, genotypes_array)

71

```

72

{ .api }

73

74

**Key Validation Functions:**

75

76

```python

77

def asarray_ndim(a, *ndims, **kwargs):

78

"""

79

Ensure array has one of the specified numbers of dimensions.

80

81

Args:

82

a (array_like): Input array

83

*ndims (int): Allowed numbers of dimensions

84

**kwargs: Additional arguments for array conversion

85

86

Returns:

87

numpy.ndarray: Array with validated dimensionality

88

89

Raises:

90

TypeError: If array has wrong number of dimensions

91

"""

92

93

def check_ndim(a, ndim):

94

"""

95

Check that array has expected number of dimensions.

96

97

Args:

98

a (array_like): Array to check

99

ndim (int): Expected number of dimensions

100

101

Raises:

102

TypeError: If array has wrong number of dimensions

103

"""

104

105

def check_shape(a, shape):

106

"""

107

Check that array has expected shape.

108

109

Args:

110

a (array_like): Array to check

111

shape (tuple): Expected shape

112

113

Raises:

114

TypeError: If array shape doesn't match

115

"""

116

117

def check_dtype(a, *dtypes):

118

"""

119

Check that array has one of the expected data types.

120

121

Args:

122

a (array_like): Array to check

123

*dtypes: Expected data types

124

125

Raises:

126

TypeError: If array has wrong data type

127

"""

128

129

def check_integer_dtype(a):

130

"""

131

Check that array has integer data type.

132

133

Args:

134

a (array_like): Array to check

135

136

Raises:

137

TypeError: If array is not integer type

138

"""

139

140

def check_ploidy(ploidy, name):

141

"""

142

Validate ploidy parameter.

143

144

Args:

145

ploidy (int): Ploidy value to validate

146

name (str): Name of the data for error messages

147

148

Raises:

149

ValueError: If ploidy is invalid

150

"""

151

152

def check_dim0_aligned(*arrays):

153

"""

154

Check that multiple arrays are aligned on first dimension.

155

156

Args:

157

*arrays: Variable number of arrays to check

158

159

Raises:

160

ValueError: If arrays have different sizes in first dimension

161

"""

162

163

def check_dim1_aligned(*arrays):

164

"""

165

Check that multiple arrays are aligned on second dimension.

166

167

Args:

168

*arrays: Variable number of arrays to check

169

170

Raises:

171

ValueError: If arrays have different sizes in second dimension

172

"""

173

```

174

{ .api }

175

176

## Error Handling Utilities

177

178

### Numerical Computation Support

179

180

Utilities for handling numerical edge cases in genetic calculations.

181

182

```python

183

# Handle invalid operations (e.g., division by zero)

184

with allel.ignore_invalid():

185

# Calculations that might produce NaN/inf values

186

frequencies = allele_counts / total_counts

187

diversity = frequencies * (1 - frequencies)

188

```

189

{ .api }

190

191

```python

192

def ignore_invalid():

193

"""

194

Context manager to temporarily ignore numpy invalid value warnings.

195

196

Useful for genetic calculations that may involve division by zero

197

or other operations that produce NaN values as expected behavior.

198

199

Returns:

200

context manager: Context that ignores numpy invalid warnings

201

"""

202

```

203

{ .api }

204

205

## Integration with Scientific Python

206

207

### Working with NumPy Arrays

208

209

Scikit-allel arrays are built on NumPy and integrate seamlessly with the scientific Python ecosystem.

210

211

```python

212

import numpy as np

213

import allel

214

215

# Convert genetic arrays to NumPy for custom operations

216

g = allel.GenotypeArray(genotype_data)

217

numpy_array = np.array(g)

218

219

# Use NumPy functions directly

220

missing_mask = (numpy_array == -1)

221

mean_genotype = np.nanmean(numpy_array, axis=1)

222

223

# Statistical operations

224

allele_variance = np.var(numpy_array, axis=1)

225

sample_correlations = np.corrcoef(numpy_array.T)

226

```

227

{ .api }

228

229

### Integration Examples

230

231

```python

232

import pandas as pd

233

import matplotlib.pyplot as plt

234

235

# Create analysis workflows

236

def basic_qc_workflow(vcf_file):

237

"""Basic quality control workflow."""

238

# Read data

239

variants, samples, calldata = allel.read_vcf(vcf_file)

240

g = allel.GenotypeArray(calldata['GT'])

241

242

# Calculate statistics

243

ac = g.count_alleles()

244

missing_rate = np.mean(g.is_missing(), axis=1)

245

het_obs = allel.heterozygosity_observed(g)

246

247

# Create summary DataFrame

248

qc_stats = pd.DataFrame({

249

'CHROM': variants['CHROM'],

250

'POS': variants['POS'],

251

'missing_rate': missing_rate,

252

'het_obs': het_obs,

253

'n_alleles': ac.allelism()

254

})

255

256

return qc_stats

257

258

def plot_diversity_along_chromosome(positions, diversity_values):

259

"""Plot diversity metrics along chromosome."""

260

plt.figure(figsize=(12, 6))

261

plt.plot(positions, diversity_values, alpha=0.7)

262

plt.xlabel('Genomic Position')

263

plt.ylabel('Nucleotide Diversity (π)')

264

plt.title('Genetic Diversity Along Chromosome')

265

plt.grid(True, alpha=0.3)

266

return plt.gcf()

267

```

268

{ .api }

269

270

## Constants

271

272

### Genetic Analysis Constants

273

274

```python

275

# Standard genetic constants used throughout scikit-allel

276

DIPLOID_PLOIDY = 2 # Standard diploid ploidy level

277

278

# Usage in validation

279

def validate_diploid_genotypes(g):

280

"""Validate that genotypes are diploid."""

281

assert g.ploidy == allel.DIPLOID_PLOIDY

282

return True

283

```

284

{ .api }