# I/O Utilities

Advanced I/O utilities including an HDF5 wrapper with attribute-style access and memory-mapped arrays for efficient handling of large proteomics datasets. Optimized for high-throughput workflows, memory efficiency, and seamless integration with pandas and numpy operations.

## Capabilities

### HDF5 File Interface

Comprehensive HDF5 wrapper providing attribute-style access and pandas integration for proteomics data storage.

```python { .api }
class HDF_File:
    """Main HDF5 file wrapper with comprehensive read/write functionality."""

    def __init__(self, filepath: str, mode: str = 'r', **kwargs):
        """
        Initialize HDF5 file wrapper.

        Parameters:
        - filepath: Path to HDF5 file
        - mode: File access mode ('r', 'w', 'a', 'r+')
        - **kwargs: Additional h5py.File options
        """

    def __getitem__(self, key: str):
        """
        Access datasets and groups using dictionary-style syntax.

        Parameters:
        - key: Dataset or group path

        Returns:
        HDF_Dataset, HDF_Group, or HDF_Dataframe object
        """

    def __setitem__(self, key: str, value):
        """
        Create or update datasets using dictionary-style syntax.

        Parameters:
        - key: Dataset path
        - value: Data to store (numpy array, pandas DataFrame, etc.)
        """

    def __contains__(self, key: str) -> bool:
        """Check if dataset or group exists in file."""

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with automatic file closing."""

    def close(self) -> None:
        """Close HDF5 file."""

    def keys(self) -> list:
        """Get list of top-level datasets and groups."""

    def create_group(self, name: str) -> 'HDF_Group':
        """
        Create new HDF5 group.

        Parameters:
        - name: Group name/path

        Returns:
        HDF_Group wrapper object
        """

    def require_group(self, name: str) -> 'HDF_Group':
        """
        Get existing group or create it if it doesn't exist.

        Parameters:
        - name: Group name/path

        Returns:
        HDF_Group wrapper object
        """


class HDF_Group:
    """HDF group wrapper with attribute-style access."""

    def __init__(self, hdf_group):
        """Initialize from h5py Group object."""

    def __getitem__(self, key: str):
        """Access group contents using dictionary-style syntax."""

    def __setitem__(self, key: str, value):
        """Create datasets in group using dictionary-style syntax."""

    def __getattr__(self, name: str):
        """Access group contents using attribute-style syntax."""

    def __setattr__(self, name: str, value):
        """Create datasets using attribute-style syntax."""

    def keys(self) -> list:
        """Get list of datasets and subgroups."""

    def create_dataset(self, name: str, data=None, **kwargs):
        """
        Create dataset in group.

        Parameters:
        - name: Dataset name
        - data: Data to store
        - **kwargs: Dataset creation options
        """


class HDF_Dataset:
    """HDF dataset wrapper with NumPy-like interface."""

    def __init__(self, hdf_dataset):
        """Initialize from h5py Dataset object."""

    def __getitem__(self, key):
        """NumPy-style array indexing."""

    def __setitem__(self, key, value):
        """NumPy-style array assignment."""

    def __array__(self) -> np.ndarray:
        """Convert to numpy array."""

    @property
    def shape(self) -> tuple:
        """Dataset shape."""

    @property
    def dtype(self):
        """Dataset data type."""

    @property
    def size(self) -> int:
        """Total number of elements."""

    def resize(self, size: tuple) -> None:
        """
        Resize dataset.

        Parameters:
        - size: New dataset shape
        """


class HDF_Dataframe:
    """HDF DataFrame wrapper with pandas-like interface."""

    def __init__(self, hdf_group):
        """Initialize from HDF group containing DataFrame data."""

    def to_pandas(self) -> pd.DataFrame:
        """
        Convert to pandas DataFrame.

        Returns:
        pandas DataFrame with all data loaded into memory
        """

    def __getitem__(self, key) -> pd.Series:
        """
        Access DataFrame columns.

        Parameters:
        - key: Column name

        Returns:
        pandas Series with column data
        """

    def __setitem__(self, key: str, value):
        """
        Set DataFrame column.

        Parameters:
        - key: Column name
        - value: Column data
        """

    @property
    def columns(self) -> list:
        """Get DataFrame column names."""

    @property
    def shape(self) -> tuple:
        """Get DataFrame shape."""

    def head(self, n: int = 5) -> pd.DataFrame:
        """
        Get first n rows as pandas DataFrame.

        Parameters:
        - n: Number of rows to return

        Returns:
        pandas DataFrame with first n rows
        """

    def tail(self, n: int = 5) -> pd.DataFrame:
        """
        Get last n rows as pandas DataFrame.

        Parameters:
        - n: Number of rows to return

        Returns:
        pandas DataFrame with last n rows
        """


class HDF_Object:
    """Base class for HDF components with common functionality."""

    def __init__(self, hdf_obj):
        """Initialize from h5py object."""

    @property
    def attrs(self) -> dict:
        """Access HDF5 attributes as dictionary."""

    def set_attr(self, name: str, value) -> None:
        """
        Set HDF5 attribute.

        Parameters:
        - name: Attribute name
        - value: Attribute value
        """

    def get_attr(self, name: str, default=None):
        """
        Get HDF5 attribute.

        Parameters:
        - name: Attribute name
        - default: Default value if attribute doesn't exist

        Returns:
        Attribute value or default
        """
```
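The usage examples further down cover dictionary- and attribute-style access; the remaining entry points (`require_group`, membership checks, the `attrs` view) are not shown there. Here is a minimal sketch, assuming the wrapper behaves as documented above and that groups expose the `HDF_Object` attribute helpers:

```python
from alphabase.io.hdf import HDF_File

with HDF_File('data.h5', mode='a') as hf:
    # require_group is idempotent: it returns the existing group on reruns
    runs = hf.require_group('runs')
    runs.set_attr('instrument', 'timsTOF')  # assumes set_attr is inherited from HDF_Object

    # Membership check and attribute dictionary, as documented above
    if 'runs' in hf:
        print(f"Run attributes: {hf['runs'].attrs}")
```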

### Memory-Mapped Arrays

High-performance memory-mapped array operations for handling large datasets that don't fit in memory.

```python { .api }
def redefine_temp_location(temp_dir: str) -> None:
    """
    Change temporary file storage location.

    Parameters:
    - temp_dir: New directory for temporary files
    """

def create_empty_mmap(filepath: str, shape: tuple, dtype=np.float64) -> None:
    """
    Initialize empty HDF5 file for memory mapping.

    Parameters:
    - filepath: Path for new HDF5 file
    - shape: Array shape to create
    - dtype: Data type
    """

def mmap_array_from_path(filepath: str, dataset_name: str = 'data',
                         mode: str = 'r') -> np.ndarray:
    """
    Reconnect to existing memory-mapped file.

    Parameters:
    - filepath: Path to existing HDF5 file
    - dataset_name: Name of dataset in HDF5 file
    - mode: Access mode ('r', 'r+', 'w')

    Returns:
    Memory-mapped array connected to file
    """

def array(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional numpy.memmap options

    Returns:
    Memory-mapped numpy array
    """

def zeros(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create zero-filled temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional options

    Returns:
    Zero-filled memory-mapped array
    """

def ones(shape: tuple, dtype=np.float64, **kwargs) -> np.ndarray:
    """
    Create ones-filled temporary memory-mapped array.

    Parameters:
    - shape: Array shape
    - dtype: Data type (default: float64)
    - **kwargs: Additional options

    Returns:
    Ones-filled memory-mapped array
    """

def clear() -> None:
    """
    Clear temporary memory-mapped file directory.
    Removes all temporary files created by this session.
    """

def get_temp_dir() -> str:
    """
    Get current temporary directory location.

    Returns:
    Path to temporary directory
    """

def get_available_memory() -> int:
    """
    Get available system memory in bytes.

    Returns:
    Available memory in bytes
    """

def estimate_memory_usage(shape: tuple, dtype=np.float64) -> int:
    """
    Estimate memory usage for array with given shape and dtype.

    Parameters:
    - shape: Array shape
    - dtype: Data type

    Returns:
    Estimated memory usage in bytes
    """
```
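The usage examples below only exercise the temporary-array helpers (`array`, `zeros`, `ones`, `clear`). A minimal sketch of the persistent workflow, assuming `redefine_temp_location`, `create_empty_mmap`, and `mmap_array_from_path` behave exactly as documented above (the scratch path is hypothetical):

```python
import numpy as np

from alphabase.io.tempmmap import (
    redefine_temp_location,
    create_empty_mmap,
    mmap_array_from_path,
)

# Keep temporary mmap files on a fast local disk (hypothetical path)
redefine_temp_location('/scratch/mmap_tmp')

# Allocate a persistent on-disk array, then reconnect to it later
create_empty_mmap('intensities.h5', shape=(100000, 64), dtype=np.float32)

intensities = mmap_array_from_path('intensities.h5', dataset_name='data', mode='r+')
intensities[:1000] = np.random.rand(1000, 64)  # writes go straight to disk
```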

### Utility Functions

Additional I/O utility functions for data processing and file management.

```python { .api }
def save_dataframe_hdf(df: pd.DataFrame, filepath: str, key: str = 'data',
                       **kwargs) -> None:
    """
    Save pandas DataFrame to HDF5 format with optimization.

    Parameters:
    - df: DataFrame to save
    - filepath: Output HDF5 file path
    - key: Dataset key in HDF5 file
    - **kwargs: Additional pandas.to_hdf options
    """

def load_dataframe_hdf(filepath: str, key: str = 'data',
                       **kwargs) -> pd.DataFrame:
    """
    Load pandas DataFrame from HDF5 format.

    Parameters:
    - filepath: Input HDF5 file path
    - key: Dataset key in HDF5 file
    - **kwargs: Additional pandas.read_hdf options

    Returns:
    Loaded pandas DataFrame
    """

def get_hdf_info(filepath: str) -> dict:
    """
    Get comprehensive information about HDF5 file contents.

    Parameters:
    - filepath: Path to HDF5 file

    Returns:
    Dictionary with file structure and metadata
    """

def compress_hdf_file(input_path: str, output_path: str,
                      compression: str = 'gzip') -> None:
    """
    Compress HDF5 file to reduce size.

    Parameters:
    - input_path: Input HDF5 file
    - output_path: Output compressed HDF5 file
    - compression: Compression algorithm ('gzip', 'lzf', 'szip')
    """

def merge_hdf_files(file_paths: List[str], output_path: str) -> None:
    """
    Merge multiple HDF5 files into a single file.

    Parameters:
    - file_paths: List of HDF5 files to merge
    - output_path: Output merged HDF5 file path
    """
```
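None of the usage examples below touch these helpers. A short sketch, assuming the signatures documented above and that the functions are importable from `alphabase.io.hdf` like `get_hdf_info` in the file-management example:

```python
import pandas as pd

# Import path assumed to mirror get_hdf_info below
from alphabase.io.hdf import (
    save_dataframe_hdf,
    load_dataframe_hdf,
    compress_hdf_file,
    merge_hdf_files,
)

# Round-trip a DataFrame through HDF5
df = pd.DataFrame({'sequence': ['PEPTIDE', 'SEQUENCE'], 'charge': [2, 3]})
save_dataframe_hdf(df, 'run1.h5', key='psms')
df_back = load_dataframe_hdf('run1.h5', key='psms')

# Shrink and consolidate result files
compress_hdf_file('run1.h5', 'run1_compressed.h5', compression='gzip')
merge_hdf_files(['run1_compressed.h5', 'run2.h5'], 'all_runs.h5')
```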

## Usage Examples

### Basic HDF5 Operations

```python
from alphabase.io.hdf import HDF_File
import pandas as pd
import numpy as np

# Create or open HDF5 file
with HDF_File('data.h5', mode='w') as hf:
    # Store numpy array
    data_array = np.random.randn(1000, 50)
    hf['array_data'] = data_array

    # Store pandas DataFrame
    df = pd.DataFrame({
        'sequence': ['PEPTIDE', 'SEQUENCE'],
        'charge': [2, 3],
        'mz': [123.45, 234.56]
    })
    hf['precursors'] = df

    # Create groups for organization
    group = hf.create_group('experiments')
    group['exp1'] = np.random.randn(500, 10)
    group['exp2'] = np.random.randn(300, 15)

    # Set attributes
    hf.set_attr('version', '1.0')
    hf.set_attr('created_by', 'alphabase')

# Read data back
with HDF_File('data.h5', mode='r') as hf:
    # Access using dictionary syntax
    array_data = hf['array_data'][:]  # Load full array
    precursor_df = hf['precursors'].to_pandas()

    # Access using attribute syntax
    exp1_data = hf.experiments.exp1[:]

    # Check file contents
    print(f"Keys: {hf.keys()}")
    print(f"Version: {hf.get_attr('version')}")
```

### Memory-Mapped Arrays for Large Data

```python
import numpy as np

from alphabase.io.tempmmap import array, zeros, ones, clear

# Create large memory-mapped arrays that don't fit comfortably in RAM
large_shape = (1000000, 100)  # 1M x 100 = 100M elements (~400 MB as float32)

# Create zero-filled memory-mapped array
large_zeros = zeros(large_shape, dtype=np.float32)
print(f"Created array shape: {large_zeros.shape}")

# Create ones-filled array
large_ones = ones((500000, 200), dtype=np.float64)

# Create empty array for computation
workspace = array((100000, 500), dtype=np.float32)

# Use arrays in computations without loading all data into memory
for i in range(0, large_shape[0], 10000):
    # Process in chunks
    chunk = large_zeros[i:i+10000]
    # Perform operations on chunk
    chunk[:] = np.random.randn(chunk.shape[0], chunk.shape[1])

# Clean up temporary files when done
clear()
```

### Advanced HDF5 Operations

```python
import pandas as pd

from alphabase.io.hdf import HDF_File
from alphabase.spectral_library.base import SpecLibBase

# Save spectral library to HDF5
spec_lib = SpecLibBase()
# ... populate library ...

with HDF_File('spectral_library.h5', mode='w') as hf:
    # Save each DataFrame to a separate group
    lib_group = hf.create_group('spectral_library')
    lib_group['precursors'] = spec_lib.precursor_df
    lib_group['fragments_mz'] = spec_lib.fragment_mz_df
    lib_group['fragments_intensity'] = spec_lib.fragment_intensity_df

    # Add metadata
    lib_group.set_attr('num_precursors', len(spec_lib.precursor_df))
    lib_group.set_attr('format_version', '2.0')
    lib_group.set_attr('creation_date', str(pd.Timestamp.now()))

# Load spectral library from HDF5
new_lib = SpecLibBase()
with HDF_File('spectral_library.h5', mode='r') as hf:
    lib_group = hf['spectral_library']
    new_lib.precursor_df = lib_group['precursors'].to_pandas()
    new_lib.fragment_mz_df = lib_group['fragments_mz'].to_pandas()
    new_lib.fragment_intensity_df = lib_group['fragments_intensity'].to_pandas()

    # Read metadata
    num_precursors = lib_group.get_attr('num_precursors')
    print(f"Loaded library with {num_precursors} precursors")
```

### Efficient Data Processing Workflows

```python
import numpy as np

from alphabase.io.hdf import HDF_File
from alphabase.io.tempmmap import array

# Process a large dataset in chunks using HDF5 and memory mapping
input_file = 'large_dataset.h5'
output_file = 'processed_dataset.h5'

with HDF_File(input_file, 'r') as input_hf, \
        HDF_File(output_file, 'w') as output_hf:

    # Get input data info
    input_data = input_hf['raw_data']
    total_rows = input_data.shape[0]
    chunk_size = 10000

    # Create output dataset
    output_hf.create_dataset('processed_data',
                             shape=input_data.shape,
                             dtype=np.float32)

    # Create temporary workspace
    workspace = array((chunk_size, input_data.shape[1]), dtype=np.float32)

    # Process in chunks
    for i in range(0, total_rows, chunk_size):
        end_idx = min(i + chunk_size, total_rows)

        # Load chunk
        chunk = input_data[i:end_idx]

        # Process data (example: mean-center each row)
        workspace[:chunk.shape[0]] = chunk
        workspace[:chunk.shape[0]] = (workspace[:chunk.shape[0]] -
                                      workspace[:chunk.shape[0]].mean(axis=1, keepdims=True))

        # Save processed chunk
        output_hf['processed_data'][i:end_idx] = workspace[:chunk.shape[0]]

        print(f"Processed {end_idx}/{total_rows} rows")

print("Processing complete!")
```

### File Management and Utilities

```python
import numpy as np

from alphabase.io.hdf import get_hdf_info

# Get information about HDF5 file structure
file_info = get_hdf_info('spectral_library.h5')
print(f"File info: {file_info}")

# Check available memory before creating large arrays
from alphabase.io.tempmmap import array, get_available_memory, estimate_memory_usage

available = get_available_memory()
required = estimate_memory_usage((1000000, 100), dtype=np.float64)

print(f"Available memory: {available / 1e9:.1f} GB")
print(f"Required memory: {required / 1e9:.1f} GB")

if required < available * 0.8:  # Use at most 80% of available memory
    large_array = array((1000000, 100), dtype=np.float64)
    print("Array created successfully")
else:
    print("Not enough memory, using smaller chunks")
```
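Finally, the temporary-directory helpers documented above can be used for scratch-space housekeeping; a minimal sketch, assuming `get_temp_dir` and `clear` behave as documented:

```python
from alphabase.io.tempmmap import get_temp_dir, clear

# Inspect where temporary memory-mapped files are currently written
print(f"Temporary mmap directory: {get_temp_dir()}")

# Remove all temporary files created in this session
clear()
```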