or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

alignment-files.md · bgzf-files.md · command-tools.md · index.md · sequence-files.md · tabix-files.md · utilities.md · variant-files.md

docs/bgzf-files.md

# BGZF Compressed Files

Support for reading and writing block gzip (BGZF) compressed files, the standard compression format used in genomics for SAM/BAM files and tabix-indexed files.

## Capabilities

### BGZFile

Interface for reading and writing BGZF compressed files with seeking and block-level access.

```python { .api }
class BGZFile:
    def __init__(self, filepath, mode, compresslevel=6, threads=1):
        """
        Open BGZF compressed file.

        Parameters:
        - filepath: str, path to BGZF file
        - mode: str, file mode ('r', 'w', 'rb', 'wb', 'rt', 'wt')
        - compresslevel: int, compression level (0-9, default 6)
        - threads: int, number of compression threads

        Returns:
        BGZFile object
        """

    def read(self, size=-1):
        """
        Read data from file.

        Parameters:
        - size: int, number of bytes to read (-1 for all)

        Returns:
        bytes or str, data read from file
        """

    def readline(self, size=-1):
        """
        Read single line from file.

        Parameters:
        - size: int, maximum bytes to read

        Returns:
        bytes or str, line data
        """

    def readlines(self, hint=-1):
        """
        Read all lines from file.

        Parameters:
        - hint: int, approximate number of bytes to read

        Returns:
        list, lines from file
        """

    def write(self, data):
        """
        Write data to file.

        Parameters:
        - data: bytes or str, data to write

        Returns:
        int, number of bytes written
        """

    def writelines(self, lines):
        """
        Write multiple lines to file.

        Parameters:
        - lines: iterable, lines to write
        """

    def seek(self, offset, whence=0):
        """
        Seek to position in file.

        Parameters:
        - offset: int, byte offset
        - whence: int, seek reference (0=start, 1=current, 2=end)

        Returns:
        int, new file position
        """

    def tell(self):
        """
        Get current file position.

        Returns:
        int, current byte position
        """

    def flush(self):
        """Flush write buffers."""

    def close(self):
        """Close the file."""

    def truncate(self, size=None):
        """
        Truncate file to specified size.

        Parameters:
        - size: int, size in bytes (current position if None)
        """

    # Properties
    @property
    def mode(self) -> str:
        """File mode."""

    @property
    def name(self) -> str:
        """File name."""

    @property
    def closed(self) -> bool:
        """True if file is closed."""

    @property
    def readable(self) -> bool:
        """True if file is readable."""

    @property
    def writable(self) -> bool:
        """True if file is writable."""

    @property
    def seekable(self) -> bool:
        """True if file supports seeking."""

    # Context manager support
    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""

    # Iterator support
    def __iter__(self):
        """Iterate over lines."""

    def __next__(self):
        """Get next line."""
```

## Usage Examples

### Basic File Operations

```python
import pysam

# Reading BGZF files
with pysam.BGZFile("data.txt.gz", "rt") as infile:
    # Read entire file
    content = infile.read()
    print(f"File content: {content}")

# Reading line by line
with pysam.BGZFile("data.txt.gz", "rt") as infile:
    for line_num, line in enumerate(infile, 1):
        print(f"Line {line_num}: {line.strip()}")

# Reading specific amount of data
with pysam.BGZFile("data.txt.gz", "rb") as infile:
    chunk = infile.read(1024)  # Read first 1KB
    print(f"First chunk: {len(chunk)} bytes")
```

### Writing BGZF Files

```python
import pysam

# Writing text data
with pysam.BGZFile("output.txt.gz", "wt", compresslevel=9) as outfile:
    outfile.write("Header line\n")
    for i in range(1000):
        outfile.write(f"Data line {i}\n")

# Writing binary data
with pysam.BGZFile("output.bin.gz", "wb") as outfile:
    data = b"Binary data chunk"
    outfile.write(data)

# Writing with multiple threads for better compression speed
with pysam.BGZFile("large_output.txt.gz", "wt", threads=4) as outfile:
    for i in range(1000000):
        outfile.write(f"Large dataset line {i}\n")
```

### File Seeking and Random Access

```python
import pysam

# Seeking in BGZF files (supports random access)
with pysam.BGZFile("indexed_data.txt.gz", "rt") as infile:
    # Read from beginning
    first_line = infile.readline()
    print(f"First line: {first_line.strip()}")

    # Remember position
    pos = infile.tell()
    print(f"Current position: {pos}")

    # Read more data
    second_line = infile.readline()

    # Seek back to previous position
    infile.seek(pos)

    # Read same line again
    second_line_again = infile.readline()
    assert second_line == second_line_again

    # Seek to end and get file size
    infile.seek(0, 2)  # Seek to end
    file_size = infile.tell()
    print(f"File size: {file_size} bytes")
```

### Processing Large Files

```python
import pysam

def process_large_bgzf_file(filename, chunk_size=8192):
    """Process large BGZF file in chunks to manage memory usage."""
    with pysam.BGZFile(filename, "rt") as infile:
        processed_lines = 0

        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break

            # Process chunk line by line
            lines = chunk.split('\n')

            # Handle partial line at end of chunk
            if not chunk.endswith('\n') and lines:
                # Save last partial line for next iteration
                partial_line = lines[-1]
                lines = lines[:-1]

                # Seek back to start of partial line
                infile.seek(infile.tell() - len(partial_line.encode()))

            # Process complete lines
            for line in lines:
                if line.strip():  # Skip empty lines
                    # Process line here
                    processed_lines += 1

                    if processed_lines % 10000 == 0:
                        print(f"Processed {processed_lines} lines")

        return processed_lines

# Usage
total_lines = process_large_bgzf_file("large_dataset.txt.gz")
print(f"Total processed lines: {total_lines}")
```

### File Compression and Conversion

```python
import pysam

def compress_to_bgzf(input_file, output_file, compression_level=6):
    """Compress regular file to BGZF format."""
    with open(input_file, 'rb') as infile:
        with pysam.BGZFile(output_file, 'wb', compresslevel=compression_level) as outfile:
            # Copy data in chunks
            chunk_size = 64 * 1024  # 64KB chunks
            while True:
                chunk = infile.read(chunk_size)
                if not chunk:
                    break
                outfile.write(chunk)

def decompress_bgzf(input_file, output_file):
    """Decompress BGZF file to regular file."""
    with pysam.BGZFile(input_file, 'rb') as infile:
        with open(output_file, 'wb') as outfile:
            # Copy data in chunks
            chunk_size = 64 * 1024  # 64KB chunks
            while True:
                chunk = infile.read(chunk_size)
                if not chunk:
                    break
                outfile.write(chunk)

# Usage
compress_to_bgzf("large_file.txt", "large_file.txt.gz", compression_level=9)
decompress_bgzf("compressed_file.txt.gz", "decompressed_file.txt")
```

### Advanced File Operations

```python
import pysam
import os

def split_bgzf_file(input_file, output_prefix, lines_per_file=1000000):
    """Split large BGZF file into smaller files."""
    file_count = 0
    current_lines = 0
    current_file = None

    try:
        with pysam.BGZFile(input_file, "rt") as infile:
            for line in infile:
                # Open new output file if needed
                if current_lines == 0:
                    if current_file:
                        current_file.close()

                    file_count += 1
                    output_filename = f"{output_prefix}_{file_count:03d}.txt.gz"
                    current_file = pysam.BGZFile(output_filename, "wt")

                # Write line to current file
                current_file.write(line)
                current_lines += 1

                # Check if we need to start new file
                if current_lines >= lines_per_file:
                    current_lines = 0

    finally:
        if current_file:
            current_file.close()

    return file_count

def merge_bgzf_files(input_files, output_file):
    """Merge multiple BGZF files into one."""
    with pysam.BGZFile(output_file, "wt") as outfile:
        for input_file in input_files:
            with pysam.BGZFile(input_file, "rt") as infile:
                # Copy all lines from input to output
                for line in infile:
                    outfile.write(line)

# Usage
num_files = split_bgzf_file("huge_dataset.txt.gz", "split_part", 500000)
print(f"Split into {num_files} files")

# Merge them back
input_files = [f"split_part_{i:03d}.txt.gz" for i in range(1, num_files + 1)]
merge_bgzf_files(input_files, "merged_dataset.txt.gz")
```

### Integration with Other Pysam Components

```python
import pysam

# Create custom tabix-compatible file
def create_bed_file_with_bgzf(features, output_file):
    """Create sorted, BGZF-compressed BED file suitable for tabix indexing."""

    # Sort features by chromosome and position
    sorted_features = sorted(features, key=lambda x: (x['chrom'], x['start']))

    # Write to BGZF file
    with pysam.BGZFile(output_file, "wt") as outfile:
        for feature in sorted_features:
            line = f"{feature['chrom']}\t{feature['start']}\t{feature['end']}\t{feature['name']}\n"
            outfile.write(line)

    # Create tabix index
    pysam.tabix_index(output_file, preset="bed")

# Example usage
features = [
    {'chrom': 'chr1', 'start': 1000, 'end': 2000, 'name': 'feature1'},
    {'chrom': 'chr1', 'start': 1500, 'end': 2500, 'name': 'feature2'},
    {'chrom': 'chr2', 'start': 500, 'end': 1500, 'name': 'feature3'},
]

create_bed_file_with_bgzf(features, "features.bed.gz")

# Now can use with TabixFile
with pysam.TabixFile("features.bed.gz", parser=pysam.asBed()) as tabixfile:
    for record in tabixfile.fetch("chr1", 1200, 1800):
        print(f"Overlapping feature: {record.name}")
```

## Performance Considerations

- BGZF files support random access, unlike regular gzip files
- Seeking is efficient due to block-based compression structure
- Multi-threaded compression (`threads` parameter) can significantly speed up writing
- Block size is optimized for genomic data access patterns
- Compatible with standard gzip tools for decompression
- Essential format for indexed genomic files (BAM, tabix)