or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio.md codecs.md containers.md filters.md index.md streams.md video.md

docs/audio.md

0

# Audio Processing

1

2

Comprehensive audio handling capabilities including frames, streams, format conversion, resampling, and FIFO buffering. PyAV provides full access to FFmpeg's audio processing with NumPy integration.

3

4

## Capabilities

5

6

### Audio Frames

7

8

Audio frame objects contain uncompressed audio data with format and timing information.

9

10

```python { .api }

11

class AudioFrame:

12

"""Container for uncompressed audio data."""

13

14

# Properties

15

samples: int # Number of audio samples

16

sample_rate: int # Sample rate in Hz

17

rate: int # Alias for sample_rate

18

format: AudioFormat # Audio sample format

19

layout: AudioLayout # Channel layout

20

planes: tuple[AudioPlane, ...] # Audio data planes

21

pts: int # Presentation timestamp

22

time: float # Time in seconds

23

side_data: SideDataContainer # Additional frame data

24

25

def __init__(self, format='s16', layout='stereo', samples=0, align=1):

26

"""

27

Create an audio frame.

28

29

Parameters:

30

- format: str | AudioFormat - Sample format

31

- layout: str | AudioLayout - Channel layout

32

- samples: int - Number of samples per channel

33

- align: int - Memory alignment

34

"""

35

36

@staticmethod

37

def from_ndarray(array, format='s16', layout='stereo') -> 'AudioFrame':

38

"""

39

Create frame from NumPy array.

40

41

Parameters:

42

- array: np.ndarray - Audio data array

43

- format: str - Target sample format

44

- layout: str - Channel layout

45

46

Returns:

47

New AudioFrame object

48

"""

49

50

def to_ndarray(self, format=None) -> np.ndarray:

51

"""

52

Convert to NumPy array.

53

54

Parameters:

55

- format: str - Target format (None uses current format)

56

57

Returns:

58

NumPy array with audio data

59

"""

60

61

def make_writable(self) -> None:

62

"""Ensure frame data is writable."""

63

```

64

65

### Audio Formats

66

67

Audio sample format specifications and conversions.

68

69

```python { .api }

70

class AudioFormat:

71

"""Audio sample format specification."""

72

73

# Properties

74

name: str # Format name (e.g., 's16', 'flt')

75

bytes: int # Bytes per sample

76

bits: int # Bits per sample

77

is_planar: bool # True if planar format

78

is_packed: bool # True if packed format

79

planar: 'AudioFormat' # Equivalent planar format

80

packed: 'AudioFormat' # Equivalent packed format

81

container_name: str # Container-friendly name

82

83

def __init__(self, name):

84

"""

85

Create audio format.

86

87

Parameters:

88

- name: str | AudioFormat - Format name or existing format

89

"""

90

```

91

92

### Audio Layouts

93

94

Channel layout specifications for multi-channel audio.

95

96

```python { .api }

97

class AudioLayout:

98

"""Audio channel layout specification."""

99

100

# Properties

101

name: str # Layout name (e.g., 'mono', 'stereo', '5.1')

102

nb_channels: int # Number of channels

103

channels: tuple[AudioChannel, ...] # Individual channel objects

104

105

def __init__(self, layout):

106

"""

107

Create audio layout.

108

109

Parameters:

110

- layout: str | int | AudioLayout - Layout specification

111

"""

112

113

class AudioChannel:

114

"""Individual audio channel."""

115

116

name: str # Channel name (e.g., 'FL', 'FR', 'C')

117

description: str # Human-readable description

118

```

119

120

### Audio Resampling

121

122

Audio format conversion and resampling for compatibility between different audio specifications.

123

124

```python { .api }

125

class AudioResampler:

126

"""Audio format converter and resampler."""

127

128

# Properties

129

rate: int # Output sample rate

130

frame_size: int # Output frame size

131

format: AudioFormat # Output format

132

graph: Graph | None # Filter graph used

133

134

def __init__(self, format=None, layout=None, rate=None, frame_size=None):

135

"""

136

Create audio resampler.

137

138

Parameters:

139

- format: str | AudioFormat - Output format

140

- layout: str | AudioLayout - Output layout

141

- rate: int - Output sample rate

142

- frame_size: int - Output frame size

143

"""

144

145

def resample(self, frame=None) -> list[AudioFrame]:

146

"""

147

Resample audio frame.

148

149

Parameters:

150

- frame: AudioFrame | None - Input frame (None flushes)

151

152

Returns:

153

List of resampled frames

154

"""

155

```

156

157

### Audio FIFO

158

159

First-in-first-out buffer for audio frames, useful for managing variable frame sizes.

160

161

```python { .api }

162

class AudioFifo:

163

"""FIFO buffer for audio frames."""

164

165

# Properties

166

format: AudioFormat # Audio format

167

layout: AudioLayout # Channel layout

168

sample_rate: int # Sample rate

169

samples: int # Current samples in buffer

170

samples_written: int # Total samples written

171

samples_read: int # Total samples read

172

pts_per_sample: Fraction # PTS increment per sample

173

174

def __init__(self, format='s16', layout='stereo', sample_rate=48000):

175

"""

176

Create audio FIFO.

177

178

Parameters:

179

- format: str - Audio format

180

- layout: str - Channel layout

181

- sample_rate: int - Sample rate

182

"""

183

184

def write(self, frame) -> None:

185

"""

186

Write frame to FIFO.

187

188

Parameters:

189

- frame: AudioFrame - Frame to write

190

"""

191

192

def read(self, samples=0, partial=False) -> AudioFrame | None:

193

"""

194

Read frame from FIFO.

195

196

Parameters:

197

- samples: int - Number of samples to read (0 for all)

198

- partial: bool - Allow partial reads

199

200

Returns:

201

AudioFrame or None if insufficient data

202

"""

203

204

def read_many(self, samples, partial=True) -> list[AudioFrame]:

205

"""

206

Read multiple frames.

207

208

Parameters:

209

- samples: int - Samples per frame

210

- partial: bool - Allow partial final frame

211

212

Returns:

213

List of audio frames

214

"""

215

```

216

217

### Audio Streams

218

219

Audio stream objects for encoding and decoding.

220

221

```python { .api }

222

class AudioStream:

223

"""Audio stream in a container."""

224

225

# Properties

226

type: Literal['audio'] # Stream type

227

codec_context: AudioCodecContext # Codec context

228

frame_size: int # Encoder frame size

229

sample_rate: int # Sample rate

230

rate: int # Alias for sample_rate

231

bit_rate: int # Bitrate

232

channels: int # Number of channels

233

format: AudioFormat # Sample format

234

layout: AudioLayout # Channel layout

235

236

def encode(self, frame=None) -> list[Packet]:

237

"""

238

Encode audio frame.

239

240

Parameters:

241

- frame: AudioFrame | None - Frame to encode (None flushes)

242

243

Returns:

244

List of encoded packets

245

"""

246

247

def decode(self, packet=None) -> list[AudioFrame]:

248

"""

249

Decode audio packet.

250

251

Parameters:

252

- packet: Packet | None - Packet to decode (None flushes)

253

254

Returns:

255

List of decoded frames

256

"""

257

```

258

259

### Audio Codec Context

260

261

Audio-specific codec context for encoding and decoding.

262

263

```python { .api }

264

class AudioCodecContext:

265

"""Audio codec context."""

266

267

# Properties

268

type: Literal['audio'] # Context type

269

frame_size: int # Samples per frame

270

sample_rate: int # Sample rate

271

rate: int # Alias for sample_rate

272

format: AudioFormat # Sample format

273

layout: AudioLayout # Channel layout

274

channels: int # Number of channels

275

bit_rate: int # Target bitrate

276

277

def encode(self, frame=None) -> list[Packet]:

278

"""Encode audio frame to packets."""

279

280

def encode_lazy(self, frame=None) -> Iterator[Packet]:

281

"""Lazy encoding iterator."""

282

283

def decode(self, packet=None) -> list[AudioFrame]:

284

"""Decode packet to audio frames."""

285

```

286

287

### Audio Planes

288

289

Individual audio data planes for planar formats.

290

291

```python { .api }

292

class AudioPlane:

293

"""Audio data plane."""

294

295

buffer_size: int # Size of audio buffer

296

frame: AudioFrame # Parent frame

297

index: int # Plane index

298

299

# Inherits Buffer methods for data access

300

def update(self, input: bytes) -> None: ...

301

def __buffer__(self, flags: int) -> memoryview: ...

302

def __bytes__(self) -> bytes: ...

303

```

304

305

## Usage Examples

306

307

### Basic Audio Processing

308

309

```python

310

import av
import numpy as np

# Open audio file; the context manager closes it even if decoding fails
with av.open('audio.wav') as container:
    audio_stream = container.streams.audio[0]

    print(f"Sample rate: {audio_stream.sample_rate}")
    print(f"Channels: {audio_stream.channels}")
    print(f"Format: {audio_stream.format}")

    # Decode all frames
    for frame in container.decode(audio_stream):
        # Convert to numpy array
        array = frame.to_ndarray()
        print(f"Frame: {array.shape} samples")

        # Reduce volume. Scaling integer samples promotes them to float64,
        # so cast back to the dtype that matches the frame's sample format.
        processed = (array * 0.5).astype(array.dtype)

        # Create new frame from processed data. from_ndarray() only accepts
        # the array, format and layout; the sample rate is set afterwards.
        new_frame = av.AudioFrame.from_ndarray(
            processed,
            format=frame.format.name,
            layout=frame.layout.name,
        )
        new_frame.sample_rate = frame.sample_rate

339

```

340

341

### Audio Format Conversion

342

343

```python

344

import av

# Setup resampler
resampler = av.AudioResampler(
    format='s16',     # 16-bit signed integer
    layout='stereo',  # 2 channels
    rate=44100,       # 44.1kHz
)

# Open input; the context manager closes the container even on errors
with av.open('input.flac') as container:
    stream = container.streams.audio[0]

    # Process frames
    for frame in container.decode(stream):
        # Resample to target format; each call returns a list of frames
        for resampled_frame in resampler.resample(frame):
            print(f"Resampled: {resampled_frame.format.name} "
                  f"{resampled_frame.layout.name} "
                  f"{resampled_frame.sample_rate}Hz")

    # Flush resampler (passing None drains whatever is still buffered)
    for final_frame in resampler.resample(None):
        print(f"Final frame: {final_frame.samples} samples")

373

```

374

375

### Audio Encoding

376

377

```python

378

from fractions import Fraction

import av
import numpy as np

# Create output container; leaving the `with` block finalizes and closes it
with av.open('output.aac', 'w') as output:
    # Add audio stream (the layout already determines the channel count)
    stream = output.add_stream('aac', rate=44100)
    stream.layout = 'stereo'
    stream.sample_rate = 44100

    # Create FIFO for frame size management. It is created without arguments
    # so that it cannot disagree with the format of the frame written below.
    fifo = av.AudioFifo()

    # Generate audio data
    duration = 5.0  # seconds
    sample_count = int(duration * stream.sample_rate)
    # endpoint=False keeps the spacing at exactly 1 / sample_rate
    t = np.linspace(0, duration, sample_count, endpoint=False)
    frequency = 440  # A4 note

    # Generate stereo sine wave
    left_channel = np.sin(2 * np.pi * frequency * t) * 0.3
    right_channel = np.sin(2 * np.pi * frequency * 1.5 * t) * 0.3

    # Packed 'flt' data is a single row of interleaved samples (L R L R ...),
    # i.e. shape (1, samples * channels)
    audio_data = np.column_stack([left_channel, right_channel]).reshape(1, -1)

    # Create frame and write to FIFO. from_ndarray() only accepts the array,
    # format and layout; the sample rate is set on the frame afterwards.
    frame = av.AudioFrame.from_ndarray(
        audio_data.astype(np.float32),
        format='flt',
        layout='stereo',
    )
    frame.sample_rate = stream.sample_rate
    fifo.write(frame)

    # Read and encode in codec-appropriate frame sizes. pts is counted in
    # samples, so the matching time base is 1 / sample_rate.
    time_base = Fraction(1, stream.sample_rate)
    samples_encoded = 0
    while fifo.samples:
        # partial=True lets the final, shorter chunk through instead of
        # silently dropping the tail of the signal
        frame = fifo.read(stream.frame_size, partial=True)
        frame.pts = samples_encoded
        frame.time_base = time_base
        samples_encoded += frame.samples

        for packet in stream.encode(frame):
            output.mux(packet)

    # Flush encoder
    for packet in stream.encode():
        output.mux(packet)

434

```

435

436

### Multi-Channel Audio Processing

437

438

```python

439

import av
import numpy as np

# Open 5.1 surround sound file; the context manager closes it on any exit
with av.open('surround.ac3') as container:
    stream = container.streams.audio[0]

    print(f"Layout: {stream.layout.name}")
    print(f"Channels: {stream.channels}")
    for i, channel in enumerate(stream.layout.channels):
        print(f" Channel {i}: {channel.name} ({channel.description})")

    # Process each channel separately
    for frame in container.decode(stream):
        array = frame.to_ndarray()
        # The channel count lives on the layout, not on the frame itself
        nb_channels = frame.layout.nb_channels

        if frame.format.is_planar:
            # Planar format - to_ndarray() yields one row per channel, already
            # in the dtype that matches the sample format
            per_channel = list(array)
        else:
            # Packed format - a single row with the channels interleaved
            per_channel = [array[0, i::nb_channels] for i in range(nb_channels)]

        for i, channel_data in enumerate(per_channel):
            print(f"Channel {i}: {len(channel_data)} samples")

467

```

468

469

### Audio Analysis

470

471

```python

472

import av

473

import numpy as np

474

475

def analyze_audio(filename):
    """
    Print per-frame and overall level statistics for an audio file.

    Decodes the first audio stream of the file, prints RMS and peak for
    every frame, then prints duration, RMS, peak and peak-to-RMS ratio
    for the whole stream.

    Parameters:
    - filename: str - Path of the file to open

    Returns:
    None (results are printed)
    """
    # The context manager closes the container even if decoding raises
    with av.open(filename) as container:
        stream = container.streams.audio[0]

        # Running totals, so the whole file never has to be held in memory
        total_samples = 0    # samples per channel
        value_count = 0      # sample values across all channels
        sum_of_squares = 0.0
        overall_peak = 0.0

        for frame_count, frame in enumerate(container.decode(stream), start=1):
            # Promote to float64 first: squaring integer sample formats
            # in their native dtype would overflow
            array = frame.to_ndarray().astype(np.float64)

            # Frame-level analysis
            squares = array**2
            rms = np.sqrt(np.mean(squares))
            peak = np.max(np.abs(array))
            print(f"Frame {frame_count}: RMS={rms:.3f}, Peak={peak:.3f}")

            total_samples += frame.samples
            value_count += array.size
            sum_of_squares += float(np.sum(squares))
            overall_peak = max(overall_peak, float(peak))

        # Overall analysis
        if value_count:
            duration = total_samples / stream.sample_rate
            overall_rms = np.sqrt(sum_of_squares / value_count)

            print("\nOverall Analysis:")
            print(f"Duration: {duration:.2f} seconds")
            print(f"RMS Level: {overall_rms:.3f}")
            print(f"Peak Level: {overall_peak:.3f}")
            if overall_rms > 0:
                print(f"Dynamic Range: {20*np.log10(overall_peak/overall_rms):.1f} dB")
            else:
                # Peak/RMS is 0/0 for pure silence
                print("Dynamic Range: undefined (silent input)")

507

508

# Analyze audio file

509

analyze_audio('music.wav')

510

```