0
# Audio Processing
1
2
Comprehensive audio handling capabilities including frames, streams, format conversion, resampling, and FIFO buffering. PyAV provides full access to FFmpeg's audio processing with NumPy integration.
3
4
## Capabilities
5
6
### Audio Frames
7
8
Audio frame objects contain uncompressed audio data with format and timing information.
9
10
```python { .api }
11
class AudioFrame:
    """Uncompressed audio samples together with their format and timing metadata."""

    # Properties
    samples: int  # Number of audio samples
    sample_rate: int  # Sample rate in Hz
    rate: int  # Alias for sample_rate
    format: AudioFormat  # Audio sample format
    layout: AudioLayout  # Channel layout
    planes: tuple[AudioPlane, ...]  # Audio data planes
    pts: int  # Presentation timestamp, counted in time_base units
    time_base: Fraction | None  # Duration of one pts tick, in seconds
    time: float  # Time in seconds
    side_data: SideDataContainer  # Additional frame data

    def __init__(self, format='s16', layout='stereo', samples=0, align=1):
        """
        Create an audio frame.

        Parameters:
        - format: str | AudioFormat - Sample format
        - layout: str | AudioLayout - Channel layout
        - samples: int - Number of samples per channel
        - align: int - Memory alignment
        """

    @staticmethod
    def from_ndarray(array, format='s16', layout='stereo') -> 'AudioFrame':
        """
        Create frame from NumPy array.

        There is no sample_rate parameter: assign ``frame.sample_rate`` on the
        returned frame when the rate matters (e.g. before encoding).

        Parameters:
        - array: np.ndarray - Audio data array
        - format: str - Target sample format
        - layout: str - Channel layout

        Returns:
        New AudioFrame object
        """

    def to_ndarray(self, format=None) -> np.ndarray:
        """
        Convert to NumPy array.

        NOTE(review): upstream PyAV's AudioFrame.to_ndarray() appears to take
        no arguments - confirm the ``format`` parameter against the installed
        version.

        Parameters:
        - format: str - Target format (None uses current format)

        Returns:
        NumPy array with audio data
        """

    def make_writable(self) -> None:
        """Ensure frame data is writable."""
63
```
64
65
### Audio Formats
66
67
Audio sample format specifications and conversions.
68
69
```python { .api }
70
class AudioFormat:
    """Describes how individual audio samples are stored (width, type, packing)."""

    # Properties
    name: str  # Format name (e.g., 's16', 'flt')
    bytes: int  # Bytes per sample
    bits: int  # Bits per sample
    is_planar: bool  # True if planar format
    is_packed: bool  # True if packed format
    planar: 'AudioFormat'  # Equivalent planar format
    packed: 'AudioFormat'  # Equivalent packed format
    container_name: str  # Container-friendly name

    def __init__(self, name):
        """
        Create audio format.

        Parameters:
        - name: str | AudioFormat - Format name or existing format
        """
90
```
91
92
### Audio Layouts
93
94
Channel layout specifications for multi-channel audio.
95
96
```python { .api }
97
class AudioLayout:
    """Describes which channels an audio signal carries and in what order."""

    # Properties
    name: str  # Layout name (e.g., 'mono', 'stereo', '5.1')
    nb_channels: int  # Number of channels
    # Quoted because AudioChannel is declared after this class.
    channels: tuple['AudioChannel', ...]  # Individual channel objects

    def __init__(self, layout):
        """
        Create audio layout.

        Parameters:
        - layout: str | int | AudioLayout - Layout specification
        """
112
113
class AudioChannel:
    """A single channel within an audio layout."""

    name: str  # Channel name (e.g., 'FL', 'FR', 'C')
    description: str  # Human-readable description
118
```
119
120
### Audio Resampling
121
122
Audio format conversion and resampling for compatibility between different audio specifications.
123
124
```python { .api }
125
class AudioResampler:
    """Converts audio frames to a target sample format, channel layout and rate."""

    # Properties
    rate: int  # Output sample rate
    frame_size: int  # Output frame size
    format: AudioFormat  # Output format
    graph: Graph | None  # Filter graph used

    def __init__(self, format=None, layout=None, rate=None, frame_size=None):
        """
        Create audio resampler.

        Parameters:
        - format: str | AudioFormat - Output format
        - layout: str | AudioLayout - Output layout
        - rate: int - Output sample rate
        - frame_size: int - Output frame size
        """

    def resample(self, frame=None) -> list[AudioFrame]:
        """
        Resample audio frame.

        One input frame may produce zero, one or several output frames, so
        always iterate over the result.

        Parameters:
        - frame: AudioFrame | None - Input frame (None flushes)

        Returns:
        List of resampled frames
        """
155
```
156
157
### Audio FIFO
158
159
First-in-first-out buffer for audio frames, useful for managing variable frame sizes.
160
161
```python { .api }
162
class AudioFifo:
    """Sample queue that re-chunks audio: write frames of any size, read fixed-size ones."""

    # Properties
    format: AudioFormat  # Audio format
    layout: AudioLayout  # Channel layout
    sample_rate: int  # Sample rate
    samples: int  # Current samples in buffer
    samples_written: int  # Total samples written
    samples_read: int  # Total samples read
    pts_per_sample: Fraction  # PTS increment per sample

    def __init__(self, format='s16', layout='stereo', sample_rate=48000):
        """
        Create audio FIFO.

        NOTE(review): upstream PyAV's AudioFifo appears to take no constructor
        arguments and to adopt the parameters of the first frame written -
        confirm these arguments against the installed version.

        Parameters:
        - format: str - Audio format
        - layout: str - Channel layout
        - sample_rate: int - Sample rate
        """

    def write(self, frame) -> None:
        """
        Write frame to FIFO.

        Parameters:
        - frame: AudioFrame - Frame to write
        """

    def read(self, samples=0, partial=False) -> AudioFrame | None:
        """
        Read frame from FIFO.

        Parameters:
        - samples: int - Number of samples to read (0 for all)
        - partial: bool - Allow partial reads

        Returns:
        AudioFrame or None if insufficient data
        """

    def read_many(self, samples, partial=True) -> list[AudioFrame]:
        """
        Read multiple frames.

        NOTE(review): upstream PyAV appears to default ``partial`` to False -
        confirm, and pass ``partial`` explicitly when the tail matters.

        Parameters:
        - samples: int - Samples per frame
        - partial: bool - Allow partial final frame

        Returns:
        List of audio frames
        """
215
```
216
217
### Audio Streams
218
219
Audio stream objects for encoding and decoding.
220
221
```python { .api }
222
class AudioStream:
    """An audio track of a container, used for decoding or encoding."""

    # Properties
    type: Literal['audio']  # Stream type
    codec_context: AudioCodecContext  # Codec context
    frame_size: int  # Encoder frame size
    sample_rate: int  # Sample rate
    rate: int  # Alias for sample_rate
    bit_rate: int  # Bitrate
    channels: int  # Number of channels
    format: AudioFormat  # Sample format
    layout: AudioLayout  # Channel layout

    def encode(self, frame=None) -> list[Packet]:
        """
        Encode audio frame.

        Parameters:
        - frame: AudioFrame | None - Frame to encode (None flushes)

        Returns:
        List of encoded packets
        """

    def decode(self, packet=None) -> list[AudioFrame]:
        """
        Decode audio packet.

        Parameters:
        - packet: Packet | None - Packet to decode (None flushes)

        Returns:
        List of decoded frames
        """
257
```
258
259
### Audio Codec Context
260
261
Audio-specific codec context for encoding and decoding.
262
263
```python { .api }
264
class AudioCodecContext:
    """Encoder/decoder state for an audio codec."""

    # Properties
    type: Literal['audio']  # Context type
    frame_size: int  # Samples per frame
    sample_rate: int  # Sample rate
    rate: int  # Alias for sample_rate
    format: AudioFormat  # Sample format
    layout: AudioLayout  # Channel layout
    channels: int  # Number of channels
    bit_rate: int  # Target bitrate

    def encode(self, frame=None) -> list[Packet]:
        """Encode audio frame to packets."""

    def encode_lazy(self, frame=None) -> Iterator[Packet]:
        """Lazy encoding iterator."""

    def decode(self, packet=None) -> list[AudioFrame]:
        """Decode packet to audio frames."""
285
```
286
287
### Audio Planes
288
289
Individual audio data planes for planar formats.
290
291
```python { .api }
292
class AudioPlane:
    """One buffer of sample data belonging to an AudioFrame."""

    buffer_size: int  # Size of audio buffer
    frame: AudioFrame  # Parent frame
    index: int  # Plane index

    # Inherits Buffer methods for data access
    def update(self, input: bytes) -> None: ...
    def __buffer__(self, flags: int) -> memoryview: ...
    def __bytes__(self) -> bytes: ...
303
```
304
305
## Usage Examples
306
307
### Basic Audio Processing
308
309
```python
310
import av
311
import numpy as np
312
313
# Open audio file; the context manager closes it even if decoding fails
with av.open('audio.wav') as container:
    audio_stream = container.streams.audio[0]

    print(f"Sample rate: {audio_stream.sample_rate}")
    print(f"Channels: {audio_stream.channels}")
    print(f"Format: {audio_stream.format}")

    # Decode all frames
    for frame in container.decode(audio_stream):
        # Convert to numpy array
        array = frame.to_ndarray()
        print(f"Frame: {array.shape} samples")

        # Reduce volume. Scaling promotes integer samples to float64, so cast
        # back to the original dtype to keep the data valid for frame.format.
        processed = (array * 0.5).astype(array.dtype)

        # Create new frame from processed data. from_ndarray() has no
        # sample_rate parameter, so the rate is copied onto the frame afterwards.
        new_frame = av.AudioFrame.from_ndarray(
            processed,
            format=frame.format.name,
            layout=frame.layout.name,
        )
        new_frame.sample_rate = frame.sample_rate
339
```
340
341
### Audio Format Conversion
342
343
```python
344
import av
345
346
# Setup resampler
resampler = av.AudioResampler(
    format='s16',     # 16-bit signed integer
    layout='stereo',  # 2 channels
    rate=44100,       # 44.1kHz
)

# Open input; the context manager closes it even if decoding fails
with av.open('input.flac') as container:
    stream = container.streams.audio[0]

    # Process frames
    for frame in container.decode(stream):
        # Resample to target format (may yield zero or several frames)
        for resampled_frame in resampler.resample(frame):
            print(f"Resampled: {resampled_frame.format.name} "
                  f"{resampled_frame.layout.name} "
                  f"{resampled_frame.sample_rate}Hz")

# Flush resampler: passing None drains any samples it is still buffering
for flushed_frame in resampler.resample(None):
    print(f"Final frame: {flushed_frame.samples} samples")
373
```
374
375
### Audio Encoding
376
377
```python
378
import av
379
import numpy as np
380
381
from fractions import Fraction

sample_rate = 44100
duration = 5.0   # seconds
frequency = 440  # A4 note

# Generate stereo sine wave
sample_count = int(duration * sample_rate)
t = np.arange(sample_count) / sample_rate  # exact 1/sample_rate spacing
left_channel = np.sin(2 * np.pi * frequency * t) * 0.3
right_channel = np.sin(2 * np.pi * frequency * 1.5 * t) * 0.3

# Planar float ('fltp') expects one row per channel: shape (channels, samples)
audio_data = np.stack([left_channel, right_channel]).astype(np.float32)

# from_ndarray() has no sample_rate parameter; set the rate on the frame
frame = av.AudioFrame.from_ndarray(audio_data, format='fltp', layout='stereo')
frame.sample_rate = sample_rate

# Create output container; the context manager closes it even on failure
with av.open('output.aac', 'w') as output:
    # Add audio stream ('stereo' already implies two channels)
    stream = output.add_stream('aac', rate=sample_rate)
    stream.layout = 'stereo'

    # FIFO for frame size management. It takes its format, layout and rate
    # from the first frame written, so it always matches the data it holds.
    fifo = av.AudioFifo()
    fifo.write(frame)

    # frame_size may be reported as 0 before the encoder has been opened;
    # fall back to 1024, the frame size of FFmpeg's native AAC encoder.
    frame_size = stream.frame_size or 1024

    # Read and encode in codec-appropriate frame sizes. partial=True also
    # returns the shorter final chunk instead of silently dropping the tail.
    samples_encoded = 0
    for chunk in fifo.read_many(frame_size, partial=True):
        # Timestamps count samples, so one pts tick is 1/sample_rate seconds
        chunk.pts = samples_encoded
        chunk.time_base = Fraction(1, sample_rate)
        samples_encoded += chunk.samples

        for packet in stream.encode(chunk):
            output.mux(packet)

    # Flush encoder
    for packet in stream.encode():
        output.mux(packet)
434
```
435
436
### Multi-Channel Audio Processing
437
438
```python
439
import av
440
import numpy as np
441
442
# Open 5.1 surround sound file; the context manager closes it on any exit
with av.open('surround.ac3') as container:
    stream = container.streams.audio[0]

    print(f"Layout: {stream.layout.name}")
    print(f"Channels: {stream.channels}")
    for i, channel in enumerate(stream.layout.channels):
        print(f" Channel {i}: {channel.name} ({channel.description})")

    # Process each channel separately
    for frame in container.decode(stream):
        array = frame.to_ndarray()
        nb_channels = frame.layout.nb_channels

        if frame.format.is_planar:
            # Planar format - to_ndarray() already yields one row per channel,
            # with the dtype that matches the frame's sample format
            per_channel = array
        else:
            # Packed format - a single row of interleaved samples;
            # regroup it into (samples, channels) and transpose
            per_channel = array.reshape(-1, nb_channels).T

        for i, channel_data in enumerate(per_channel):
            print(f"Channel {i}: {len(channel_data)} samples")
467
```
468
469
### Audio Analysis
470
471
```python
472
import av
473
import numpy as np
474
475
def analyze_audio(filename):
    """
    Print per-frame and overall level statistics for an audio file.

    Decodes the first audio stream of the file, prints RMS and peak for every
    frame, then prints duration, RMS, peak and peak-to-RMS ratio for the
    whole stream. Prints no overall section when no frames were decoded.

    Parameters:
    - filename: str - Path of the audio file to analyze
    """
    chunks = []
    total_samples = 0

    # The context manager closes the container even if decoding raises
    with av.open(filename) as container:
        stream = container.streams.audio[0]
        sample_rate = stream.sample_rate

        for frame_count, frame in enumerate(container.decode(stream), start=1):
            # Promote to float64 first: squaring integer samples (e.g. int16)
            # in their own dtype would wrap around and corrupt the RMS
            array = frame.to_ndarray().astype(np.float64)
            chunks.append(array)
            # Count samples per channel; the array length is not a sample count
            total_samples += frame.samples

            # Frame-level analysis
            rms = np.sqrt(np.mean(array**2))
            peak = np.max(np.abs(array))
            print(f"Frame {frame_count}: RMS={rms:.3f}, Peak={peak:.3f}")

    if not chunks:
        return

    # Overall analysis. Frame arrays are 2-D with samples along the last
    # axis, so they are joined end to end on axis 1 rather than stacked as rows.
    all_audio = np.concatenate(chunks, axis=1)
    duration = total_samples / sample_rate
    overall_rms = np.sqrt(np.mean(all_audio**2))
    overall_peak = np.max(np.abs(all_audio))

    print(f"\nOverall Analysis:")
    print(f"Duration: {duration:.2f} seconds")
    print(f"RMS Level: {overall_rms:.3f}")
    print(f"Peak Level: {overall_peak:.3f}")
    if overall_rms > 0:
        print(f"Dynamic Range: {20*np.log10(overall_peak/overall_rms):.1f} dB")
    else:
        # Silent input: the ratio is undefined, so avoid dividing by zero
        print("Dynamic Range: n/a (silent input)")
507
508
# Analyze audio file
509
analyze_audio('music.wav')
510
```