CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-av

Pythonic bindings for FFmpeg's libraries enabling multimedia processing with audio/video encoding, decoding, format conversion, and stream manipulation.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/audio.md

Audio Processing

Comprehensive audio handling capabilities including frames, streams, format conversion, resampling, and FIFO buffering. PyAV provides full access to FFmpeg's audio processing with NumPy integration.

Capabilities

Audio Frames

Audio frame objects contain uncompressed audio data with format and timing information.

class AudioFrame:
    """Container for uncompressed audio data.

    Holds the sample data (exposed through ``planes``) together with the
    sample format, channel layout, sample rate, and timing metadata.
    Frames can be converted to and from NumPy arrays with ``to_ndarray``
    and ``from_ndarray``.
    """

    # Properties
    samples: int                    # Number of audio samples
    sample_rate: int               # Sample rate in Hz
    rate: int                      # Alias for sample_rate
    format: AudioFormat            # Audio sample format
    layout: AudioLayout            # Channel layout
    planes: tuple[AudioPlane, ...] # Audio data planes
    pts: int                       # Presentation timestamp
    time: float                    # Time in seconds
    side_data: SideDataContainer   # Additional frame data

    def __init__(self, format: str | AudioFormat = 's16', layout: str | AudioLayout = 'stereo', samples: int = 0, align: int = 1) -> None:
        """
        Create an audio frame.

        Parameters:
        - format: str | AudioFormat - Sample format
        - layout: str | AudioLayout - Channel layout
        - samples: int - Number of samples per channel
        - align: int - Memory alignment
        """

    @staticmethod
    def from_ndarray(array: np.ndarray, format: str = 's16', layout: str = 'stereo') -> 'AudioFrame':
        """
        Create frame from NumPy array.

        Parameters:
        - array: np.ndarray - Audio data array
        - format: str - Target sample format
        - layout: str - Channel layout

        Returns:
        New AudioFrame object

        Note: the signature takes no sample rate.
        NOTE(review): presumably callers assign ``sample_rate`` on the
        returned frame afterwards - confirm against the PyAV reference.
        """

    def to_ndarray(self, format: str | None = None) -> np.ndarray:
        """
        Convert to NumPy array.

        Parameters:
        - format: str - Target format (None uses current format)

        Returns:
        NumPy array with audio data
        """

    def make_writable(self) -> None:
        """Ensure frame data is writable."""

Audio Formats

Audio sample format specifications and conversions.

class AudioFormat:
    """Audio sample format specification.

    Describes how individual samples are stored: the format's name, the
    size of one sample, and whether the data is planar or packed. The
    ``planar`` and ``packed`` attributes give the equivalent format in the
    other arrangement.
    """

    # Properties
    name: str              # Format name (e.g., 's16', 'flt')
    bytes: int             # Bytes per sample
    bits: int              # Bits per sample
    is_planar: bool        # True if planar format
    is_packed: bool        # True if packed format
    planar: 'AudioFormat'  # Equivalent planar format
    packed: 'AudioFormat'  # Equivalent packed format
    container_name: str    # Container-friendly name

    def __init__(self, name: 'str | AudioFormat') -> None:
        """
        Create audio format.

        Parameters:
        - name: str | AudioFormat - Format name or existing format
        """

Audio Layouts

Channel layout specifications for multi-channel audio.

class AudioLayout:
    """Audio channel layout specification.

    Names a channel arrangement and exposes both the channel count and the
    individual channel objects that make it up.
    """

    # Properties
    name: str                       # Layout name (e.g., 'mono', 'stereo', '5.1')
    nb_channels: int                # Number of channels
    channels: tuple[AudioChannel, ...] # Individual channel objects

    def __init__(self, layout: 'str | int | AudioLayout') -> None:
        """
        Create audio layout.

        Parameters:
        - layout: str | int | AudioLayout - Layout specification
          (NOTE(review): the meaning of an int value is not described
          here - confirm against the PyAV reference)
        """

class AudioChannel:
    """Individual audio channel.

    Instances are the elements of ``AudioLayout.channels``.
    """

    name: str         # Channel name (e.g., 'FL', 'FR', 'C')
    description: str  # Human-readable description

Audio Resampling

Audio format conversion and resampling for compatibility between different audio specifications.

class AudioResampler:
    """Audio format converter and resampler.

    Converts incoming frames to the configured output format, layout, and
    sample rate. One input frame may yield zero or more output frames, so
    ``resample`` returns a list.
    """

    # Properties
    rate: int              # Output sample rate
    frame_size: int        # Output frame size
    format: AudioFormat    # Output format
    graph: Graph | None    # Filter graph used

    def __init__(self, format: str | AudioFormat | None = None, layout: str | AudioLayout | None = None, rate: int | None = None, frame_size: int | None = None) -> None:
        """
        Create audio resampler.

        Parameters:
        - format: str | AudioFormat - Output format
        - layout: str | AudioLayout - Output layout
        - rate: int - Output sample rate
        - frame_size: int - Output frame size

        All parameters default to None.
        NOTE(review): presumably None means "keep the input's value" -
        confirm against the PyAV reference.
        """

    def resample(self, frame: AudioFrame | None = None) -> list[AudioFrame]:
        """
        Resample audio frame.

        Parameters:
        - frame: AudioFrame | None - Input frame (None flushes)

        Returns:
        List of resampled frames
        """

Audio FIFO

First-in-first-out buffer for audio frames, useful for managing variable frame sizes.

class AudioFifo:
    """FIFO buffer for audio frames.

    Frames are written in whatever size they arrive and read back in
    caller-chosen sizes, which is how a stream of arbitrary frames is
    re-chunked into the fixed frame size an encoder expects.
    """

    # Properties
    format: AudioFormat      # Audio format
    layout: AudioLayout      # Channel layout
    sample_rate: int         # Sample rate
    samples: int             # Current samples in buffer
    samples_written: int     # Total samples written
    samples_read: int        # Total samples read
    pts_per_sample: Fraction # PTS increment per sample

    def __init__(self, format: str = 's16', layout: str = 'stereo', sample_rate: int = 48000) -> None:
        """
        Create audio FIFO.

        Parameters:
        - format: str - Audio format
        - layout: str - Channel layout
        - sample_rate: int - Sample rate
        """

    def write(self, frame: AudioFrame) -> None:
        """
        Write frame to FIFO.

        Parameters:
        - frame: AudioFrame - Frame to write
          (NOTE(review): presumably must match the FIFO's format, layout,
          and sample rate - confirm)
        """

    def read(self, samples: int = 0, partial: bool = False) -> AudioFrame | None:
        """
        Read frame from FIFO.

        Parameters:
        - samples: int - Number of samples to read (0 for all)
        - partial: bool - Allow partial reads

        Returns:
        AudioFrame or None if insufficient data
        """

    def read_many(self, samples: int, partial: bool = True) -> list[AudioFrame]:
        """
        Read multiple frames.

        Parameters:
        - samples: int - Samples per frame
        - partial: bool - Allow partial final frame

        Returns:
        List of audio frames
        """

Audio Streams

Audio stream objects for encoding and decoding.

class AudioStream:
    """Audio stream in a container.

    Exposes the stream's audio parameters and its codec context, and offers
    ``encode`` / ``decode`` for converting between frames and packets.
    """

    # Properties
    type: Literal['audio']      # Stream type
    codec_context: AudioCodecContext # Codec context
    frame_size: int             # Encoder frame size
    sample_rate: int            # Sample rate
    rate: int                   # Alias for sample_rate
    bit_rate: int              # Bitrate
    channels: int               # Number of channels
    format: AudioFormat         # Sample format
    layout: AudioLayout         # Channel layout

    def encode(self, frame: AudioFrame | None = None) -> list[Packet]:
        """
        Encode audio frame.

        Parameters:
        - frame: AudioFrame | None - Frame to encode (None flushes)

        Returns:
        List of encoded packets
        """

    def decode(self, packet: Packet | None = None) -> list[AudioFrame]:
        """
        Decode audio packet.

        Parameters:
        - packet: Packet | None - Packet to decode (None flushes)

        Returns:
        List of decoded frames
        """

Audio Codec Context

Audio-specific codec context for encoding and decoding.

class AudioCodecContext:
    """Audio codec context.

    Carries the audio parameters used for encoding or decoding (sample
    rate, format, layout, frame size, bitrate) and performs the conversion
    between frames and packets.
    """

    # Properties
    type: Literal['audio']     # Context type
    frame_size: int            # Samples per frame
    sample_rate: int           # Sample rate
    rate: int                  # Alias for sample_rate
    format: AudioFormat        # Sample format
    layout: AudioLayout        # Channel layout
    channels: int              # Number of channels
    bit_rate: int             # Target bitrate

    def encode(self, frame: AudioFrame | None = None) -> list[Packet]:
        """Encode audio frame to packets.

        NOTE(review): ``frame=None`` presumably flushes the encoder, as
        documented for ``AudioStream.encode`` - confirm.
        """

    def encode_lazy(self, frame: AudioFrame | None = None) -> Iterator[Packet]:
        """Lazy encoding iterator.

        Returns an iterator of packets rather than a list.
        """

    def decode(self, packet: Packet | None = None) -> list[AudioFrame]:
        """Decode packet to audio frames.

        NOTE(review): ``packet=None`` presumably flushes the decoder, as
        documented for ``AudioStream.decode`` - confirm.
        """

Audio Planes

Individual audio data planes. Planar formats store each channel in its own plane; packed formats store all channels interleaved in a single plane.

class AudioPlane:
    """Audio data plane.

    One element of ``AudioFrame.planes``. It knows its parent frame and its
    own position within that frame, and exposes its raw data through the
    buffer-style methods below.
    """

    buffer_size: int        # Size of audio buffer
    frame: AudioFrame       # Parent frame
    index: int             # Plane index

    # Inherits Buffer methods for data access
    # update(): NOTE(review) presumably replaces the plane's contents with
    # the given bytes - confirm against the PyAV Buffer reference.
    def update(self, input: bytes) -> None: ...
    # __buffer__(): Python buffer-protocol hook, which lets the plane be
    # passed to consumers such as memoryview().
    def __buffer__(self, flags: int) -> memoryview: ...
    # __bytes__(): invoked by bytes(plane).
    def __bytes__(self) -> bytes: ...

Usage Examples

Basic Audio Processing

import av
import numpy as np

# Open audio file. The context manager closes the container even if
# decoding raises part-way through.
with av.open('audio.wav') as container:
    audio_stream = container.streams.audio[0]

    print(f"Sample rate: {audio_stream.sample_rate}")
    print(f"Channels: {audio_stream.channels}")
    print(f"Format: {audio_stream.format}")

    # Decode all frames
    for frame in container.decode(audio_stream):
        # Convert to numpy array
        array = frame.to_ndarray()
        print(f"Frame: {array.shape} samples")

        # Process audio data: reduce volume. Multiplying by a float promotes
        # integer samples (e.g. 's16') to float64, so cast back to the dtype
        # that corresponds to frame.format before rebuilding a frame.
        processed = (array * 0.5).astype(array.dtype)

        # Create new frame from processed data. from_ndarray() accepts only
        # the array, format, and layout; the sample rate and timestamp are
        # copied onto the new frame afterwards.
        new_frame = av.AudioFrame.from_ndarray(
            processed,
            format=frame.format.name,
            layout=frame.layout.name,
        )
        new_frame.sample_rate = frame.sample_rate
        new_frame.pts = frame.pts

Audio Format Conversion

import av

# Setup resampler
resampler = av.AudioResampler(
    format='s16',      # 16-bit signed integer
    layout='stereo',   # 2 channels
    rate=44100,        # 44.1kHz
)

# Open input. The context manager closes the container even if decoding
# or resampling raises.
with av.open('input.flac') as container:
    stream = container.streams.audio[0]

    # Process frames
    for frame in container.decode(stream):
        # Resample to target format; one input frame may yield several
        # output frames (or none), hence the inner loop.
        for resampled_frame in resampler.resample(frame):
            print(f"Resampled: {resampled_frame.format.name} "
                  f"{resampled_frame.layout.name} "
                  f"{resampled_frame.sample_rate}Hz")

# Flush resampler: passing None drains any samples it is still holding.
for final_frame in resampler.resample(None):
    print(f"Final frame: {final_frame.samples} samples")

Audio Encoding

import av
import numpy as np

# Create output container. The context manager closes the file even if
# encoding fails part-way through.
with av.open('output.aac', 'w') as output:
    # Add audio stream. The 'stereo' layout already fixes the channel
    # count, so the channel count is not assigned separately.
    stream = output.add_stream('aac', rate=44100)
    stream.layout = 'stereo'
    stream.sample_rate = 44100

    # Generate audio data
    duration = 5.0  # seconds
    sample_count = int(duration * stream.sample_rate)
    # endpoint=False keeps the sample spacing at exactly 1 / sample_rate.
    t = np.linspace(0, duration, sample_count, endpoint=False)
    frequency = 440  # A4 note

    # Generate stereo sine wave
    left_channel = np.sin(2 * np.pi * frequency * t) * 0.3
    right_channel = np.sin(2 * np.pi * frequency * 1.5 * t) * 0.3

    # column_stack yields (samples, channels). A packed format such as 'flt'
    # expects a single interleaved row, so flatten row by row (L, R, L, R...)
    # into shape (1, samples * channels).
    audio_data = np.column_stack([left_channel, right_channel])
    interleaved = audio_data.astype(np.float32).reshape(1, -1)

    # from_ndarray() takes no sample rate; set it on the frame afterwards.
    frame = av.AudioFrame.from_ndarray(
        interleaved,
        format='flt',
        layout='stereo',
    )
    frame.sample_rate = stream.sample_rate

    # Convert the generated frame to the encoder's own sample format so that
    # what is written to the FIFO matches the parameters the FIFO is
    # declared with below.
    resampler = av.AudioResampler(
        format=stream.format,
        layout=stream.layout,
        rate=stream.sample_rate,
    )

    # Create FIFO for frame size management
    fifo = av.AudioFifo(
        format=stream.format.name,
        layout=stream.layout.name,
        sample_rate=stream.sample_rate,
    )
    for converted in resampler.resample(frame):
        fifo.write(converted)
    for converted in resampler.resample(None):  # drain the resampler
        fifo.write(converted)

    # NOTE(review): frame_size may be reported as 0 until the encoder has
    # been opened; fall back to 1024 (AAC's frame length) so the loop below
    # always makes progress - confirm against the PyAV reference.
    frame_size = stream.frame_size or 1024

    # Read and encode in codec-appropriate frame sizes. The final chunk may
    # be shorter than frame_size so that no trailing samples are dropped.
    pts = 0
    while fifo.samples:
        chunk = fifo.read(min(frame_size, fifo.samples))
        chunk.pts = pts  # timestamps are counted in samples
        chunk.time_base = stream.time_base

        for packet in stream.encode(chunk):
            output.mux(packet)

        pts += chunk.samples

    # Flush encoder
    for packet in stream.encode():
        output.mux(packet)

Multi-Channel Audio Processing

import av
import numpy as np

# Open 5.1 surround sound file. The context manager closes the container
# even if decoding raises.
with av.open('surround.ac3') as container:
    stream = container.streams.audio[0]

    print(f"Layout: {stream.layout.name}")
    print(f"Channels: {stream.channels}")
    for i, channel in enumerate(stream.layout.channels):
        print(f"  Channel {i}: {channel.name} ({channel.description})")

    # Process each channel separately
    for frame in container.decode(stream):
        # to_ndarray() returns a 2-D array in the frame's own dtype:
        # (channels, samples) for planar formats and
        # (1, samples * channels) for packed formats.
        array = frame.to_ndarray()
        nb_channels = frame.layout.nb_channels

        if frame.format.is_planar:
            # Planar format - each row is already one channel
            per_channel = array
        else:
            # Packed format - channels interleaved in a single row;
            # de-interleave into one row per channel
            per_channel = array.reshape(-1, nb_channels).T

        for i, channel_data in enumerate(per_channel):
            print(f"Channel {i}: {len(channel_data)} samples")

Audio Analysis

import av
import numpy as np

def analyze_audio(filename):
    """Print per-frame and overall RMS / peak levels for an audio file.

    Decodes the first audio stream of *filename*, printing the RMS and peak
    of every frame and then a summary (duration, RMS, peak, dynamic range)
    for the whole file. Levels are computed over all channels and are
    expressed in the units of the decoded sample values.

    Parameters:
    - filename: str - Path of the audio file to analyze

    Returns:
    None; results are printed.
    """
    # Running totals, so the whole file never has to be held in memory.
    sum_squares = 0.0
    value_count = 0      # sample values across all channels
    sample_count = 0     # samples per channel, used for the duration
    overall_peak = 0.0

    # The context manager closes the container even if decoding raises.
    with av.open(filename) as container:
        stream = container.streams.audio[0]
        sample_rate = stream.sample_rate

        for frame_count, frame in enumerate(container.decode(stream), start=1):
            # Work in float64: squaring or taking abs() of integer samples
            # (e.g. int16) would overflow in the native dtype.
            array = frame.to_ndarray().astype(np.float64)
            if array.size == 0:
                continue

            # Frame-level analysis
            frame_squares = float(np.sum(np.square(array)))
            rms = np.sqrt(frame_squares / array.size)
            peak = float(np.max(np.abs(array)))
            print(f"Frame {frame_count}: RMS={rms:.3f}, Peak={peak:.3f}")

            sum_squares += frame_squares
            value_count += array.size
            sample_count += frame.samples
            overall_peak = max(overall_peak, peak)

    # Overall analysis
    if value_count:
        # Duration comes from the per-channel sample count, not from the
        # array length, which depends on the planar/packed layout.
        duration = sample_count / sample_rate
        overall_rms = np.sqrt(sum_squares / value_count)

        print("\nOverall Analysis:")
        print(f"Duration: {duration:.2f} seconds")
        print(f"RMS Level: {overall_rms:.3f}")
        print(f"Peak Level: {overall_peak:.3f}")
        if overall_rms > 0:
            print(f"Dynamic Range: {20*np.log10(overall_peak/overall_rms):.1f} dB")
        else:
            # Pure silence: the peak/RMS ratio is 0/0, so it is not reported.
            print("Dynamic Range: undefined (silent input)")

# Analyze audio file
analyze_audio('music.wav')

Install with Tessl CLI

npx tessl i tessl/pypi-av

docs

audio.md

codecs.md

containers.md

filters.md

index.md

streams.md

video.md

tile.json