`vllm.multimodal.media.audio` ¶

Classes:

AudioEmbeddingMediaIO –

Configuration values can be user-provided either by --media-io-kwargs or by the runtime API field "media_io_kwargs".
AudioMediaIO –

Configuration values can be user-provided either by --media-io-kwargs or by the runtime API field "media_io_kwargs".

Functions:

load_audio_pyav –

Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
load_audio_soundfile –

Load audio via soundfile

`AudioEmbeddingMediaIO` ¶

Bases: MediaIO[Tensor]

Configuration values can be user-provided either by --media-io-kwargs or by the runtime API field "media_io_kwargs". Ensure proper validation and error handling.

Source code in vllm/multimodal/media/audio.py

class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Media IO that reads audio embeddings as ``torch.Tensor`` objects.

    Configuration values can be user-provided either by --media-io-kwargs or
    by the runtime API field "media_io_kwargs". Ensure proper validation and
    error handling.
    """

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from raw bytes and return it in dense form."""
        # The sparse-invariant checks stay active for both the load and the
        # densification so that a maliciously crafted sparse tensor cannot
        # trigger out-of-bounds writes.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(BytesIO(data), weights_only=True)
            dense = loaded.to_dense()
        return dense

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Strictly decode base64 ``data`` and load the tensor it contains.

        ``media_type`` is accepted but not used by this implementation.
        """
        raw = pybase64.b64decode(data, validate=True)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from ``filepath`` and return it in dense form."""
        # Same protection as in ``load_bytes``: keep sparse tensor integrity
        # checks on while loading untrusted data.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(filepath, weights_only=True)
            dense = loaded.to_dense()
        return dense

    def encode_base64(self, media: torch.Tensor) -> str:
        """Encode ``media`` as a base64 string via ``tensor2base64``."""
        return tensor2base64(media)

`AudioMediaIO` ¶

Bases: MediaIO[tuple[NDArray, float]]

Configuration values can be user-provided either by --media-io-kwargs or by the runtime API field "media_io_kwargs". Ensure proper validation and error handling.

Source code in vllm/multimodal/media/audio.py

class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Media IO that reads audio as ``(waveform, sample_rate)`` pairs.

    Configuration values can be user-provided either by --media-io-kwargs or
    by the runtime API field "media_io_kwargs". Ensure proper validation and
    error handling.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__()

        # Custom arguments for this modality: the --media-io-kwargs values
        # merged with per-request runtime media_io_kwargs via merge_kwargs().
        # They are kept so they can be forwarded to underlying media loaders
        # (e.g. custom implementations) for flexible control.
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode in-memory audio bytes with ``load_audio`` (``sr=None``)."""
        stream = BytesIO(data)
        return load_audio(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode base64 ``data`` and load the audio it contains.

        ``media_type`` is accepted but not used by this implementation.
        """
        decoded = pybase64.b64decode(data)
        return self.load_bytes(decoded)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Load audio from ``filepath`` with ``load_audio`` (``sr=None``)."""
        return load_audio(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Serialize ``(audio, sample_rate)`` and return it base64-encoded.

        The audio is written with ``soundfile`` into an in-memory buffer
        using ``audio_format`` as the container format.
        """
        waveform, sample_rate = media

        with BytesIO() as sink:
            soundfile.write(sink, waveform, sample_rate, format=audio_format)
            encoded = pybase64.b64encode(sink.getvalue())

        return encoded.decode("utf-8")

`load_audio_pyav(path, *, sr=22050, mono=True, max_duration_s=None)` ¶

Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.

Decodes the first audio stream of the input. When `sr` is `None` the stream's native sample rate is kept; otherwise, if `sr` differs from the native rate, the decoded frames are resampled to `sr` with PyAV's `AudioResampler`. When `mono` is true, multi-channel audio is reduced to a single channel.

Parameters:

path ¶
(BytesIO | Path | str) –

An `io.BytesIO` buffer, a filesystem `pathlib.Path`, or a string path.
max_duration_s ¶
(float | None, default: None ) –

If set, abort decoding once the accumulated sample count exceeds this many seconds of audio. Prevents decompression-bomb attacks where a small compressed file expands into gigabytes of PCM.

Returns:

NDArray –

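The decoded waveform as a float32 NumPy array (1-D when reduced to mono); first element of the returned `(waveform, sample_rate)` tuple.
float –

The sample rate of the returned waveform in Hz: `sr` when given, otherwise the stream's native sample rate.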

Source code in vllm/multimodal/media/audio.py

def load_audio_pyav(
    path: BytesIO | Path | str,
    *,
    sr: float | None = 22050,
    mono: bool = True,
    max_duration_s: float | None = None,
) -> tuple[npt.NDArray, float]:
    """Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.

    Decodes the audio stream at its native sample rate. Channel reduction to
    mono is performed by averaging across channels.  Resampling to a
    model-specific rate is left to the downstream :class:`AudioResampler`.

    Args:
        path: A :class:`~io.BytesIO` buffer, a filesystem
            :class:`~pathlib.Path`, or a string path.
        max_duration_s: If set, abort decoding once the accumulated
            sample count exceeds this many seconds of audio.  Prevents
            decompression-bomb attacks where a small compressed file
            expands into gigabytes of PCM.

    Returns:
        ``(waveform, sample_rate)`` where *waveform* is a 1-D float32
        NumPy array and *sample_rate* is the native sample rate in Hz.
    """
    native_sr = None
    try:
        with av.open(path) as container:
            if not container.streams.audio:
                raise ValueError("No audio stream found.")
            stream = container.streams.audio[0]
            stream.thread_type = "AUTO"
            native_sr = stream.rate
            sr = sr or native_sr

            # Early rejection from container/stream metadata to avoid
            # wasting resources on decoding decompression bombs.
            if max_duration_s is not None:
                metadata_duration_s = None
                if stream.duration and stream.time_base:
                    metadata_duration_s = float(stream.duration * stream.time_base)
                elif container.duration:
                    metadata_duration_s = container.duration / 1_000_000
                if (
                    metadata_duration_s is not None
                    and metadata_duration_s > max_duration_s
                ):
                    raise ValueError(
                        f"Audio exceeds maximum allowed duration of "
                        f"{max_duration_s}s (metadata reports "
                        f"{metadata_duration_s:.1f}s). This limit "
                        f"prevents decompression-bomb attacks."
                    )

            max_samples = (
                int(sr * max_duration_s) if max_duration_s is not None else None
            )
            total_samples = 0

            chunks: list[npt.NDArray] = []
            needs_resampling = not math.isclose(
                float(sr),
                float(native_sr),
                rel_tol=0.0,
                abs_tol=1e-6,
            )
            resampler = (
                av.AudioResampler(format="fltp", layout="mono", rate=sr)
                if needs_resampling
                else None
            )
            for frame in container.decode(stream):
                if needs_resampling:
                    assert resampler is not None
                    for out_frame in resampler.resample(frame):
                        arr = out_frame.to_ndarray()
                        total_samples += arr.shape[-1]
                        chunks.append(arr)
                else:
                    arr = frame.to_ndarray()
                    total_samples += arr.shape[-1]
                    chunks.append(arr)

                if max_samples is not None and total_samples > max_samples:
                    raise ValueError(
                        f"Audio exceeds maximum allowed duration of "
                        f"{max_duration_s}s (decoded {total_samples} "
                        f"samples at {sr}Hz). This limit prevents "
                        f"decompression-bomb attacks."
                    )
    except (ValueError, ImportError):
        raise
    except Exception as e:
        raise ValueError(
            "Invalid or corrupted video data when extracting audio. "
            "Ensure the input is valid video bytes (e.g. a complete MP4)."
        ) from e

    if not chunks:
        raise ValueError("No audio found in the video.")

    audio = np.concatenate(chunks, axis=-1).astype(np.float32)
    if mono and audio.ndim > 1:
        audio = np.mean(audio, axis=0)

    return audio, sr

`load_audio_soundfile(path, *, sr=22050, mono=True, max_duration_s=None)` ¶

Load audio via soundfile

Source code in vllm/multimodal/media/audio.py

def load_audio_soundfile(
    path: BytesIO | Path | str,
    *,
    sr: float | None = 22050,
    mono: bool = True,
    max_duration_s: float | None = None,
) -> tuple[np.ndarray, int]:
    """Load audio via soundfile.

    Args:
        path: Buffer or path handed to ``soundfile.SoundFile``.
        sr: Target sample rate in Hz; ``None`` keeps the file's own rate.
        mono: If true, multi-dimensional audio is averaged down to 1-D.
        max_duration_s: If set, files whose header reports a longer
            duration are rejected before any samples are read.

    Returns:
        ``(waveform, sample_rate)``.  The waveform is float32; the sample
        rate is ``int(sr)`` when resampling happened, otherwise the file's
        own rate.

    Raises:
        ValueError: If the file is longer than ``max_duration_s``.
    """
    with soundfile.SoundFile(path) as snd:
        native_sr = snd.samplerate
        if max_duration_s is not None:
            # Duration comes from the frame count in the header, so the
            # check runs before the samples are read into memory.
            file_duration_s = snd.frames / native_sr
            if file_duration_s > max_duration_s:
                raise ValueError(
                    f"Audio exceeds maximum allowed duration of "
                    f"{max_duration_s}s (file contains "
                    f"{file_duration_s:.1f}s at {native_sr}Hz). "
                    f"This limit prevents decompression-bomb attacks."
                )
        samples = snd.read(dtype="float32", always_2d=False)

    # Transpose so that, for multi-channel input, the last axis is time.
    waveform = samples.T

    if mono and waveform.ndim > 1:
        leading_axes = tuple(range(waveform.ndim - 1))
        waveform = np.mean(waveform, axis=leading_axes)

    # Nothing to resample: hand back the audio at the file's own rate.
    if sr is None or sr == native_sr:
        return waveform, native_sr

    resampled = resample_audio_pyav(waveform, orig_sr=native_sr, target_sr=sr)
    return resampled, int(sr)

`vllm.multimodal.media.audio` ¶

`AudioEmbeddingMediaIO` ¶

`AudioMediaIO` ¶

`load_audio_pyav(path, *, sr=22050, mono=True, max_duration_s=None)` ¶

`path` ¶

`max_duration_s` ¶

`load_audio_soundfile(path, *, sr=22050, mono=True, max_duration_s=None)` ¶

vllm.multimodal.media.audio ¶

AudioEmbeddingMediaIO ¶

AudioMediaIO ¶

load_audio_pyav(path, *, sr=22050, mono=True, max_duration_s=None) ¶

path ¶

max_duration_s ¶

load_audio_soundfile(path, *, sr=22050, mono=True, max_duration_s=None) ¶

`vllm.multimodal.media.audio` ¶

`AudioEmbeddingMediaIO` ¶

`AudioMediaIO` ¶

`load_audio_pyav(path, *, sr=22050, mono=True, max_duration_s=None)` ¶

`path` ¶

`max_duration_s` ¶

`load_audio_soundfile(path, *, sr=22050, mono=True, max_duration_s=None)` ¶