Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
251 lines
12 KiB
Python
251 lines
12 KiB
Python
# Copyright 2026 HuggingFace Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import copy
|
|
import random
|
|
from inspect import signature
|
|
from unittest.mock import patch
|
|
|
|
from .multimodal_tester import MultiModalModelTest, MultiModalModelTester
|
|
from .test_modeling_common import (
|
|
floats_tensor,
|
|
ids_tensor,
|
|
is_torch_available,
|
|
torch_device,
|
|
)
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
|
|
class ALMModelTester(MultiModalModelTester):
    """Model tester for audio-language models, layered on `MultiModalModelTester`.

    Supplies the audio sub-config plus the audio side of the dummy inputs: the feature tensor, an
    optional audio-level mask, and audio placeholder tokens written into `input_ids`.
    """

    audio_config_class = None
    audio_config_key = "audio_config"
    # Forward-argument name under which the audio mask is handed to the model (e.g.
    # "feature_attention_mask" for Qwen2Audio). Keep it `None` when the model takes no separate
    # audio-level mask; `_prepare_modality_inputs` then leaves the mask out of the inputs dict.
    audio_mask_key = None
    _required_attributes = MultiModalModelTester._required_attributes + ("audio_config_class",)

    @property
    def pipeline_model_mapping(self):
        """Mapping from pipeline task name to the model class exercised for it."""
        # TODO: @eustlb, we don't have pipeline testing for audio-text-to-text, so the entry
        # `"audio-text-to-text": self.conditional_generation_class` is not registered yet.
        # TODO: should we add automatic-speech-recognition with a special flag?
        return {"feature-extraction": self.base_model_class}

    def __init__(self, parent, **kwargs):
        """Fill in ALM defaults for any setting the caller did not pass, then defer to the base."""
        fallbacks = {
            # Overrides of _TEXT_MODEL_TESTER_DEFAULTS
            "seq_length": 32,
            "pad_token_id": 1,
            # ALM-specific defaults
            "feat_seq_length": 128,
            "num_mel_bins": 80,
            "audio_token_id": 0,
        }
        for option, value in fallbacks.items():
            kwargs.setdefault(option, value)

        super().__init__(parent, **kwargs)

    # -- Overridable ALM-specific hooks ------------------------------------------------------

    def create_audio_features(self):
        """Build the audio feature tensor, shaped [batch_size, num_mel_bins, feat_seq_length].

        Override when the model expects another layout (e.g. [B, T, features]).
        """
        feature_shape = [self.batch_size, self.num_mel_bins, self.feat_seq_length]
        return floats_tensor(feature_shape)

    def get_audio_embeds_mask(self, audio_embeds_mask):
        """Turn the audio-level mask into the mask over audio embedding positions.

        `_prepare_modality_inputs` passes in the output of `create_audio_mask` and sums the result
        over dim 1 to get the number of placeholder tokens per sample. Must be overridden.
        """
        raise NotImplementedError("This method should be overridden in the subclass")

    def place_audio_tokens(self, input_ids, config, num_audio_tokens):
        """Write audio placeholder tokens as one contiguous run right after BOS.

        Position 0 stays reserved for BOS and the placeholders occupy [1:1+n]. This deterministic
        layout leaves the tail of every sequence text-only, which downstream tests (e.g.
        resize_token_embeddings overwriting column -2) rely on. Override for a different placement.

        `num_audio_tokens` is either a tensor indexed per batch row or a single count shared by all
        rows. A copy of `input_ids` is returned; the argument itself is not modified.

        Raises:
            ValueError: if a row needs more than `seq_length - 1` placeholder positions.
        """
        placed = input_ids.clone()
        # Clear any placeholder ids that were sampled by chance so only the run below remains.
        placed.masked_fill_(placed == self.audio_token_id, self.pad_token_id)

        counts_are_per_row = isinstance(num_audio_tokens, torch.Tensor)
        for row in range(placed.shape[0]):
            count = int(num_audio_tokens[row].item() if counts_are_per_row else num_audio_tokens)
            run_end = 1 + count
            if run_end > self.seq_length:
                raise ValueError(
                    f"Cannot place {count} audio tokens after BOS in a sequence of length {self.seq_length}. "
                    "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. "
                    "Please ensure `seq_length` is >= the number of audio embedding positions + 1."
                )
            placed[row, 1:run_end] = self.audio_token_id
        return placed

    def get_audio_feature_key(self):
        """Name of the inputs-dict entry that carries the audio features."""
        return "input_features"

    def create_audio_mask(self):
        """Build an audio-level attention mask with one contiguous valid window per sample.

        Every sample gets its own window start and length, giving rows like [0, 0, 1, 1, 1, 0, 0],
        and one sample is forced to a window spanning the full `feat_seq_length`.
        """
        total_frames = self.feat_seq_length
        # A locally seeded RNG makes repeated calls within a test yield the same mask.
        rng = random.Random(0)

        # Window lengths are drawn first, shifted by one and capped at `feat_seq_length`.
        window_len = ids_tensor([self.batch_size], vocab_size=total_frames, rng=rng).abs().add(1)
        window_len = window_len.clamp(max=total_frames)

        # Presuming feat_seq_length is set correctly, force one sample to a full-length window so
        # the batch always contains valid audio tokens.
        full_length_row = rng.randint(0, self.batch_size - 1)
        window_len[full_length_row] = total_frames

        # Window starts are folded into [0, feat_seq_length - length] so the window always fits.
        window_start = ids_tensor([self.batch_size], vocab_size=total_frames, rng=rng).abs()
        window_start = (window_start % (total_frames - window_len + 1)).unsqueeze(1)
        window_stop = window_start + window_len.unsqueeze(1)

        frame_index = torch.arange(total_frames, device=torch_device).unsqueeze(0)
        inside_window = (frame_index >= window_start) & (frame_index < window_stop)
        return inside_window.long()

    # -- Hooks consumed by the shared base ---------------------------------------------------

    @property
    def _special_token_ids(self):
        """Special token ids of the base tester, extended with the audio placeholder id."""
        inherited_ids = super()._special_token_ids
        return inherited_ids | {self.audio_token_id}

    def _build_modality_sub_configs(self):
        """Return the audio sub-config keyed by `audio_config_key`."""
        sub_config_name = self.audio_config_key
        return {sub_config_name: self.get_audio_config()}

    def _prepare_modality_inputs(self, input_ids, config):
        """Create the audio inputs and insert the matching placeholder tokens into `input_ids`.

        Returns:
            A tuple of the updated `input_ids` and a dict holding the audio features, plus the
            audio mask under `audio_mask_key` when that key is set.
        """
        features = self.create_audio_features()
        mask = self.create_audio_mask()
        # Each sample needs as many placeholder tokens as it has valid audio embedding positions.
        tokens_per_sample = self.get_audio_embeds_mask(mask).sum(dim=1)
        input_ids = self.place_audio_tokens(input_ids, config, tokens_per_sample)

        extra_inputs = {self.get_audio_feature_key(): features}
        mask_name = self.audio_mask_key
        if mask_name is not None:
            extra_inputs[mask_name] = mask
        return input_ids, extra_inputs

    # -- Audio sub-config construction -------------------------------------------------------

    @property
    def audio_config_args(self):
        """Parameter names of `audio_config_class.__init__`, in declaration order."""
        init_signature = signature(self.audio_config_class.__init__)
        return list(init_signature.parameters)

    def get_audio_config(self):
        """Instantiate `audio_config_class` from the kwargs gathered by `_collect_kwargs`."""
        init_kwargs = self._collect_kwargs(self.audio_config_args, self.audio_config_class)
        return self.audio_config_class(**init_kwargs)
|
|
|
|
|
|
class ALMModelTest(MultiModalModelTest):
    """
    Common test suite for Audio-Language Models.

    Required on subclasses:
        - `model_tester_class`: the tester class to use (a subclass of ALMModelTester)

    May be overridden on subclasses:
        - `all_model_classes`: when the default coming from the model tester is not suitable
        - `pipeline_model_mapping`: when the default coming from the model tester is not suitable
    """

    def test_sdpa_can_dispatch_on_flash(self):
        # The parent `test_sdpa_can_dispatch_on_flash` already pops the attention mask, but the
        # audio mask cannot simply be popped as well: that raises an error in `get_audio_features`
        # (cf. `test_mismatching_num_audio_tokens`). A mask made entirely of ones is used instead.
        tester = self.model_tester

        def all_ones_audio_mask():
            mask_shape = [tester.batch_size, tester.feat_seq_length]
            return torch.ones(mask_shape, dtype=torch.bool, device=torch_device)

        with patch.object(tester, "create_audio_mask", new=all_ones_audio_mask):
            super().test_sdpa_can_dispatch_on_flash()

    def test_mismatching_num_audio_tokens(self):
        """
        Checks that ALMs raise an error with an explicit message when the number of audios does
        not match the number of audio tokens in the text.
        The multi-audio case, where one prompt holds several audio segments, is covered as well.
        """
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        feature_key = self.model_tester.get_audio_feature_key()
        mask_key = self.model_tester.audio_mask_key
        # Audio-side entries that always have to be edited together (features first, then mask).
        audio_keys = [feature_key] if mask_key is None else [feature_key, mask_key]

        # Duplicate the sample that holds the most audio placeholder tokens. The row that
        # `create_audio_mask` pinned to full length contributes > 0 audio tokens even for encoders
        # that aggressively downsample (e.g. GlmAsr), so duplicating it in Test 2 reliably moves
        # the audio-token total.
        audio_token_id = self.model_tester.audio_token_id
        tokens_per_row = (input_dict["input_ids"] == audio_token_id).sum(-1)
        dup_idx = int(tokens_per_row.argmax().item())

        def pristine_inputs():
            return copy.deepcopy(input_dict)

        def repeat_text_along_sequence(inputs):
            # Doubles the prompt, and with it the number of audio tokens, for every sample.
            for text_key in ("input_ids", "attention_mask"):
                inputs[text_key] = torch.cat([inputs[text_key], inputs[text_key]], dim=1)

        for model_class in self.all_model_classes:
            model = model_class(config).to(torch_device)
            model.eval()
            inputs = pristine_inputs()
            _ = model(**inputs)  # successful forward with no modifications

            # Test 1: remove one audio but leave the audio tokens in the text
            for audio_key in audio_keys:
                inputs[audio_key] = inputs[audio_key][-1:, ...]
            with self.assertRaises(ValueError):
                _ = model(**inputs)

            # Test 2: add one audio but leave the audio tokens in the text
            inputs = pristine_inputs()
            for audio_key in audio_keys:
                extra_audio = inputs[audio_key][dup_idx : dup_idx + 1, ...]
                inputs[audio_key] = torch.cat([inputs[audio_key], extra_audio], dim=0)
            with self.assertRaises(ValueError):
                _ = model(**inputs)

            # Test 3: duplicate the text along the seq dim so each prompt has twice as many
            # audio tokens, while leaving the audio features unchanged -> mismatch
            inputs = pristine_inputs()
            repeat_text_along_sequence(inputs)
            with self.assertRaises(ValueError):
                _ = model(**inputs)

            # Test 4: multi-audio valid case. A prompt may contain multiple audio segments;
            # all audio segments are concatenated along the batch dim on the audio side.
            # Duplicating input_ids along seq dim (-> [audios, audios] per prompt) and the
            # audio features along batch dim (-> batch_size * 2) must forward successfully.
            inputs = pristine_inputs()
            repeat_text_along_sequence(inputs)
            for audio_key in audio_keys:
                inputs[audio_key] = torch.cat([inputs[audio_key], inputs[audio_key]], dim=0)
            _ = model(**inputs)
|