first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,959 @@
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Qwen2.5-Omni model."""
import tempfile
import unittest
from io import BytesIO
from urllib.request import urlopen
import librosa
import pytest
import requests
from transformers import (
AutoProcessor,
Qwen2_5OmniForConditionalGeneration,
Qwen2_5OmniThinkerConfig,
Qwen2_5OmniThinkerForConditionalGeneration,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
Expectations,
cleanup,
require_flash_attn,
require_torch,
require_torch_accelerator,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
ids_tensor,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class Qwen2_5OmniThinkerForConditionalGenerationTester:
    """
    Builds a tiny `Qwen2_5OmniThinkerConfig` together with matching dummy audio, image and text inputs.

    The sub-config dictionaries default to `None` and are materialised per instance, so mutating the
    config of one tester can never leak into another tester.
    """

    @staticmethod
    def _default_vision_config():
        """Return a fresh dict with the tiny vision-encoder settings used when no `vision_config` is given."""
        return {
            "depth": 2,
            "embed_dim": 32,
            "hidden_act": "quick_gelu",
            "hidden_size": 32,
            "out_hidden_size": 32,
            "intermediate_size": 24,
            "mlp_ratio": 4,
            "num_heads": 4,
            "patch_size": 14,
            "spatial_merge_size": 1,
            "temporal_patch_size": 2,
            "fullatt_block_indexes": [0],
            "initializer_range": 0.02,
        }

    @staticmethod
    def _default_audio_config():
        """Return a fresh dict with the tiny audio-encoder settings used when no `audio_config` is given."""
        return {
            "model_type": "qwen_omni_thinker_audio_encoder",
            "d_model": 32,
            "encoder_attention_heads": 4,
            "encoder_ffn_dim": 32,
            "encoder_layers": 2,
            "num_mel_bins": 20,
            "max_source_positions": 1500,
            "initializer_range": 0.02,
            "n_window": 100,
            "output_dim": 32,
        }

    @staticmethod
    def _default_text_config():
        """Return a fresh dict with the tiny text-decoder settings used when no `text_config` is given."""
        return {
            "rope_parameters": {"mrope_section": [1, 1, 2], "rope_type": "default", "type": "default"},
            "vocab_size": 99,
            "hidden_size": 32,
            "intermediate_size": 37,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "num_key_value_heads": 2,
            "hidden_act": "silu",
            "max_position_embeddings": 1024,
            "rms_norm_eps": 1e-06,
            "use_cache": True,
            "tie_word_embeddings": False,
            "rope_theta": 1000000.0,
            "use_sliding_window": False,
            "sliding_window": 50,
            "max_window_layers": 3,
            "attention_dropout": 0.0,
            "pad_token_id": 0,
            "initializer_range": 0.02,
        }

    def __init__(
        self,
        parent,
        batch_size=3,
        feat_seq_length=30,
        num_channels=3,
        image_size=14,
        seq_length=39,
        vision_config=None,
        audio_config=None,
        text_config=None,
        audio_token_index=1,
        image_token_index=2,
        video_token_index=3,
        position_id_per_seconds=25,
        seconds_per_chunk=2,
        audio_start_token_id=4,
        audio_end_token_id=5,
        user_token_id=6,
        vision_start_token_id=7,
        vision_end_token_id=8,
        initializer_range=0.02,
    ):
        """
        Store the test hyper-parameters.

        Args:
            parent: The `unittest.TestCase` that owns this tester; used for assertions.
            vision_config / audio_config / text_config: Optional dicts overriding the tiny defaults.
                When given, the dict is stored as-is (not copied); when `None`, a fresh default is built.
            Remaining arguments are stored unchanged on the instance and forwarded to the config
            or used to size the dummy inputs.
        """
        self.parent = parent
        # Build the defaults here rather than in the signature: dict defaults would be shared by all instances.
        self.audio_config = self._default_audio_config() if audio_config is None else audio_config
        self.vision_config = self._default_vision_config() if vision_config is None else vision_config
        self.text_config = self._default_text_config() if text_config is None else text_config
        self.audio_token_index = audio_token_index
        self.image_token_index = image_token_index
        self.video_token_index = video_token_index
        self.position_id_per_seconds = position_id_per_seconds
        self.seconds_per_chunk = seconds_per_chunk
        self.audio_start_token_id = audio_start_token_id
        self.audio_end_token_id = audio_end_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        self.user_token_id = user_token_id
        self.initializer_range = initializer_range
        self.batch_size = batch_size
        self.feat_seq_length = feat_seq_length
        self.num_channels = num_channels
        self.image_size = image_size
        self.seq_length = seq_length
        self.is_training = False

        # Used from `self.model_tester` by common model tests
        self.num_hidden_layers = self.text_config["num_hidden_layers"]
        self.hidden_size = self.text_config["hidden_size"]
        self.num_attention_heads = self.text_config["num_attention_heads"]
        self.vocab_size = self.text_config["vocab_size"]

    def get_config(self):
        """Return a `Qwen2_5OmniThinkerConfig` built from the stored sub-configs and special token ids."""
        return Qwen2_5OmniThinkerConfig(
            audio_config=self.audio_config,
            vision_config=self.vision_config,
            text_config=self.text_config,
            audio_token_index=self.audio_token_index,
            image_token_index=self.image_token_index,
            video_token_index=self.video_token_index,
            position_id_per_seconds=self.position_id_per_seconds,
            seconds_per_chunk=self.seconds_per_chunk,
            audio_start_token_id=self.audio_start_token_id,
            audio_end_token_id=self.audio_end_token_id,
            vision_start_token_id=self.vision_start_token_id,
            vision_end_token_id=self.vision_end_token_id,
            user_token_id=self.user_token_id,
            initializer_range=self.initializer_range,
        )

    def prepare_config_and_inputs(self):
        """
        Create the config plus random vision and audio inputs.

        Returns:
            Tuple `(config, pixel_values, pixel_grid_thw, input_features_values, feature_attention_mask)`.
        """
        config = self.get_config()
        patch_size = config.vision_config.patch_size
        temporal_patch_size = config.vision_config.temporal_patch_size
        pixel_values = floats_tensor(
            [
                self.batch_size * (self.image_size**2) // (patch_size**2),
                self.num_channels * (patch_size**2) * temporal_patch_size,
            ]
        )
        # Integer division keeps the grid entries as ints, which is what a LongTensor is meant to hold.
        grid_side = self.image_size // patch_size
        pixel_grid_thw = torch.LongTensor([[1, grid_side, grid_side]] * self.batch_size).to(pixel_values.device)
        input_features_values = floats_tensor(
            [self.batch_size, self.audio_config["num_mel_bins"], self.feat_seq_length]
        )
        feature_attention_mask = torch.ones([self.batch_size, self.feat_seq_length], dtype=torch.long).to(torch_device)
        return config, pixel_values, pixel_grid_thw, input_features_values, feature_attention_mask

    def prepare_config_and_inputs_for_common(self):
        """
        Create the config and the keyword-argument dict consumed by the common model tests.

        Random text ids are scrubbed of special tokens, then one audio span and one image placeholder
        (each wrapped in its start/end ids) are written into every row.

        Returns:
            Tuple `(config, inputs_dict)`.
        """
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, pixel_grid_thw, input_features_values, feature_attention_mask = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.get_text_config().vocab_size - 3) + 3
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)

        # Make sure no other tokens are set to special, to prevent flakiness
        # NOTE(review): `video_token_index` is not scrubbed here although the random ids are offset by 3,
        # which is its default value — confirm whether it should be part of this list.
        tokens_to_replace = torch.tensor(
            [
                config.image_token_index,
                config.audio_token_index,
                config.audio_start_token_id,
                config.audio_end_token_id,
                config.vision_start_token_id,
                config.vision_end_token_id,
            ],
            device=input_ids.device,
        )
        input_ids[torch.isin(input_ids, tokens_to_replace)] = config.text_config.pad_token_id

        # The first position of every row is masked out.
        attention_mask[:, :1] = 0

        # Audio token placeholders should be wrapped in start and end token ids
        audio_feat_length = ((self.feat_seq_length - 1) // 2 + 1 - 2) // 2 + 1
        input_ids[:, 1] = config.audio_start_token_id
        input_ids[:, 2 : (2 + audio_feat_length)] = config.audio_token_index
        input_ids[:, 2 + audio_feat_length] = config.audio_end_token_id

        # Image token placeholders should be wrapped in start and end token ids
        input_ids[:, -4:-1] = torch.tensor(
            [config.vision_start_token_id, config.image_token_index, config.vision_end_token_id]
        )
        inputs_dict = {
            "input_features": input_features_values,
            "feature_attention_mask": feature_attention_mask,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "image_grid_thw": pixel_grid_thw,
            "pixel_values": pixel_values,
        }
        return config, inputs_dict

    def create_and_check_qwenomnithinker_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask):
        """Run one forward pass under float16 autocast and assert that the logits contain no NaN."""
        model = Qwen2_5OmniThinkerForConditionalGeneration(config=config)
        model.to(torch_device)
        model.eval()
        with torch.autocast(device_type=torch_device, dtype=torch.float16):
            # NOTE(review): pixel values are cast to bfloat16 while autocast targets float16 — confirm intended.
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values.to(torch.bfloat16),
                return_dict=True,
            )["logits"]
        self.parent.assertFalse(torch.isnan(logits).any().item())
@require_torch
class Qwen2_5OmniThinkerForConditionalGenerationModelTest(
    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
):
    """
    Model tester for `Qwen2_5OmniThinkerForConditionalGeneration`.
    """

    all_model_classes = (Qwen2_5OmniThinkerForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (Qwen2_5OmniThinkerForConditionalGeneration,) if is_torch_available() else ()
    # pipeline_model_mapping = (
    #     {
    #         "any-to-any": Qwen2_5OmniForConditionalGeneration,
    #         "image-text-to-text": Qwen2_5OmniThinkerForConditionalGeneration,
    #     }
    #     if is_torch_available()
    #     else {}
    # )
    # FIXME @raushan Omni tests take ages because the model is big. Try to make it even smaller
    pipeline_model_mapping = {}
    skip_test_audio_features_output_shape = True  # Qwen2_5Omni merges batch_size and audio_output_lengths in index 0
    _is_composite = True
    model_split_percents = [0.5, 0.9]

    def setUp(self):
        """Create the model tester and the config tester used by the common test mixins."""
        self.model_tester = Qwen2_5OmniThinkerForConditionalGenerationTester(self)
        self.config_tester = ConfigTester(self, config_class=Qwen2_5OmniThinkerConfig, has_text_modality=False)

    # The two reasons below were attached to the wrong tests; each now names the feature its test covers.
    @unittest.skip(reason="Disk offload bin not yet supported because in QwenOmniThinker models")
    def test_disk_offload_bin(self):
        pass

    @unittest.skip(reason="Cpu not yet supported because in QwenOmniThinker models")
    def test_cpu_offload(self):
        pass

    @unittest.skip(reason="Disk offload safetensors not yet supported because in QwenOmniThinker models")
    def test_disk_offload_safetensors(self):
        pass

    @unittest.skip(reason="Correct missing keys not yet supported because in QwenOmniThinker models")
    def test_correct_missing_keys(self):
        pass

    @unittest.skip(reason="Compile not yet supported because in QwenOmniThinker models")
    @pytest.mark.torch_compile_test
    def test_sdpa_can_compile_dynamic(self):
        pass

    @unittest.skip(reason="Sdpa dispatch not yet supported because in QwenOmniThinker models")
    def test_sdpa_can_dispatch_on_flash(self):
        pass

    @unittest.skip(reason="QwenOmniThinker does not support output_hidden_states test")
    def test_model_outputs_equivalence(self):
        pass

    @unittest.skip("Qwen2Omni has no base model, model architecture is special")
    def test_model_base_model_prefix(self):
        pass

    def test_sdpa_can_dispatch_composite_models(self):
        """
        Check that a saved-and-reloaded model dispatches each sub-model to SDPA when supported,
        and that requesting `attn_implementation="eager"` yields eager attention everywhere.
        """
        # overwrite because Qwen2 is audio+text model (not vision+text)
        if not self.has_attentions:
            self.skipTest(reason="Model architecture does not support attentions")

        if not self._is_composite:
            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")

        for model_class in self.all_model_classes:
            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                model_sdpa = model_class.from_pretrained(tmpdirname)
                model_sdpa = model_sdpa.eval().to(torch_device)

                text_attn = "sdpa" if model_sdpa.model._supports_sdpa else "eager"
                audio_attn = "sdpa" if model_sdpa.audio_tower._supports_sdpa else "eager"
                vision_attn = "sdpa" if model_sdpa.visual._supports_sdpa else "eager"

                # `None` as it is the requested one which will be assigned to each sub-config
                # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
                # The checks target the reloaded `model_sdpa`: that is the model whose dispatch is under test.
                self.assertEqual(model_sdpa.config._attn_implementation, "sdpa")
                self.assertEqual(model_sdpa.model.config._attn_implementation, text_attn)
                self.assertEqual(model_sdpa.audio_tower.config._attn_implementation, audio_attn)
                self.assertEqual(model_sdpa.visual.config._attn_implementation, vision_attn)

                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
                model_eager = model_eager.eval().to(torch_device)
                self.assertEqual(model_eager.config._attn_implementation, "eager")
                self.assertEqual(model_eager.model.config._attn_implementation, "eager")
                self.assertEqual(model_eager.audio_tower.config._attn_implementation, "eager")
                self.assertEqual(model_eager.visual.config._attn_implementation, "eager")

                for submodule in model_eager.modules():
                    class_name = submodule.__class__.__name__
                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
                        self.fail("The eager model should not have SDPA attention layers")

    def attention_mask_padding_matches_padding_free_with_position_ids(
        self, attn_implementation: str, fa_kwargs: bool = False
    ):
        """
        Compare logits of a padded batch against the same tokens packed into one padding-free row.

        Args:
            attn_implementation: Attention backend passed to `from_pretrained`.
            fa_kwargs: When `True`, also pass `cu_seq_lens_*` / `max_length_*` to the padding-free call.
        """
        max_new_tokens = 30

        for model_class in self.all_generative_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

            dummy_input = inputs_dict[model_class.main_input_name]
            if dummy_input.dtype in [torch.float32, torch.float16]:
                dummy_input = dummy_input.to(torch.bfloat16)

            # make sure that all models have enough positions for generation
            if hasattr(config, "max_position_embeddings"):
                config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1

            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)

                # Flip the mask when its last column contains padding, then overwrite padded ids with pad.
                if 0 in inputs_dict["attention_mask"][:, -1]:
                    inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1)
                dummy_attention_mask = inputs_dict["attention_mask"]
                inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.get_text_config().pad_token_id

                model = (
                    model_class.from_pretrained(
                        tmpdirname,
                        dtype=torch.bfloat16,
                        attn_implementation=attn_implementation,
                    )
                    .to(torch_device)
                    .eval()
                )

                # flatten
                padfree_inputs_dict = {
                    "input_features": inputs_dict["input_features"],
                    "feature_attention_mask": inputs_dict["feature_attention_mask"],
                    "pixel_values": inputs_dict["pixel_values"],
                    "image_grid_thw": inputs_dict["image_grid_thw"],
                    "input_ids": inputs_dict["input_ids"][dummy_attention_mask.bool()].unsqueeze(0),
                }

                # add position_ids
                vision_position_ids, deltas = model.get_rope_index(
                    input_ids=inputs_dict["input_ids"],
                    image_grid_thw=inputs_dict["image_grid_thw"],
                    attention_mask=inputs_dict["attention_mask"],
                    audio_seqlens=torch.sum(inputs_dict["feature_attention_mask"], dim=1),
                )  # [3, bs, padded-seq-len]
                vision_padfree_positions = vision_position_ids[:, dummy_attention_mask.bool()].view(
                    3, -1
                )  # [3, bs*padfree-len]
                seq_lens = dummy_attention_mask.sum(1).tolist()
                text_padfree_positions = torch.cat(
                    [torch.arange(length) for length in seq_lens]
                )  # [1, bs*padfree-len]
                text_padfree_positions = text_padfree_positions.long().unsqueeze(0).to(torch_device)
                padfree_inputs_dict["position_ids"] = torch.cat([text_padfree_positions, vision_padfree_positions])[
                    :, None, :
                ]

                if fa_kwargs:
                    # The longest sequence is the max of the per-row lengths. Differencing the raw
                    # (non-cumulative) lengths is only right when the first row is the longest.
                    max_length = max(seq_lens)
                    cu_seq_lens = (
                        torch.tensor([0] + seq_lens, device=torch_device).cumsum(-1).to(dtype=torch.int32)
                    )
                    padfree_inputs_dict.update(
                        {
                            "cu_seq_lens_q": cu_seq_lens,
                            "cu_seq_lens_k": cu_seq_lens,
                            "max_length_q": max_length,
                            "max_length_k": max_length,
                        }
                    )

                res_padded = model(**inputs_dict, use_cache=False)
                res_padfree = model(**padfree_inputs_dict, use_cache=False)

                logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
                logits_padfree = res_padfree.logits[0]

                # acceptable numerical instability
                tol = torch.finfo(torch.bfloat16).eps
                torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)

    @unittest.skip("Cannot generate from inputs embeds")
    def test_generate_from_inputs_embeds_with_static_cache(self):
        pass

    # TODO (joao, raushan): there are multiple standardization issues in this model that prevent this test from
    # passing, fix me
    @unittest.skip("Cannot handle 4D attention mask")
    @pytest.mark.torch_compile_test
    def test_generate_compile_model_forward_fullgraph(self):
        pass

    @unittest.skip("Cannot handle 4D attention mask")
    def test_generate_compilation_all_outputs(self):
        pass

    @unittest.skip("Cannot handle 4D attention mask")
    def test_generate_with_static_cache(self):
        pass

    @unittest.skip("Cannot handle 4D attention mask")
    def test_custom_4d_attention_mask(self):
        pass

    def test_get_rope_index_video_with_audio(self):
        """Check `get_rope_index` against hand-written position ids for a video with interleaved audio."""
        image_grid_thw = torch.empty((0, 3), dtype=torch.long)

        # 3 * 2 * 2 = 12 video tokens
        # NOTE(review): only this tensor is created on `torch_device`; the other inputs and the expected
        # ids use the default device — confirm this is intended on accelerators.
        video_grid_thw = torch.tensor([[3, 2, 2]], dtype=torch.long, device=torch_device)

        # num_audio_tokens = ((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1
        # i.e.: 300 audio_seqlen -> 75 audio tokens
        audio_seqlens = torch.tensor([300], dtype=torch.long)

        second_per_grids = torch.tensor([1.0], dtype=torch.float)

        use_audio_in_video = True

        # fmt: off
        expected_position_ids = torch.tensor([
            [[
                0, 1, # text
                2, 2, # vision_bos + audio_bos

                # video chunk
                3, 3, 3, 3,
                28, 28, 28, 28,

                # audio chunk
                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                45, 46, 47, 48, 49, 50, 51, 52,

                # video chunk
                53, 53, 53, 53,

                # audio chunk
                53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,

                78, 78, # audio_eos + vision_eos
                79, 80, # text
            ]],
            [[
                0, 1, # text
                2, 2, # vision_bos + audio_bos

                # video chunk
                3, 3, 4, 4,
                3, 3, 4, 4,

                # audio chunk
                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                45, 46, 47, 48, 49, 50, 51, 52,

                # video chunk
                3, 3, 4, 4,

                # audio chunk
                53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,

                78, 78, # audio_eos + vision_eos
                79, 80, # text
            ]],
            [[
                0, 1, # text
                2, 2, # vision_bos + audio_bos

                # video chunk
                3, 4, 3, 4,
                3, 4, 3, 4,

                # audio chunk
                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
                31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                45, 46, 47, 48, 49, 50, 51, 52,

                # video chunk
                3, 4, 3, 4,

                # audio chunk
                53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,

                78, 78, # audio_eos + vision_eos
                79, 80, # text
            ]],
        ], dtype=torch.long)
        # fmt: on

        for model_class in self.all_model_classes:
            config, _ = self.model_tester.prepare_config_and_inputs_for_common()

            input_ids = torch.tensor(
                [
                    [
                        100,
                        101,
                    ]
                    + [
                        config.vision_start_token_id,
                        config.audio_start_token_id,
                    ]
                    # 1st chunk: 8 video tokens, 50 audio tokens
                    + [config.video_token_id] * 2 * 2 * 2
                    + [config.audio_token_id] * 50
                    +
                    # 2nd chunk: 4 video tokens, 25 audio tokens
                    [config.video_token_id] * 1 * 2 * 2
                    + [config.audio_token_id] * 25
                    + [
                        config.audio_end_token_id,
                        config.vision_end_token_id,
                    ]
                    + [
                        102,
                        103,
                    ]
                ],
                dtype=torch.long,
            )

            model = model_class(config)

            position_ids, mrope_position_deltas = model.get_rope_index(
                input_ids=input_ids,
                image_grid_thw=image_grid_thw,
                video_grid_thw=video_grid_thw,
                attention_mask=None,
                use_audio_in_video=use_audio_in_video,
                audio_seqlens=audio_seqlens,
                second_per_grids=second_per_grids,
            )

            self.assertTrue(torch.equal(position_ids, expected_position_ids))
@require_torch
class Qwen2_5OmniModelIntegrationTest(unittest.TestCase):
    """Slow end-to-end generation tests against the `Qwen/Qwen2.5-Omni-7B` checkpoint."""

    def _load_audio(self, url):
        """Download `url` and decode it at the processor's sampling rate, closing the HTTP response afterwards."""
        with urlopen(url) as response:
            payload = response.read()
        waveform, _ = librosa.load(BytesIO(payload), sr=self.processor.feature_extractor.sampling_rate)
        return waveform

    def setUp(self):
        """Load the processor and download the audio clips and the image shared by the tests."""
        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

        self.audio_url = (
            "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/glass-breaking-151256.mp3"
        )
        self.audio_url_additional = (
            "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav"
        )
        self.image_url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
        self.messages = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio_url": self.audio_url},
                    {"type": "image", "image_url": self.image_url},
                    {"type": "text", "text": "What's that sound and what kind of dog is this?"},
                ],
            }
        ]

        self.raw_audio = self._load_audio(self.audio_url)
        self.raw_audio_additional = self._load_audio(self.audio_url_additional)
        self.raw_image = Image.open(requests.get(self.image_url, stream=True).raw)

    def tearDown(self):
        """Free accelerator memory between tests."""
        cleanup(torch_device, gc_collect=True)

    @slow
    def test_small_model_integration_test(self):
        """Check processor outputs and greedy generation for one audio + image prompt."""
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", dtype=torch.bfloat16, device_map="auto"
        )

        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(
            text=text, audio=[self.raw_audio], images=[self.raw_image], return_tensors="pt", padding=True
        ).to(torch.bfloat16)

        expected_input_ids = torch.tensor(
            [
                151644,
                8948,
                198,
                2610,
                525,
                264,
                10950,
                17847,
                13,
                151645,
                198,
                151644,
                872,
                198,
                151647,
                151646,
                151646,
            ]
        )
        # unittest assertions instead of bare `assert`, which is stripped when Python runs with -O.
        self.assertTrue(torch.allclose(expected_input_ids, inputs.input_ids[0][:17], atol=3e-3))

        expected_pixel_slice = torch.tensor(
            [
                [0.8792, 0.8792, 0.9084],
                [1.1858, 1.1858, 1.2296],
                [1.2004, 1.2004, 1.2150],
                [1.4340, 1.4340, 1.4194],
                [1.3902, 1.4048, 1.4194],
                [1.5216, 1.5362, 1.5362],
            ],
            dtype=torch.bfloat16,
            device="cpu",
        )
        self.assertTrue(torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3))

        # verify generation
        inputs = inputs.to(torch_device)

        output = model.generate(
            **inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False, thinker_max_new_tokens=20
        )

        EXPECTED_DECODED_TEXT = Expectations({
            ("xpu", None): "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
            ("cuda", (8, 6)): "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is a glass shattering. The dog in the picture is a Labrador Retriever.",
            ("rocm", (9, 4)): "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
        }).get_expectation()  # fmt: skip

        decoded_text = self.processor.decode(output[0], skip_special_tokens=True)
        self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)

    @slow
    def test_small_model_integration_test_batch(self):
        """Check greedy generation for a batch of two identical audio + image prompts."""
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", dtype=torch.bfloat16, device_map="auto"
        )
        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(
            text=[text] * 2,
            audio=[self.raw_audio, self.raw_audio],
            images=[self.raw_image, self.raw_image],
            return_tensors="pt",
            padding=True,
        ).to(torch_device, dtype=torch.bfloat16)

        output = model.generate(
            **inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False, thinker_max_new_tokens=20
        )

        EXPECTED_DECODED_TEXTS = Expectations(
            {
                ("xpu", 3): [
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
                ],
                ("cuda", 7) : [
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is of glass shattering, and the dog in the picture is a Labrador Retriever",
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is of glass shattering, and the dog in the picture is a Labrador Retriever",
                ],
                ("cuda", 8): [
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is a glass shattering. The dog in the picture is a Labrador Retriever.",
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is a glass shattering. The dog in the picture is a Labrador Retriever.",
                ],
                ("rocm", (9, 4)): [
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
                    "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
                ],
            }
        ).get_expectation()  # fmt: skip

        decoded_texts = self.processor.batch_decode(output, skip_special_tokens=True)
        self.assertEqual(decoded_texts, EXPECTED_DECODED_TEXTS)

    @slow
    def test_small_model_integration_test_multiturn(self):
        """Check greedy generation for a three-message conversation with a second audio clip."""
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", dtype=torch.bfloat16, device_map="auto"
        )

        messages = [
            self.messages[0],
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "text": "The sound is glass shattering, and the dog appears to be a Labrador Retriever.",
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio_url": self.audio_url_additional},
                    {"type": "text", "text": "How about this one?"},
                ],
            },
        ]

        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(
            text=text,
            audio=[self.raw_audio, self.raw_audio_additional],
            images=[self.raw_image],
            return_tensors="pt",
            padding=True,
        ).to(torch_device, dtype=torch.bfloat16)

        output = model.generate(
            **inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False, thinker_max_new_tokens=20
        )

        EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever.\nuser\nHow about this one?\nassistant\nThe sound is a cough."

        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_w_audio(self):
        """Check text generation for an audio-only prompt and that the second generate output has no NaN."""
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", dtype=torch.bfloat16, device_map="auto"
        )
        audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"

        messages = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
                    }
                ],
            },
            {
                "role": "user",
                "content": [{"type": "audio", "audio": audio_url}],
            },
        ]
        audio = self._load_audio(audio_url)

        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=text, audio=[audio], return_tensors="pt", padding=True).to(
            torch_device, dtype=torch.bfloat16
        )

        output = model.generate(
            **inputs,
            thinker_temperature=0,
            thinker_do_sample=False,
            thinker_max_new_tokens=20,
            talker_max_new_tokens=10,
        )

        EXPECTED_DECODED_TEXTS = Expectations(
            {
                ("xpu", None): "system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nWell, I can't really guess your age and gender just from your voice. There are so many",
                ("cuda", 7): "system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nWell, I can try. But it's not always that accurate. I might be able to make",
                ("cuda", 8): "system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nWell, I can't really guess your age and gender just from your voice. There are so many",
            }
        )  # fmt: skip
        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()

        # `output[0]` is decoded as text and `output[1]` is only checked for NaN values.
        decoded_text = self.processor.decode(output[0][0], skip_special_tokens=True)
        self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)
        self.assertFalse(torch.isnan(output[1]).any().item())

    @slow
    @require_flash_attn
    @require_torch_accelerator
    @pytest.mark.flash_attn_test
    def test_small_model_integration_test_batch_flashatt2(self):
        """Check greedy batched generation with the `flash_attention_2` attention implementation."""
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B",
            dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto",
        )

        text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(
            text=[text, text],
            audio=[self.raw_audio, self.raw_audio],
            images=[self.raw_image, self.raw_image],
            return_tensors="pt",
            padding=True,
        ).to(torch_device)

        output = model.generate(**inputs, thinker_temperature=0, thinker_do_sample=False, return_audio=False)

        EXPECTED_DECODED_TEXT = Expectations({
            ("cuda", None): "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog appears to be a Labrador Retriever.",
            ("cuda", (8, 6)): "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
            ("rocm", (9, 4)): "system\nYou are a helpful assistant.\nuser\nWhat's that sound and what kind of dog is this?\nassistant\nThe sound is glass shattering, and the dog is a Labrador Retriever.",
        }).get_expectation()  # fmt: skip

        decoded_texts = self.processor.batch_decode(output, skip_special_tokens=True)
        self.assertEqual(decoded_texts[0], EXPECTED_DECODED_TEXT)
        self.assertEqual(decoded_texts[1], EXPECTED_DECODED_TEXT)
@require_torch
class Qwen2_5OmniToken2WavMaxPositionEmbeddingsTest(unittest.TestCase):
    """
    Checks `Qwen2_5OmniToken2WavDiTModel.sample` against the `max_position_embeddings` limit:
    a ValueError above the limit, a normal result exactly at it.
    """

    @classmethod
    def setUpClass(cls):
        """Build the tiny DiT config once; every test in the class reuses it."""
        from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import Qwen2_5OmniDiTConfig

        # Dimensions are kept minimal to reduce memory usage.
        # Note: enc_channels needs at least 3 elements for the ECAPA-TDNN encoder architecture
        tiny_dit_kwargs = {
            "hidden_size": 32,
            "num_hidden_layers": 1,
            "num_attention_heads": 2,
            "head_dim": 16,
            "ff_mult": 1,
            "emb_dim": 16,
            "mel_dim": 16,
            "enc_emb_dim": 16,
            "enc_dim": 16,
            "enc_channels": [16, 16, 16],
            "enc_kernel_sizes": [3, 3, 1],
            "enc_dilations": [1, 1, 1],
            "enc_attention_channels": 8,
            "enc_res2net_scale": 2,
            "enc_se_channels": 8,
            "num_embeds": 100,
            "look_ahead_layers": [],
            "look_backward_layers": [0],
            "max_position_embeddings": 100,  # Small for testing
            "block_size": 24,
            "repeats": 2,
        }
        cls.config = Qwen2_5OmniDiTConfig(**tiny_dit_kwargs)

    def setUp(self):
        """Instantiate a fresh model in eval mode on `torch_device` before each test."""
        from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import Qwen2_5OmniToken2WavDiTModel

        dit_model = Qwen2_5OmniToken2WavDiTModel(self.config)
        self.model = dit_model.to(torch_device)
        self.model.eval()

    def tearDown(self):
        """Drop the model and release cached CUDA memory when CUDA is available."""
        del self.model
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()

    def _random_sample_inputs(self, num_speech_tokens, batch_size=1):
        """Return random keyword arguments for `self.model.sample` with `num_speech_tokens` codes per row."""
        cfg = self.config
        return {
            "conditioning_vector": torch.randn(batch_size, cfg.enc_emb_dim, device=torch_device),
            "reference_mel_spectrogram": torch.randn(batch_size, 200, cfg.mel_dim, device=torch_device),
            "quantized_code": torch.randint(
                0, cfg.num_embeds, (batch_size, num_speech_tokens), device=torch_device
            ),
        }

    def test_error_when_exceeding_max_position_embeddings(self):
        """Verify ValueError is raised when maximum_duration > max_position_embeddings."""
        # With repeats=2 and max_position_embeddings=100, anything above 50 tokens goes over the limit:
        # 60 tokens -> 120 mel frames.
        sample_inputs = self._random_sample_inputs(num_speech_tokens=60, batch_size=1)

        with self.assertRaises(ValueError) as context:
            self.model.sample(**sample_inputs, num_steps=2)

        message = str(context.exception)
        self.assertIn("exceeds `dit_config.max_position_embeddings`", message)
        self.assertIn("120", message)  # Requested mel length
        self.assertIn("100", message)  # max_position_embeddings

    def test_no_error_when_within_limits(self):
        """Verify no error when maximum_duration <= max_position_embeddings."""
        batch_size = 1
        # With repeats=2 and max_position_embeddings=100, 50 tokens = 100 mel frames (exactly at limit)
        sample_inputs = self._random_sample_inputs(num_speech_tokens=50, batch_size=batch_size)

        # Must run to completion without raising
        output = self.model.sample(**sample_inputs, num_steps=2)

        # The result is 3-dimensional: (batch, mel_dim, mel frames)
        self.assertEqual(len(output.shape), 3)
        expected_shape = (batch_size, self.config.mel_dim, 100)  # 50 tokens * 2 repeats
        self.assertEqual(tuple(output.shape), expected_shape)

View File

@@ -0,0 +1,355 @@
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import unittest
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import (
Qwen2_5OmniProcessor,
)
from transformers.testing_utils import (
require_av,
require_librosa,
require_torch,
require_torchaudio,
require_torchvision,
require_vision,
)
from transformers.utils import is_torch_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_torch_available():
import torch
# Processor tests for `Qwen2_5OmniProcessor`, built on the shared `ProcessorTesterMixin`.
# The decorators gate the class on the vision, torch, torchaudio and torchvision requirements.
@require_vision
@require_torch
@require_torchaudio
@require_torchvision
class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# Processor class under test.
processor_class = Qwen2_5OmniProcessor
# Checkpoint id passed to `from_pretrained` when the image and video processors are set up.
model_id = "Qwen/Qwen2.5-Omni-7B"
@classmethod
def _setup_image_processor(cls):
    """Load the image processor from `cls.model_id` with `size` edges of 28 * 28 and 56 * 56."""
    size = {"shortest_edge": 28 * 28, "longest_edge": 56 * 56}
    component_cls = cls._get_component_class_from_processor("image_processor")
    return component_cls.from_pretrained(cls.model_id, size=size)
@classmethod
def _setup_video_processor(cls):
    """Load the video processor from `cls.model_id` with `size` edges of 28 * 28 and 56 * 56."""
    size = {"shortest_edge": 28 * 28, "longest_edge": 56 * 56}
    component_cls = cls._get_component_class_from_processor("video_processor")
    return component_cls.from_pretrained(cls.model_id, size=size)
def prepare_audio_inputs(self, batch_size: int = 3):
    """
    Return a list of `batch_size` numpy audios.

    A single random waveform of 160000 samples in [-1, 1) is drawn and the list repeats that same
    array object `batch_size` times.
    """
    waveform = np.random.rand(160000) * 2 - 1
    return [waveform] * batch_size
@require_torch
def _test_apply_chat_template(
    self,
    modality: str,
    batch_size: int,
    return_tensors: str,
    input_name: str,
    processor_name: str,
    input_data: list[str],
):
    """
    Exercise `apply_chat_template` on a batch, first text-only and then with one `modality` item per turn.

    Args:
        modality: Content type attached to each user turn (`"video"` and `"audio"` get dedicated length checks).
        batch_size: Number of conversations in the batch.
        return_tensors: Tensor format forwarded to the processor (`"pt"`, `"np"` or `None`).
        input_name: Name of the attribute on `self` that stores the expected output key.
        processor_name: Processor attribute that must exist, otherwise the test is skipped.
        input_data: URLs used for the modality items; only the first `batch_size` are read.
    """
    processor = self.get_processor()
    if processor.chat_template is None:
        self.skipTest("Processor has no chat template")

    if processor_name not in self.processor_class.get_attributes():
        self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")

    # List multiplication repeats the *same* conversation object, so every batch entry is shared.
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": "Describe this."}],
    }
    batch_messages = [[user_turn]] * batch_size

    # The jinja template must render one prompt per conversation
    prompts = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
    self.assertEqual(len(prompts), batch_size)

    # Tokenizing through the template must match tokenizing the rendered prompts directly
    templated_ids = processor.apply_chat_template(
        batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
    )
    bos_token = processor.tokenizer.bos_token
    # Skip special tokens when the rendered prompt already starts with BOS
    add_special_tokens = bos_token is None or not prompts[0].startswith(bos_token)
    direct_encoding = processor.tokenizer(
        prompts, return_tensors=return_tensors, add_special_tokens=add_special_tokens
    )
    self.assertListEqual(direct_encoding.input_ids.tolist(), templated_ids.tolist())

    # Extra kwargs have to reach the processor's `__call__`
    padded_ids = processor.apply_chat_template(
        batch_messages,
        add_generation_prompt=True,
        tokenize=True,
        padding="max_length",
        truncation=True,
        return_tensors=return_tensors,
        max_length=100,
    )
    self.assertEqual(len(padded_ids[0]), 100)

    # `return_dict=True` must expose the text inputs
    text_outputs = processor.apply_chat_template(
        batch_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors=return_tensors,
    )
    text_keys = ("input_ids", "attention_mask")
    for key in text_keys:
        self.assertIn(key, text_outputs)
    for key in text_keys:
        self.assertEqual(len(text_outputs[key]), batch_size)

    # Attach a modality URL to each turn and check that modality inputs show up in the dict
    for idx, url in enumerate(input_data[:batch_size]):
        turn = batch_messages[idx][0]
        turn["content"] = [turn["content"][0], {"type": modality, "url": url}]

    out_dict = processor.apply_chat_template(
        batch_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors=return_tensors,
        num_frames=2,  # by default no more than 2 frames, otherwise too slow
    )
    output_key = getattr(self, input_name)
    self.assertIn(output_key, out_dict)
    for key in text_keys:
        self.assertEqual(len(out_dict[key]), batch_size)

    if modality == "video":
        # qwen pixels don't scale with bs the way other models do; the expected count comes from video_grid_thw
        mm_len = sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"])
    elif modality == "audio":
        mm_len = batch_size
    else:
        mm_len = batch_size * 1564
    self.assertEqual(len(out_dict[output_key]), mm_len)

    expected_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}[return_tensors]
    for key in out_dict:
        self.assertIsInstance(out_dict[key], expected_type)
@require_av
def test_apply_chat_template_video_frame_sampling(self):
    """
    Check video loading through `apply_chat_template` with `num_frames`, `fps`, both, neither,
    and with a list of frame images in place of a video file.
    """
    processor = self.get_processor()
    if processor.chat_template is None:
        self.skipTest("Processor has no chat template")

    videos_param = inspect.signature(processor.__call__).parameters.get("videos")
    if videos_param is None or videos_param.annotation == inspect.Parameter.empty:
        self.skipTest("Processor doesn't accept videos at input")

    user_content = [{"type": "text", "text": "What is shown in this video?"}]
    messages = [[{"role": "user", "content": user_content}]]

    def apply_as_dict(**extra):
        # Tokenize the current `messages` and return the dict of model inputs.
        return processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, **extra
        )

    def check_video_outputs(outputs):
        # Qwen pixel values are flattened, so their length has to match the product of video_grid_thw.
        self.assertIn(self.videos_input_name, outputs)
        token_count = sum(thw[0] * thw[1] * thw[2] for thw in outputs["video_grid_thw"])
        self.assertEqual(len(outputs[self.videos_input_name]), token_count)

    prompts = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    self.assertEqual(len(prompts), 1)

    templated_ids = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True)
    direct_ids = processor.tokenizer(prompts, return_tensors=None).input_ids
    self.assertListEqual(direct_ids, templated_ids)

    text_only = apply_as_dict()
    self.assertListEqual(list(text_only.keys()), ["input_ids", "attention_mask"])

    # Attach a video URL, then load it with the `num_frames` arg
    video_path = url_to_local_path(
        "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
    )
    user_content.append({"type": "video", "url": video_path})
    num_frames = 3
    check_video_outputs(apply_as_dict(num_frames=num_frames))

    # Load with the `fps` arg
    fps = 1
    check_video_outputs(apply_as_dict(fps=fps))

    # Passing both `fps` and `num_frames` has to raise
    with self.assertRaises(ValueError):
        apply_as_dict(fps=fps, num_frames=num_frames)

    # With no sampling arg the whole video is loaded
    check_video_outputs(apply_as_dict())

    # Load the video as a list of frames (i.e. images). NOTE: every frame should have the same size
    # because they are assumed to come from one video
    frame_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
    user_content[-1] = {
        "type": "video",
        "url": [url_to_local_path(frame_url), url_to_local_path(frame_url)],
    }
    check_video_outputs(apply_as_dict())

    # Frame URLs/paths are expected to be sampled already, so asking to sample again has to raise.
    with self.assertRaisesRegex(
        ValueError, "Sampling frames from a list of images is not supported! Set `do_sample_frames=False`"
    ):
        apply_as_dict(do_sample_frames=True)
@require_librosa
@require_av
def test_chat_template_audio_from_video(self):
    """
    Check that `load_audio_from_video=True` yields both audio and video inputs for a conversation
    whose first user turn carries a video file.
    """
    processor = self.get_processor()
    if processor.chat_template is None:
        self.skipTest("Processor has no chat template")

    videos_param = inspect.signature(processor.__call__).parameters.get("videos")
    if videos_param is None or videos_param.annotation == inspect.Parameter.empty:
        self.skipTest(f"{self.processor_class} does not support video inputs")

    if "feature_extractor" not in self.processor_class.get_attributes():
        self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")

    video_file_path = hf_hub_download(
        repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
    )

    first_user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "path": video_file_path},
            {"type": "text", "text": "Which of these animals is making the sound?"},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": "It is a cow."}],
    }
    second_user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Tell me all about this animal."},
        ],
    }
    messages = [first_user_turn, assistant_turn, second_user_turn]

    prompts = processor.apply_chat_template([messages], add_generation_prompt=True, tokenize=False)
    self.assertEqual(len(prompts), 1)  # batch size=1

    out_dict = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        load_audio_from_video=True,
    )
    for expected_key in (self.audio_input_name, self.videos_input_name):
        self.assertIn(expected_key, out_dict)

    # input_ids and attention_mask are always present
    for text_key in ("input_ids", "attention_mask"):
        self.assertEqual(len(out_dict[text_key]), 1)  # batch-size=1
    self.assertEqual(len(out_dict[self.audio_input_name]), 1)  # 1 audio in the conversation

    # Qwen pixel values are flattened, so their length has to match the product of video_grid_thw
    video_token_count = sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"])
    self.assertEqual(len(out_dict[self.videos_input_name]), video_token_count)  # 1 video in the conversation