first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,772 @@
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Qwen2.5-VL model."""
import copy
import tempfile
import unittest
import pytest
from transformers import (
AutoProcessor,
Qwen2_5_VLConfig,
Qwen2_5_VLForConditionalGeneration,
Qwen2_5_VLModel,
is_torch_available,
)
from transformers.image_utils import load_image
from transformers.testing_utils import (
Expectations,
cleanup,
require_flash_attn,
require_torch,
require_torch_accelerator,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
ids_tensor,
)
from ...test_processing_common import url_to_local_path
if is_torch_available():
import torch
class Qwen2_5_VLVisionText2TextModelTester:
def __init__(
self,
parent,
batch_size=3,
seq_length=7,
num_channels=3,
ignore_index=-100,
image_size=14,
bos_token_id=0,
eos_token_id=1,
pad_token_id=2,
hidden_act="silu",
hidden_size=32,
vocab_size=99,
intermediate_size=37,
max_position_embeddings=512,
max_window_layers=3,
num_attention_heads=4,
num_hidden_layers=2,
num_key_value_heads=2,
rope_theta=10000,
tie_word_embeddings=True,
is_training=True,
vision_config=None,
rope_parameters=None,
vision_start_token_id=3,
image_token_id=4,
video_token_id=5,
):
self.parent = parent
self.ignore_index = ignore_index
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_size = hidden_size
self.vision_start_token_id = vision_start_token_id
self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.is_training = is_training
self.vocab_size = vocab_size
self.num_image_tokens = 32
self.seq_length = seq_length + self.num_image_tokens
# Default vision config is None to avoid a mutable default argument
if vision_config is None:
vision_config = {
"depth": 2,
"in_chans": 3,
"hidden_act": "silu",
"intermediate_size": 32,
"out_hidden_size": 32,
"hidden_size": 32,
"num_heads": 4,
"patch_size": 14,
"spatial_patch_size": 14,
"spatial_merge_size": 1,
"temporal_patch_size": 2,
"tokens_per_second": 1,
}
self.vision_config = vision_config
self.text_config = {
"bos_token_id": bos_token_id,
"eos_token_id": eos_token_id,
"pad_token_id": pad_token_id,
"hidden_act": hidden_act,
"hidden_size": hidden_size,
"intermediate_size": intermediate_size,
"max_position_embeddings": max_position_embeddings,
"max_window_layers": max_window_layers,
"num_attention_heads": num_attention_heads,
"num_hidden_layers": num_hidden_layers,
"num_key_value_heads": num_key_value_heads,
"rope_theta": rope_theta,
"tie_word_embeddings": tie_word_embeddings,
"vocab_size": vocab_size,
"rope_parameters": {"type": "mrope", "mrope_section": [2, 1, 1]},
}
def get_config(self):
return Qwen2_5_VLConfig(
text_config=self.text_config,
vision_config=self.vision_config,
vision_start_token_id=self.vision_start_token_id,
image_token_id=self.image_token_id,
video_token_id=self.video_token_id,
)
def prepare_config_and_inputs(self):
config = self.get_config()
patch_size = config.vision_config.patch_size
temporal_patch_size = config.vision_config.temporal_patch_size
pixel_values = floats_tensor(
[
self.batch_size * (self.image_size**2) // (patch_size**2),
self.num_channels * (patch_size**2) * temporal_patch_size,
]
)
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
input_ids[:, -1] = self.pad_token_id
input_ids[input_ids == self.video_token_id] = self.pad_token_id
input_ids[input_ids == self.image_token_id] = self.pad_token_id
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
input_ids[:, self.num_image_tokens] = self.image_token_id
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
mm_token_type_ids = torch.zeros_like(input_ids)
mm_token_type_ids[:, self.num_image_tokens] = 1
inputs_dict = {
"pixel_values": pixel_values,
"image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size, device=torch_device),
"input_ids": input_ids,
"attention_mask": attention_mask,
"mm_token_type_ids": mm_token_type_ids,
}
return config, inputs_dict
@require_torch
class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""
Model tester for `Qwen2_5_VLForConditionalGeneration`.
"""
all_model_classes = (
(
Qwen2_5_VLModel,
Qwen2_5_VLForConditionalGeneration,
)
if is_torch_available()
else ()
)
def setUp(self):
self.model_tester = Qwen2_5_VLVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=Qwen2_5_VLConfig, has_text_modality=False)
def test_config(self):
self.config_tester.run_common_tests()
def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs through an error with explicit message saying what is wrong
when number of images don't match number of image tokens in the text.
Also we need to test multi-image cases when one prompr has multiple image tokens.
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
model.eval()
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict)
# remove one image but leave the image token in text
patch_size = config.vision_config.patch_size
one_img_length = (self.model_tester.image_size**2) // (patch_size**2)
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-one_img_length:, ...]
curr_input_dict["image_grid_thw"] = curr_input_dict["image_grid_thw"][-1:, ...]
with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"):
_ = model(**curr_input_dict)
model.base_model.rope_deltas = None
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:one_img_length]
image_grid_thw = curr_input_dict["image_grid_thw"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"):
_ = model(
input_ids=input_ids,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw,
)
model.base_model.rope_deltas = None
# two images and two image tokens don't raise an error
pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
image_grid_thw = torch.cat([image_grid_thw, image_grid_thw], dim=0)
_ = model(
input_ids=input_ids,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw,
)
def test_vision_position_ids(self):
"""
Tests that vision position ids are built correctly for images and for videos.
See https://github.com/huggingface/transformers/pull/45400
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = Qwen2_5_VLModel(config).to(torch_device)
batch_size = input_dict["input_ids"].shape[0]
# Test most simple case when num_image_tokens == 1. Position ids will be sunsequent and text-like
position_ids = model.get_rope_index(
input_dict["input_ids"], input_dict["mm_token_type_ids"], input_dict["image_grid_thw"]
)[0]
expected_positions = torch.arange(39)[None, None, :].repeat(3, batch_size, 1)
self.assertListEqual(list(position_ids.shape), [3, batch_size, 39])
self.assertListEqual(position_ids.tolist(), expected_positions.tolist())
# Each image encodes to more than 1 token (i.e. 4 height and 3 width patches = 12 tokens)
image_token_id = config.image_token_id
pad_token_id = config.text_config.pad_token_id
input_ids = torch.tensor([[pad_token_id] + [image_token_id] * 12 + [pad_token_id]], device=torch_device)
mm_token_type_ids = torch.tensor([[0] + [1] * 12 + [0]], device=torch_device)
image_grid_thw = torch.tensor([[1, 4, 3]], device=torch_device)
position_ids = model.get_rope_index(input_ids, mm_token_type_ids, image_grid_thw)[0]
expected_positions = torch.tensor(
[
[[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5]],
[[0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5]],
[[0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 5]],
]
)
self.assertListEqual(list(position_ids.shape), [3, 1, 14])
self.assertListEqual(position_ids.tolist(), expected_positions.tolist())
# Check video position ids with 2 frames, and 4 height, 3 width patches (= 12 * 2 tokens)
video_token_id = config.video_token_id
input_ids = torch.tensor([[pad_token_id] + [video_token_id] * 24 + [pad_token_id]], device=torch_device)
mm_token_type_ids = torch.tensor([[0] + [2] * 24 + [0]], device=torch_device)
video_grid_thw = torch.tensor([[2, 4, 3]], device=torch_device)
second_per_grid_ts = torch.tensor([3], device=torch_device)
position_ids = model.get_rope_index(
input_ids, mm_token_type_ids, video_grid_thw=video_grid_thw, second_per_grid_ts=second_per_grid_ts
)[0]
expected_positions = torch.tensor(
[
[[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5]],
[[0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5]],
[[0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 5]],
]
)
self.assertListEqual(list(position_ids.shape), [3, 1, 26])
self.assertListEqual(position_ids.tolist(), expected_positions.tolist())
def test_video_forward(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
B = self.model_tester.batch_size
C = config.vision_config.in_chans
T = config.vision_config.temporal_patch_size
P = config.vision_config.patch_size
input_ids = ids_tensor([B, self.model_tester.seq_length], self.model_tester.vocab_size)
F = 4
patch_H = self.model_tester.image_size // P
patch_W = self.model_tester.image_size // P
patch_T = F // T
patches_per_video = patch_T * patch_H * patch_W
pixel_values_videos = floats_tensor(
[
# first dim: batch_size * num_patches
B * patches_per_video,
# second dim: in_channels * temporal_patch_size * patch_size^2
C * T * (P**2),
]
)
video_grid_thw = torch.tensor([[patch_T, patch_H, patch_W]] * B, device=torch_device)
# sanity check
assert pixel_values_videos.shape[0] == video_grid_thw.prod(dim=1).sum().item()
# Insert video token sequence
input_ids[:, -1] = self.model_tester.pad_token_id
input_ids[input_ids == self.model_tester.video_token_id] = self.model_tester.pad_token_id
input_ids[input_ids == self.model_tester.image_token_id] = self.model_tester.pad_token_id
input_ids[input_ids == self.model_tester.vision_start_token_id] = self.model_tester.pad_token_id
input_ids[:, self.model_tester.num_image_tokens] = self.model_tester.video_token_id
insertion_point = self.model_tester.num_image_tokens
assert (B * patches_per_video) + insertion_point <= self.model_tester.seq_length
for b in range(B):
input_ids[b, insertion_point - 1] = self.model_tester.vision_start_token_id
input_ids[b, insertion_point : insertion_point + patches_per_video] = self.model_tester.video_token_id
for model_class in self.all_model_classes:
second_per_grid_ts = torch.tensor([1.0] * B, device=torch_device)
model = model_class(config).to(torch_device)
outputs = model(
input_ids=input_ids,
pixel_values_videos=pixel_values_videos,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
)
self.assertIsNotNone(outputs)
def attention_mask_padding_matches_padding_free_with_position_ids(
self, attn_implementation: str, fa_kwargs: bool = False
):
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.float16]:
dummy_input = dummy_input.to(torch.bfloat16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
if 0 in inputs_dict["attention_mask"][:, -1]:
inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1)
dummy_attention_mask = inputs_dict["attention_mask"]
inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.get_text_config().pad_token_id
model = (
model_class.from_pretrained(
tmpdirname,
dtype=torch.bfloat16,
attn_implementation=attn_implementation,
)
.to(torch_device)
.eval()
)
# flatten
padfree_inputs_dict = {
"pixel_values": inputs_dict["pixel_values"],
"image_grid_thw": inputs_dict["image_grid_thw"],
"input_ids": inputs_dict["input_ids"][dummy_attention_mask.bool()].unsqueeze(0),
}
# add position_ids
vision_position_ids, deltas = model.model.get_rope_index(
input_ids=inputs_dict["input_ids"],
image_grid_thw=inputs_dict["image_grid_thw"],
attention_mask=inputs_dict["attention_mask"],
mm_token_type_ids=inputs_dict["mm_token_type_ids"],
) # [3, bs, padded-seq-len]
vision_padfree_positions = vision_position_ids[:, dummy_attention_mask.bool()].view(
3, -1
) # [3, bs*padfree-len]
text_padfree_positions = torch.cat(
[torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()]
) # [1, bs*padfree-len]
text_padfree_positions = text_padfree_positions.long().unsqueeze(0).to(torch_device)
padfree_inputs_dict["position_ids"] = torch.cat([text_padfree_positions, vision_padfree_positions])[
:, None, :
]
if fa_kwargs:
cu_seq_lens = [0] + dummy_attention_mask.sum(1).tolist()
cu_seq_lens = torch.tensor(cu_seq_lens, device=torch_device)
max_length = cu_seq_lens.diff().max().item()
padfree_inputs_dict.update(
{
"cu_seq_lens_q": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
"cu_seq_lens_k": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
"max_length_q": max_length,
"max_length_k": max_length,
}
)
res_padded = model(**inputs_dict, use_cache=False)
res_padfree = model(**padfree_inputs_dict, use_cache=False)
logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
logits_padfree = res_padfree.logits[0]
# acceptable numerical instability
tol = torch.finfo(torch.bfloat16).eps
torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
@unittest.skip(reason="Feedforward chunking is not yet supported")
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="CPU offload is not yet supported")
def test_cpu_offload(self):
pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_disk_offload_bin(self):
pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_disk_offload_safetensors(self):
pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_model_parallelism(self):
pass
@unittest.skip(reason="Compile not yet supported because in Qwen2_5_VL models")
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.")
def test_multi_gpu_data_parallel_forward(self):
pass
@require_torch
class Qwen2_5_VLIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
self.processor.tokenizer.padding_side = "left"
self.messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What kind of dog is this?"},
],
}
]
img_url = url_to_local_path("https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg")
self.image = load_image(img_url).convert("RGB")
cleanup(torch_device, gc_collect=True)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@slow
def test_small_model_integration_test(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], images=[self.image], return_tensors="pt")
expected_input_ids = [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655] # fmt: skip
torch.testing.assert_close(expected_input_ids, inputs.input_ids[0].tolist()[:17])
expected_pixel_slice = torch.tensor(
[
[0.8792, 0.8792, 0.9084],
[1.1858, 1.1858, 1.2296],
[1.2004, 1.2004, 1.2150],
[1.4340, 1.4340, 1.4194],
[1.3902, 1.4048, 1.4194],
[1.5216, 1.5362, 1.5362],
],
dtype=torch.float32,
device="cpu",
)
torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=5e-4, rtol=1e-5)
# verify generation
inputs = inputs.to(torch_device)
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in"
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
expected_decoded_texts = [
"system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in",
"system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in",
]
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
expected_decoded_texts,
)
@slow
def test_small_model_integration_test_expand(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], images=[self.image], return_tensors="pt").to(torch_device)
output = model.generate(**inputs, max_new_tokens=30, num_return_sequences=3)
EXPECTED_DECODED_TEXT = [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
@slow
def test_small_model_integration_test_batch_wo_image(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
messages2 = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who are you?"},
]
text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
expected_decoded_texts = Expectations(
{
(None, None): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, an AI language model created by Alibaba Cloud. I am designed to assist with various tasks such as answering questions, providing information,'
],
("rocm", (9, 4)): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to assist with a wide range of tasks, from answering questions and'
],
("xpu", None): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to assist with a wide range of tasks, from answering questions and'
],
}
).get_expectation() # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
expected_decoded_texts,
)
@slow
def test_small_model_integration_test_batch_different_resolutions(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct", dtype="auto", device_map="auto"
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
text2 = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
image2 = self.image.resize((224, 224))
inputs = self.processor(
text=[text, text2],
images=[self.image, image2],
padding=True,
return_tensors="pt",
).to(torch_device)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
expected_decoded_texts = Expectations(
{
(None, None): [
"system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in",
"system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\n addCriterion\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and gentle nature, which is",
],
("cuda", (8, 6)): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and gentle nature, which is evident in',
],
("rocm", None): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and gentle nature, which is evident in',
],
("xpu", None): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and gentle nature, which is evident in',
],
}
).get_expectation() # fmt: skip
decoded_texts = self.processor.batch_decode(output, skip_special_tokens=True)
for i, (expected, decoded) in enumerate(zip(expected_decoded_texts, decoded_texts)):
self.assertEqual(
decoded,
expected,
f"Decoded text {i}:\n{repr(decoded)}\ndoes not match expected decoded text:\n{repr(expected)}",
)
@slow
@require_flash_attn
@require_torch_accelerator
@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct",
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto",
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
expected_decoded_text = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in"
# Since the test is to generate twice the same text, we just test twice against the expected decoded text
decoded_texts = self.processor.batch_decode(output, skip_special_tokens=True)
self.assertEqual(decoded_texts[0], expected_decoded_text)
self.assertEqual(decoded_texts[1], expected_decoded_text)
@slow
@require_flash_attn
@require_torch_accelerator
@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct",
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto",
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
messages2 = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who are you?"},
]
text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
# FIXME: The second decoded text in the CUDA expectation seems to be incorrect, it used to be the second text
# on the ROCm expectation that was the correct one. Either model changed or code is buggy.
EXPECTED_DECODED_TEXT = Expectations({
("cuda", None): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
"system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\n<EFBFBD>\n\n addCriterion\nI'm sorry, but I don't understand your question. Could you please provide more context or clarify what you're asking",
],
("rocm", (9, 4)): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to assist with a wide range of tasks, from answering questions and'
],
("xpu", None): [
'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
"system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to assist with a wide range of tasks, from answering questions and",
],
}).get_expectation() # fmt: skip
decoded_text = self.processor.batch_decode(output, skip_special_tokens=True)
self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)
@slow
def test_small_model_integration_test_with_video(self):
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-7B-Instruct", dtype="auto", device_map="auto"
)
video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
messages = [
{
"role": "user",
"content": [
{"type": "video", "url": video_url},
{"type": "text", "text": "What is shown in this video?"},
],
}
]
inputs = self.processor.apply_chat_template(
messages,
tokenize=True,
return_dict=True,
add_generation_prompt=True,
return_tensors="pt",
num_frames=10,
).to(torch_device)
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
expected_decoded_texts = Expectations(
{
(None, None): [
'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows two individuals playing tennis on an indoor court. The player in the foreground, dressed in a white shirt and black shorts, is preparing to',
],
("xpu", None): [
'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on the service line, preparing to serve. The individual appears to be practicing or warming up,',
],
}
).get_expectation() # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
expected_decoded_texts,
)

View File

@@ -0,0 +1,281 @@
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import unittest
import numpy as np
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_vision_available():
from transformers import Qwen2_5_VLProcessor
if is_torch_available():
import torch
@require_vision
@require_torch
@require_torchvision
class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = Qwen2_5_VLProcessor
model_id = "Qwen/Qwen2-VL-7B-Instruct"
@classmethod
def _setup_from_pretrained(cls, model_id, **kwargs):
return super()._setup_from_pretrained(model_id, patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28, **kwargs)
@classmethod
def _setup_test_attributes(cls, processor):
cls.image_token = processor.image_token
def test_get_num_vision_tokens(self):
"Tests general functionality of the helper used internally in vLLM"
processor = self.get_processor()
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
self.assertTrue("num_image_tokens" in output)
self.assertEqual(len(output["num_image_tokens"]), 3)
self.assertTrue("num_image_patches" in output)
self.assertEqual(len(output["num_image_patches"]), 3)
@require_torch
@require_av
def _test_apply_chat_template(
self,
modality: str,
batch_size: int,
return_tensors: str,
input_name: str,
processor_name: str,
input_data: list[str],
):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
if processor_name not in self.processor_class.get_attributes():
self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")
batch_messages = [
[
{
"role": "user",
"content": [{"type": "text", "text": "Describe this."}],
},
]
] * batch_size
# Test that jinja can be applied
formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), batch_size)
# Test that tokenizing with template and directly with `self.tokenizer` gives same output
formatted_prompt_tokenized = processor.apply_chat_template(
batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
)
add_special_tokens = True
if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
add_special_tokens = False
tok_output = processor.tokenizer(
formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
)
expected_output = tok_output.input_ids
self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
# Test that kwargs passed to processor's `__call__` are actually used
tokenized_prompt_100 = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
padding="max_length",
truncation=True,
return_tensors=return_tensors,
max_length=100,
)
self.assertEqual(len(tokenized_prompt_100[0]), 100)
# Test that `return_dict=True` returns text related inputs in the dict
out_dict_text = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
)
self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
# Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
for idx, url in enumerate(input_data[:batch_size]):
batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]
out_dict = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
if modality == "video":
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
expected_video_token_count = 0
for thw in out_dict["video_grid_thw"]:
expected_video_token_count += thw[0] * thw[1] * thw[2]
mm_len = expected_video_token_count
else:
mm_len = batch_size * 192
self.assertEqual(len(out_dict[input_name]), mm_len)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
for k in out_dict:
self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors])
@require_av
def test_apply_chat_template_video_frame_sampling(self):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
messages = [
[
{
"role": "user",
"content": [
{
"type": "video",
"url": url_to_local_path(
"https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
),
},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), 1)
# Add video URL for return dict and load with `num_frames` arg
num_frames = 3
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
num_frames=num_frames,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
# Load with `fps` arg
fps = 1
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
fps=fps,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)
# Load with `fps` and `num_frames` args, should raise an error
with self.assertRaises(ValueError):
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
fps=fps,
num_frames=num_frames,
)
# Load without any arg should load the whole video
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080)
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size
# because we assume they come from one video
messages[0][0]["content"][0] = {
"type": "video",
"url": [
url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
],
}
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
# When the inputs are frame URLs/paths we expect that those are already
# sampled and will raise an error is asked to sample again.
with self.assertRaisesRegex(
ValueError, "Sampling frames from a list of images is not supported! Set `do_sample_frames=False`"
):
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
do_sample_frames=True,
)
def test_kwargs_overrides_custom_image_processor_kwargs(self):
processor = self.get_processor()
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 100)