first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,362 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import itertools
import json
import tempfile
import unittest
import httpx
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class Qwen2VLImageProcessingTester:
    """
    Stores the settings used by the Qwen2-VL image processor tests and builds the processor
    keyword arguments as well as the random image / video inputs derived from those settings.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        num_frames=10,
        min_resolution=56,
        max_resolution=1024,
        min_pixels=56 * 56,
        max_pixels=28 * 28 * 1280,
        do_normalize=True,
        image_mean=OPENAI_CLIP_MEAN,
        image_std=OPENAI_CLIP_STD,
        do_resize=True,
        patch_size=14,
        temporal_patch_size=2,
        merge_size=2,
        do_convert_rgb=True,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.num_channels = num_channels
        self.num_frames = num_frames
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.do_resize = do_resize
        self.do_normalize = do_normalize
        # The constructor arguments are the single source of truth for the normalization
        # statistics; they default to the OpenAI CLIP constants.
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb

    def prepare_image_processor_dict(self):
        """
        Return the keyword arguments used to instantiate an image processor.

        Only the keys listed below are forwarded; `do_normalize` and `do_convert_rgb` are stored
        on the tester but are not part of the returned dict.
        """
        return {
            "do_resize": self.do_resize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "min_pixels": self.min_pixels,
            "max_pixels": self.max_pixels,
            "patch_size": self.patch_size,
            "temporal_patch_size": self.temporal_patch_size,
            "merge_size": self.merge_size,
        }

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """
        Build `batch_size` images with the shared `prepare_image_inputs` helper and wrap each one
        in its own single-element list, so the result is a list of one-image lists.
        """
        images = prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
        return [[image] for image in images]

    def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Build video inputs with the shared `prepare_video_inputs` helper using the tester settings."""
        return prepare_video_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            num_frames=self.num_frames,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Tests run against every image processor class exposed through `self.image_processing_classes`."""

    def setUp(self):
        super().setUp()
        self.image_processor_tester = Qwen2VLImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        """Keyword arguments used to build each image processor under test."""
        return self.image_processor_tester.prepare_image_processor_dict()

    def _assert_pixel_values_and_grid(self, process_out, num_images):
        """
        Check the shape of `pixel_values` and the content of `image_grid_thw` for `num_images`
        equal-resolution inputs.

        Each image is expected to report a `[1, 70, 70]` grid, i.e. 70 * 70 = 4900 patches, and
        each patch is expected to be flattened to 1176 values.
        """
        expected_output_image_shape = (4900 * num_images, 1176)
        expected_image_grid_thws = torch.Tensor([[1, 70, 70]] * num_images)
        self.assertEqual(tuple(process_out.pixel_values.shape), expected_output_image_shape)
        self.assertTrue((process_out.image_grid_thw == expected_image_grid_thws).all())

    def _check_call_for_input_type(self, expected_type, **input_kwargs):
        """
        Run every processor on a single sample and on the full batch of equal-resolution inputs.

        Args:
            expected_type: type each generated image must be an instance of.
            **input_kwargs: extra flags (`numpify` / `torchify`) forwarded to the tester's
                `prepare_image_inputs`.
        """
        for image_processing_class in self.image_processing_classes.values():
            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, **input_kwargs)
            for image in image_inputs:
                self.assertIsInstance(image[0], expected_type)

            # Test not batched input
            process_out = image_processing(image_inputs[0], return_tensors="pt")
            self._assert_pixel_values_and_grid(process_out, num_images=1)

            # Test batched
            process_out = image_processing(image_inputs, return_tensors="pt")
            self._assert_pixel_values_and_grid(process_out, num_images=len(image_inputs))

    def _assert_backend_encodings_match(self, encodings):
        """Compare the output of every backend in `encodings` against the first one (the reference)."""
        backend_names = list(encodings.keys())
        reference_encoding = encodings[backend_names[0]]
        for backend_name in backend_names[1:]:
            candidate = encodings[backend_name]
            self._assert_tensors_equivalence(reference_encoding.pixel_values, candidate.pixel_values)
            self.assertEqual(reference_encoding.image_grid_thw.dtype, candidate.image_grid_thw.dtype)
            self._assert_tensors_equivalence(
                reference_encoding.image_grid_thw.float(), candidate.image_grid_thw.float()
            )

    def test_image_processor_properties(self):
        expected_attributes = (
            "do_normalize",
            "image_mean",
            "image_std",
            "do_resize",
            "do_convert_rgb",
            "patch_size",
            "temporal_patch_size",
            "merge_size",
        )
        for image_processing_class in self.image_processing_classes.values():
            image_processing = image_processing_class(**self.image_processor_dict)
            for attribute in expected_attributes:
                self.assertTrue(hasattr(image_processing, attribute))

    def test_image_processor_to_json_string(self):
        for image_processing_class in self.image_processing_classes.values():
            image_processor = image_processing_class(**self.image_processor_dict)
            obj = json.loads(image_processor.to_json_string())
            for key, value in self.image_processor_dict.items():
                # `min_pixels` / `max_pixels` are deliberately excluded from the round-trip comparison.
                if key not in ["min_pixels", "max_pixels"]:
                    self.assertEqual(obj[key], value)

    def test_select_best_resolution(self):
        # Test with a final resize resolution
        best_resolution = smart_resize(561, 278, factor=28)
        self.assertEqual(best_resolution, (560, 280))

    def test_call_pil(self):
        self._check_call_for_input_type(Image.Image)

    def test_call_numpy(self):
        self._check_call_for_input_type(np.ndarray, numpify=True)

    def test_call_pytorch(self):
        self._check_call_for_input_type(torch.Tensor, torchify=True)

    @unittest.skip(reason="Qwen2VLImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
    def test_call_numpy_4_channels(self):
        pass

    def test_nested_input(self):
        for image_processing_class in self.image_processing_classes.values():
            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
            num_images = len(image_inputs)

            # Test batched as a list of images
            process_out = image_processing(image_inputs, return_tensors="pt")
            encoded_images = process_out.pixel_values
            self._assert_pixel_values_and_grid(process_out, num_images=num_images)

            # Test batched as a nested list of images, where each sublist is one batch
            # NOTE(review): the tester already returns one-image sublists, so re-joining the two
            # slices yields a list equal to `image_inputs` - confirm this is the intended input.
            image_inputs_nested = image_inputs[:3] + image_inputs[3:]
            process_out_nested = image_processing(image_inputs_nested, return_tensors="pt")
            # Validate the output of the nested call itself (shape and grid).
            self._assert_pixel_values_and_grid(process_out_nested, num_images=num_images)

            # Image processor should return same pixel values, independently of input format
            self.assertTrue((process_out_nested.pixel_values == encoded_images).all())

    def test_custom_image_size(self):
        for image_processing_class in self.image_processing_classes.values():
            image_processing = image_processing_class(**self.image_processor_dict)
            with tempfile.TemporaryDirectory() as tmpdirname:
                image_processing.save_pretrained(tmpdirname)
                # Reload with pixel bounds that override the saved ones.
                image_processor_loaded = image_processing_class.from_pretrained(
                    tmpdirname, max_pixels=56 * 56, min_pixels=28 * 28
                )

            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
            process_out = image_processor_loaded(image_inputs, return_tensors="pt")
            expected_output_image_shape = [112, 1176]
            self.assertListEqual(list(process_out.pixel_values.shape), expected_output_image_shape)

    def test_custom_pixels(self):
        pixel_choices = frozenset(itertools.product((100, 150, 200, 20000), (100, 150, 200, 20000)))
        for image_processing_class in self.image_processing_classes.values():
            image_processor_dict = self.image_processor_dict.copy()
            for a_pixels, b_pixels in pixel_choices:
                # Order the pair so that min_pixels never exceeds max_pixels.
                image_processor_dict["min_pixels"] = min(a_pixels, b_pixels)
                image_processor_dict["max_pixels"] = max(a_pixels, b_pixels)
                image_processor = image_processing_class(**image_processor_dict)
                image_inputs = self.image_processor_tester.prepare_image_inputs()
                # Just checking that it doesn't raise an error
                image_processor(image_inputs, return_tensors="pt")

    @require_vision
    @require_torch
    def test_backends_equivalence(self):
        if len(self.image_processing_classes) < 2:
            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")

        # The reference image is downloaded at test time.
        dummy_image = Image.open(
            io.BytesIO(
                httpx.get("http://images.cocodataset.org/val2017/000000039769.jpg", follow_redirects=True).content
            )
        )

        # Create processors for each backend
        encodings = {}
        for backend_name, image_processing_class in self.image_processing_classes.items():
            image_processor = image_processing_class(**self.image_processor_dict)
            encodings[backend_name] = image_processor(dummy_image, return_tensors="pt")

        # Compare all backends to the first one (reference backend)
        self._assert_backend_encodings_match(encodings)

    @require_vision
    @require_torch
    def test_backends_equivalence_batched(self):
        if len(self.image_processing_classes) < 2:
            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")

        if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
            self.skipTest(
                reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
            )

        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)

        # Create processors for each backend
        encodings = {}
        for backend_name, image_processing_class in self.image_processing_classes.items():
            image_processor = image_processing_class(**self.image_processor_dict)
            encodings[backend_name] = image_processor(dummy_images, return_tensors="pt")

        # Compare all backends to the first one (reference backend)
        self._assert_backend_encodings_match(encodings)

    def test_get_num_patches_without_images(self):
        for image_processing_class in self.image_processing_classes.values():
            image_processing = image_processing_class(**self.image_processor_dict)

            num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
            self.assertEqual(num_patches, 64)

            num_patches = image_processing.get_number_of_image_patches(height=200, width=50, images_kwargs={})
            self.assertEqual(num_patches, 56)

            # A per-call `patch_size` override changes the resulting patch count.
            num_patches = image_processing.get_number_of_image_patches(
                height=100, width=100, images_kwargs={"patch_size": 28}
            )
            self.assertEqual(num_patches, 16)

View File

@@ -0,0 +1,721 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Qwen2-VL model."""
import copy
import gc
import tempfile
import unittest
import pytest
import requests
from transformers import (
AutoProcessor,
Qwen2VLConfig,
Qwen2VLForConditionalGeneration,
Qwen2VLModel,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import (
Expectations,
backend_empty_cache,
require_flash_attn,
require_torch,
require_torch_accelerator,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
ids_tensor,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class Qwen2VLVisionText2TextModelTester:
    """
    Builds a tiny `Qwen2VLConfig` and matching dummy inputs (one image token per sample) for the
    Qwen2-VL model tests.
    """

    def __init__(
        self,
        parent,
        batch_size=3,
        seq_length=7,
        num_channels=3,
        ignore_index=-100,
        image_size=14,
        text_config=None,
        vision_start_token_id=3,
        image_token_id=4,
        video_token_id=5,
        is_training=True,
        vision_config=None,
    ):
        """
        Args:
            parent: the test case that owns this tester.
            text_config: dict of text-model settings; when `None`, a fresh default dict is built
                for this instance. A dict passed by the caller is stored as-is (not copied).
            vision_config: dict of vision-model settings; same `None` handling as `text_config`.
            seq_length: number of text positions; `num_image_tokens` is added on top of it.
        """
        # Build the defaults per instance so that one tester mutating its config dicts can never
        # leak into testers created later.
        if text_config is None:
            text_config = {
                "bos_token_id": 0,
                "eos_token_id": 1,
                "pad_token_id": 2,
                "hidden_act": "silu",
                "hidden_size": 32,
                "vocab_size": 99,
                "intermediate_size": 37,
                "max_position_embeddings": 512,
                "max_window_layers": 3,
                "num_attention_heads": 4,
                "num_hidden_layers": 2,
                "num_key_value_heads": 2,
                "rope_theta": 10000,
                "tie_word_embeddings": True,
                "rope_parameters": {"type": "mrope", "mrope_section": [2, 1, 1]},
            }
        if vision_config is None:
            vision_config = {
                "depth": 2,
                "embed_dim": 32,
                "hidden_act": "quick_gelu",
                "hidden_size": 32,
                "mlp_ratio": 4,
                "num_heads": 4,
                "patch_size": 14,
                "spatial_merge_size": 1,
                "temporal_patch_size": 2,
            }

        self.parent = parent
        self.ignore_index = ignore_index
        self.bos_token_id = text_config["bos_token_id"]
        self.eos_token_id = text_config["eos_token_id"]
        self.pad_token_id = text_config["pad_token_id"]
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.hidden_size = text_config["hidden_size"]
        self.vision_start_token_id = vision_start_token_id
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.text_config = text_config
        self.vision_config = vision_config
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.is_training = is_training
        self.vocab_size = text_config["vocab_size"]
        self.num_image_tokens = 32
        self.seq_length = seq_length + self.num_image_tokens

    def get_config(self):
        """Create the `Qwen2VLConfig` from the stored text / vision dicts and special token ids."""
        return Qwen2VLConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            vision_start_token_id=self.vision_start_token_id,
            image_token_id=self.image_token_id,
            video_token_id=self.video_token_id,
        )

    def prepare_config_and_inputs(self):
        """
        Return the config together with random `pixel_values`.

        `pixel_values` has `batch_size * image_size**2 // patch_size**2` rows and
        `num_channels * patch_size**2 * temporal_patch_size` columns.
        """
        config = self.get_config()
        patch_size = config.vision_config.patch_size
        temporal_patch_size = config.vision_config.temporal_patch_size
        pixel_values = floats_tensor(
            [
                self.batch_size * (self.image_size**2) // (patch_size**2),
                self.num_channels * (patch_size**2) * temporal_patch_size,
            ]
        )

        return config, pixel_values

    def prepare_config_and_inputs_for_common(self):
        """
        Return the config and a dict of model inputs in which every sample contains exactly one
        image token preceded by a vision-start token.
        """
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

        # The last position of every sample is padding and is masked out.
        input_ids[:, -1] = self.pad_token_id
        attention_mask[:, -1] = 0
        # Random ids may collide with the special vision tokens; replace them with padding so the
        # only special tokens are the ones placed explicitly below.
        input_ids[input_ids == self.video_token_id] = self.pad_token_id
        input_ids[input_ids == self.image_token_id] = self.pad_token_id
        input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
        input_ids[:, self.num_image_tokens] = self.image_token_id
        input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id

        # Token type 1 marks the single image token; everything else stays 0.
        mm_token_type_ids = torch.zeros_like(input_ids)
        mm_token_type_ids[:, self.num_image_tokens] = 1

        inputs_dict = {
            "pixel_values": pixel_values,
            "image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size, device=torch_device),
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "mm_token_type_ids": mm_token_type_ids,
        }
        return config, inputs_dict
@require_torch
class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Model tester for `Qwen2VLForConditionalGeneration`.
"""
all_model_classes = (
(
Qwen2VLModel,
Qwen2VLForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {
"image-text-to-text": Qwen2VLForConditionalGeneration,
"any-to-any": Qwen2VLForConditionalGeneration,
}
_is_composite = True
def setUp(self):
    """Create the model tester and the config tester used by the tests of this class."""
    model_tester = Qwen2VLVisionText2TextModelTester(self)
    self.model_tester = model_tester
    self.config_tester = ConfigTester(
        self,
        config_class=Qwen2VLConfig,
        has_text_modality=False,
    )
def test_config(self):
    """Run the shared configuration checks provided by the config tester."""
    config_tester = self.config_tester
    config_tester.run_common_tests()
def test_mismatching_num_image_tokens(self):
    """
    Checks that the model raises a `ValueError` with an explicit message when the number of images
    does not match the number of image tokens in the text.
    The multi-image case, where the text carries several image tokens, is covered as well.
    """
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    mismatch_message = "Image features and image tokens do not match"

    for model_class in self.all_model_classes:
        model = model_class(config).to(torch_device)
        model.eval()
        inputs = copy.deepcopy(input_dict)
        model(**inputs)  # the unmodified inputs must go through without error

        # Drop all images but the last one while every sample keeps its image token.
        rows_per_image = (self.model_tester.image_size**2) // (config.vision_config.patch_size**2)
        inputs["pixel_values"] = inputs["pixel_values"][-rows_per_image:, ...]
        inputs["image_grid_thw"] = inputs["image_grid_thw"][-1:, ...]
        with self.assertRaisesRegex(ValueError, mismatch_message):
            model(**inputs)

        # Clear the rope deltas stored on the base model before the next call.
        model.base_model.rope_deltas = None

        # Emulate a multi-image batch: the first sample's text is duplicated (two image tokens
        # in total) while only a single image is provided.
        text_inputs = {
            "input_ids": torch.cat([inputs["input_ids"][:1]] * 2, dim=0),
            "mm_token_type_ids": torch.cat([inputs["mm_token_type_ids"][:1]] * 2, dim=0),
        }
        single_image = inputs["pixel_values"][:rows_per_image]
        single_grid = inputs["image_grid_thw"][:1]
        with self.assertRaisesRegex(ValueError, mismatch_message):
            model(pixel_values=single_image, image_grid_thw=single_grid, **text_inputs)

        model.base_model.rope_deltas = None

        # Two images for two image tokens is consistent and must not raise.
        model(
            pixel_values=torch.cat([single_image, single_image], dim=0),
            image_grid_thw=torch.cat([single_grid, single_grid], dim=0),
            **text_inputs,
        )
def test_forward_with_rope_deltas_cached(self):
    """
    Checks that Qwen2-VL computes new rope deltas on every forward pass that receives a new set of inputs.
    Rope deltas are cached during generation and re-used for the decoding phase, but they are not reset
    automatically after generation ends. See https://github.com/huggingface/transformers/pull/36013 for more
    """
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    tolerances = {"rtol": 1e-4, "atol": 1e-4}
    generate_kwargs = {"return_dict_in_generate": True, "output_logits": True}

    for model_class in self.all_generative_model_classes:
        model = model_class(config).to(torch_device)

        # A freshly built model has no rope deltas; generating must populate them.
        self.assertIsNone(model.model.rope_deltas)
        first_generation = model.generate(**input_dict, max_new_tokens=4, **generate_kwargs)
        self.assertIsNotNone(model.model.rope_deltas)

        # A plain forward pass is called without a cache, so its last-position logits have to
        # match the first generation step.
        forward_output = model(**input_dict)
        torch.testing.assert_close(first_generation.logits[0], forward_output.logits[:, -1, :], **tolerances)

        # Calling `generate` a second time must reproduce the same first-step logits as well.
        second_generation = model.generate(**input_dict, max_new_tokens=10, **generate_kwargs)
        torch.testing.assert_close(first_generation.logits[0], second_generation.logits[0], **tolerances)
def test_vision_position_ids(self):
    """
    Checks that vision position ids are built correctly for images and for videos.
    See https://github.com/huggingface/transformers/pull/45400
    """
    config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
    model = Qwen2VLModel(config).to(torch_device)
    batch_size = input_dict["input_ids"].shape[0]

    def assert_positions(actual, expected, expected_shape):
        # Compare both the shape and the exact values of the returned position ids.
        self.assertListEqual(list(actual.shape), expected_shape)
        self.assertListEqual(actual.tolist(), expected.tolist())

    # Simplest case: one image token per sample, so positions are subsequent and text-like.
    position_ids = model.get_rope_index(
        input_dict["input_ids"], input_dict["mm_token_type_ids"], input_dict["image_grid_thw"]
    )[0]
    text_like_positions = torch.arange(39)[None, None, :].repeat(3, batch_size, 1)
    assert_positions(position_ids, text_like_positions, [3, batch_size, 39])

    image_token_id = config.image_token_id
    pad_token_id = config.text_config.pad_token_id

    # An image spanning several tokens: 4 height x 3 width patches = 12 tokens.
    image_ids = [pad_token_id] + [image_token_id] * 12 + [pad_token_id]
    image_types = [0] + [1] * 12 + [0]
    position_ids = model.get_rope_index(
        torch.tensor([image_ids], device=torch_device),
        torch.tensor([image_types], device=torch_device),
        torch.tensor([[1, 4, 3]], device=torch_device),
    )[0]
    image_positions = torch.tensor(
        [
            [[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5]],
            [[0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5]],
            [[0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 5]],
        ]
    )
    assert_positions(position_ids, image_positions, [3, 1, 14])

    # A video with 2 frames of 4 height x 3 width patches each (= 12 * 2 tokens).
    video_token_id = config.video_token_id
    video_ids = [pad_token_id] + [video_token_id] * 24 + [pad_token_id]
    video_types = [0] + [2] * 24 + [0]
    position_ids = model.get_rope_index(
        torch.tensor([video_ids], device=torch_device),
        torch.tensor([video_types], device=torch_device),
        video_grid_thw=torch.tensor([[2, 4, 3]], device=torch_device),
    )[0]
    video_positions = torch.tensor(
        [
            [[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5]],
            [[0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5]],
            [[0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 5]],
        ]
    )
    assert_positions(position_ids, video_positions, [3, 1, 26])
def attention_mask_padding_matches_padding_free_with_position_ids(
    self, attn_implementation: str, fa_kwargs: bool = False
):
    """
    Compare the logits of a padded forward pass with those of a padding-free (flattened) forward
    pass that relies on explicit position ids.

    Args:
        attn_implementation: value forwarded to `from_pretrained(attn_implementation=...)`.
        fa_kwargs: when `True`, the padding-free call additionally receives `cu_seq_lens_q/k` and
            `max_length_q/k`.
    """
    max_new_tokens = 30

    for model_class in self.all_generative_model_classes:
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        dummy_input = inputs_dict[model_class.main_input_name]
        if dummy_input.dtype in [torch.float32, torch.float16]:
            dummy_input = dummy_input.to(torch.bfloat16)

        # make sure that all models have enough positions for generation
        if hasattr(config, "max_position_embeddings"):
            config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1

        model = model_class(config)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)

            # If any sample is masked at its last position, reverse the mask along the sequence
            # axis so the masked positions end up at the start instead.
            if 0 in inputs_dict["attention_mask"][:, -1]:
                inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1)
            dummy_attention_mask = inputs_dict["attention_mask"]
            # Masked positions must hold the pad token.
            inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.get_text_config().pad_token_id

            model = (
                model_class.from_pretrained(
                    tmpdirname,
                    dtype=torch.bfloat16,
                    attn_implementation=attn_implementation,
                )
                .to(torch_device)
                .eval()
            )

            # flatten: keep only the attended tokens and lay them out as one single row
            padfree_inputs_dict = {
                "pixel_values": inputs_dict["pixel_values"],
                "image_grid_thw": inputs_dict["image_grid_thw"],
                "input_ids": inputs_dict["input_ids"][dummy_attention_mask.bool()].unsqueeze(0),
            }

            # add position_ids
            vision_position_ids, deltas = model.model.get_rope_index(
                input_ids=inputs_dict["input_ids"],
                image_grid_thw=inputs_dict["image_grid_thw"],
                attention_mask=inputs_dict["attention_mask"],
                mm_token_type_ids=inputs_dict["mm_token_type_ids"],
            )  # [3, bs, padded-seq-len]
            vision_padfree_positions = vision_position_ids[:, dummy_attention_mask.bool()].view(
                3, -1
            )  # [3, bs*padfree-len]
            # Text positions restart from 0 for every sequence.
            text_padfree_positions = torch.cat(
                [torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()]
            )  # [bs*padfree-len]
            text_padfree_positions = text_padfree_positions.long().unsqueeze(0).to(torch_device)
            # Stack the text row on top of the three vision rows and insert a batch axis of size 1.
            padfree_inputs_dict["position_ids"] = torch.cat([text_padfree_positions, vision_padfree_positions])[
                :, None, :
            ]

            if fa_kwargs:
                seq_lens = dummy_attention_mask.sum(1).tolist()
                # The longest sequence is the maximum of the per-sequence lengths. Taking
                # `diff()` of the raw (non-cumulative) lengths is only correct when the first
                # sequence happens to be the longest one.
                max_length = max(seq_lens)
                cu_seq_lens = torch.tensor([0] + seq_lens, device=torch_device)
                padfree_inputs_dict.update(
                    {
                        # cumulative offsets: [0, l1, l1 + l2, ...]
                        "cu_seq_lens_q": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
                        "cu_seq_lens_k": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
                        "max_length_q": max_length,
                        "max_length_k": max_length,
                    }
                )

            # We need to do simple forward without cache in order to trigger packed SDPA/FLEX/EAGER path
            res_padded = model(**inputs_dict, use_cache=False)
            res_padfree = model(**padfree_inputs_dict, use_cache=False)

            # Only the attended positions of the padded run are comparable with the flattened run.
            logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
            logits_padfree = res_padfree.logits[0]

            # acceptable numerical instability
            tol = torch.finfo(torch.bfloat16).eps
            torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
@unittest.skip(reason="Feedforward chunking is not yet supported")
def test_feed_forward_chunking(self):
    # Deliberately empty: the skip decorator reports this test as skipped, so the body never runs.
    pass
@unittest.skip(reason="CPU offload is not yet supported")
def test_cpu_offload(self):
    # Deliberately empty: the skip decorator reports this test as skipped, so the body never runs.
    pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_disk_offload_bin(self):
    # Deliberately empty: the skip decorator reports this test as skipped, so the body never runs.
    pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_disk_offload_safetensors(self):
    # Deliberately empty: the skip decorator reports this test as skipped, so the body never runs.
    pass
@unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.")
def test_model_parallelism(self):
    # Deliberately empty: the skip decorator reports this test as skipped, so the body never runs.
    pass
# NOTE(review): the skip reason below reads as if it was truncated ("because in Qwen2VL models");
# confirm the intended wording before relying on it.
@unittest.skip(reason="Compile not yet supported because in Qwen2VL models")
def test_sdpa_can_dispatch_on_flash(self):
    # Deliberately empty: the skip decorator reports this test as skipped, so the body never runs.
    pass
@unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.")
def test_multi_gpu_data_parallel_forward(self):
    # Deliberately empty: the skip decorator reports this test as skipped, so the body never runs.
    pass
def test_enable_input_require_grads_with_gradient_checkpointing(self):
    """Check that the vision tower's first qkv projection still receives gradients.

    For every model class that supports gradient checkpointing, all parameters
    are frozen except ``blocks[0].attn.qkv`` of the vision module. Non-reentrant
    gradient checkpointing and ``enable_input_require_grads`` are switched on,
    one forward/backward pass is run, and the qkv weight must end up with a
    non-zero gradient. Model classes without a ``visual`` module are ignored.
    """
    if not self.model_tester.is_training:
        self.skipTest(reason="ModelTester not in training mode")

    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.use_cache = False
    config.return_dict = True

    for model_class in self.all_model_classes:
        if not model_class.supports_gradient_checkpointing:
            continue

        model = model_class(config)
        model.to(torch_device)
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
        model.enable_input_require_grads()
        model.train()

        # Freeze everything first; only the target projection is re-enabled below.
        for param in model.parameters():
            param.requires_grad = False

        # The vision tower lives either directly on the model or on its inner `.model`.
        vision_owner = model if hasattr(model, "visual") else getattr(model, "model", None)
        vision_module = getattr(vision_owner, "visual", None)
        if vision_module is None:
            continue

        target_linear = vision_module.blocks[0].attn.qkv
        target_linear.weight.requires_grad = True
        if target_linear.bias is not None:
            target_linear.bias.requires_grad = True

        inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
        outputs = model(**inputs)

        # Prefer the model's own loss; otherwise reduce the logits to a scalar.
        loss = getattr(outputs, "loss", None)
        if loss is None:
            logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
            loss = logits.sum()
        loss.backward()

        qkv_grad = target_linear.weight.grad
        self.assertIsNotNone(
            qkv_grad,
            f"qkv weights should receive gradients when enable_input_require_grads is used with gradient checkpointing. Model: {model_class.__name__}",
        )
        self.assertGreater(
            target_linear.weight.grad.abs().sum().item(),
            0,
            f"qkv weights should have non-zero gradients when enable_input_require_grads is used with gradient checkpointing. Model: {model_class.__name__}",
        )
@require_torch
class Qwen2VLIntegrationTest(unittest.TestCase):
    """Slow end-to-end generation checks against the ``Qwen/Qwen2-VL-7B-Instruct`` checkpoint.

    Each test loads the checkpoint, builds a chat prompt about a demo dog image
    (downloaded in ``setUp``), generates 30 new tokens and compares the decoded
    text with a recorded expectation.
    """

    MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
    IMAGE_URL = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
    # Decoded prompt + continuation recorded for the dog-image question.
    LABRADOR_REPLY = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices"  # fmt: skip
    # Decoded prompt + continuation recorded for the text-only "Who are you?" question.
    WHO_ARE_YOU_REPLY = "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am a large language model created by Alibaba Cloud. I am called Qwen."  # fmt: skip

    def setUp(self):
        """Load the processor, define the image question and download the demo image."""
        self.processor = AutoProcessor.from_pretrained(self.MODEL_ID)
        self.messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What kind of dog is this?"},
                ],
            }
        ]
        # A timeout keeps a stalled download from hanging the suite, and the status check
        # surfaces HTTP failures directly instead of handing an error page to PIL.
        response = requests.get(self.IMAGE_URL, stream=True, timeout=60)
        response.raise_for_status()
        self.image = Image.open(response.raw)

    def tearDown(self):
        """Release Python garbage and the accelerator cache between tests."""
        gc.collect()
        backend_empty_cache(torch_device)

    def _load_model(self, **overrides):
        """Load the checkpoint with ``dtype="auto"`` and ``device_map="auto"`` unless overridden."""
        load_kwargs = {"dtype": "auto", "device_map": "auto", **overrides}
        return Qwen2VLForConditionalGeneration.from_pretrained(self.MODEL_ID, **load_kwargs)

    def _render_prompt(self, messages=None):
        """Render ``messages`` (default: ``self.messages``) to prompt text with a generation prompt."""
        messages = self.messages if messages is None else messages
        return self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    def _text_only_messages(self):
        """Return a conversation that contains no image."""
        return [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who are you?"},
        ]

    @slow
    def test_small_model_integration_test(self):
        """Check processor outputs (token ids, pixel values) and single-sample generation."""
        model = self._load_model()
        text = self._render_prompt()
        inputs = self.processor(text=[text], images=[self.image], return_tensors="pt")

        expected_input_ids = [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655]  # fmt: skip
        # unittest assertions instead of bare `assert`: they survive `python -O` and report a diff.
        self.assertEqual(expected_input_ids, inputs.input_ids[0].tolist()[:17])

        expected_pixel_slice = torch.tensor(
            [
                [0.8792, 0.8792, 0.9084],
                [1.1858, 1.1858, 1.2296],
                [1.2004, 1.2004, 1.2150],
                [1.4340, 1.4340, 1.4194],
                [1.3902, 1.4048, 1.4194],
                [1.5216, 1.5362, 1.5362],
            ],
            dtype=torch.float32,
            device="cpu",
        )
        self.assertTrue(torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3))

        # verify generation
        inputs = inputs.to(torch_device)
        output = model.generate(**inputs, max_new_tokens=30)
        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            self.LABRADOR_REPLY,
        )

    @slow
    def test_small_model_integration_test_batch(self):
        """Generate for a batch made of the same prompt/image twice."""
        model = self._load_model()
        text = self._render_prompt()
        inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
            torch_device
        )

        # Both rows are identical, so both decoded outputs must match the single-sample text.
        output = model.generate(**inputs, max_new_tokens=30)
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            [self.LABRADOR_REPLY, self.LABRADOR_REPLY],
        )

    @slow
    def test_small_model_integration_test_expand(self):
        """Generate three return sequences from a single prompt."""
        model = self._load_model()
        text = self._render_prompt()
        inputs = self.processor(text=[text], images=[self.image], return_tensors="pt").to(torch_device)

        output = model.generate(**inputs, max_new_tokens=30, num_return_sequences=3)
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            [self.LABRADOR_REPLY, self.LABRADOR_REPLY, self.LABRADOR_REPLY],
        )

    @slow
    def test_small_model_integration_test_batch_wo_image(self):
        """Generate for a padded batch mixing an image prompt with a text-only prompt."""
        model = self._load_model()
        text = self._render_prompt()
        text2 = self._render_prompt(self._text_only_messages())
        inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
            torch_device
        )

        # Only the first row references the image; the second row is pure text.
        output = model.generate(**inputs, max_new_tokens=30)
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            [self.LABRADOR_REPLY, self.WHO_ARE_YOU_REPLY],
        )

    @slow
    def test_small_model_integration_test_batch_different_resolutions(self):
        """Generate for a batch whose two images have different resolutions."""
        model = self._load_model()
        # The same prompt is used for both rows; only the image resolution differs.
        text = self._render_prompt()
        image2 = self.image.resize((224, 224))
        inputs = self.processor(text=[text, text], images=[self.image, image2], padding=True, return_tensors="pt").to(
            torch_device
        )

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)
        DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)

        EXPECTED_DECODED_TEXTS = Expectations(
            {
                ("xpu", 3): [
                    self.LABRADOR_REPLY,
                    self.LABRADOR_REPLY,
                ],
                ("cuda", None): [
                    self.LABRADOR_REPLY,
                    'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets',
                ],
                ("cuda", 8): [
                    self.LABRADOR_REPLY,
                    self.LABRADOR_REPLY,
                ],
            }
        )  # fmt: skip
        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
        self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)

    @slow
    @require_flash_attn
    @require_torch_accelerator
    @pytest.mark.flash_attn_test
    def test_small_model_integration_test_batch_flashatt2(self):
        """Same as the batch test, but with bfloat16 weights and ``flash_attention_2``."""
        model = self._load_model(dtype=torch.bfloat16, attn_implementation="flash_attention_2")
        text = self._render_prompt()
        inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
            torch_device
        )

        # Both rows are identical, so both decoded outputs must match the single-sample text.
        output = model.generate(**inputs, max_new_tokens=30)
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            [self.LABRADOR_REPLY, self.LABRADOR_REPLY],
        )

    @slow
    @require_flash_attn
    @require_torch_accelerator
    @pytest.mark.flash_attn_test
    def test_small_model_integration_test_batch_wo_image_flashatt2(self):
        """Same as the mixed image/text batch test, but with bfloat16 and ``flash_attention_2``."""
        model = self._load_model(dtype=torch.bfloat16, attn_implementation="flash_attention_2")
        text = self._render_prompt()
        text2 = self._render_prompt(self._text_only_messages())
        inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="pt").to(
            torch_device
        )

        # Only the first row references the image; the second row is pure text.
        output = model.generate(**inputs, max_new_tokens=30)
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            [self.LABRADOR_REPLY, self.WHO_ARE_YOU_REPLY],
        )

View File

@@ -0,0 +1,300 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_vision_available():
from transformers import Qwen2VLProcessor
if is_torchvision_available():
pass
if is_torch_available():
import torch
@require_vision
@require_torch
@require_torchvision
class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for ``Qwen2VLProcessor`` layered on the shared ``ProcessorTesterMixin``."""

    processor_class = Qwen2VLProcessor
    model_id = "Qwen/Qwen2-VL-7B-Instruct"

    @classmethod
    def _setup_from_pretrained(cls, model_id, **kwargs):
        """Delegate to the mixin while pinning ``patch_size=4`` and a 28*28..56*56 pixel budget."""
        return super()._setup_from_pretrained(model_id, patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28, **kwargs)

    @classmethod
    def _setup_test_attributes(cls, processor):
        """Expose the processor's image placeholder token on the test class."""
        cls.image_token = processor.image_token

    def test_get_num_vision_tokens(self):
        "Tests general functionality of the helper used internally in vLLM"
        processor = self.get_processor()

        output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
        # One entry per requested image size is expected under both keys.
        self.assertIn("num_image_tokens", output)
        self.assertEqual(len(output["num_image_tokens"]), 3)

        self.assertIn("num_image_patches", output)
        self.assertEqual(len(output["num_image_patches"]), 3)

    @require_torch
    @require_av
    def _test_apply_chat_template(
        self,
        modality: str,
        batch_size: int,
        return_tensors: str,
        input_name: str,
        processor_name: str,
        input_data: list[str],
    ):
        """Exercise ``apply_chat_template`` for text-only and multimodal batches.

        Args:
            modality: Content type inserted into the messages (compared against ``"video"`` below).
            batch_size: Number of conversations in the batch.
            return_tensors: Tensor format requested from the processor (``"pt"``, ``"np"`` or ``None``).
            input_name: Name of the attribute on ``self`` that stores the key of the
                multimodal entry in the processor output.
            processor_name: Sub-processor attribute that must exist on ``processor_class``;
                the test is skipped otherwise.
            input_data: URLs used as the multimodal inputs.
        """
        processor = self.get_processor()
        if processor.chat_template is None:
            self.skipTest("Processor has no chat template")

        if processor_name not in self.processor_class.get_attributes():
            self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")

        # NOTE(review): `* batch_size` repeats references to ONE inner list/dict, so the
        # per-index assignment further down rewrites the same message each time and every
        # batch entry ends up with the last URL. Kept as-is because the hard-coded
        # `batch_size * 192` expectation below may rely on it -- confirm before changing.
        batch_messages = [
            [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": "Describe this."}],
                },
            ]
        ] * batch_size

        # Test that jinja can be applied
        formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
        self.assertEqual(len(formatted_prompt), batch_size)

        # Test that tokenizing with template and directly with `self.tokenizer` gives same output
        formatted_prompt_tokenized = processor.apply_chat_template(
            batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
        )
        # Avoid a doubled BOS token when the template already emitted one.
        add_special_tokens = True
        if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
            add_special_tokens = False
        tok_output = processor.tokenizer(
            formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
        )
        expected_output = tok_output.input_ids
        self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())

        # Test that kwargs passed to processor's `__call__` are actually used
        tokenized_prompt_100 = processor.apply_chat_template(
            batch_messages,
            add_generation_prompt=True,
            tokenize=True,
            padding="max_length",
            truncation=True,
            return_tensors=return_tensors,
            max_length=100,
        )
        self.assertEqual(len(tokenized_prompt_100[0]), 100)

        # Test that `return_dict=True` returns text related inputs in the dict
        out_dict_text = processor.apply_chat_template(
            batch_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors=return_tensors,
        )
        for text_key in ["input_ids", "attention_mask"]:
            self.assertIn(text_key, out_dict_text)
        self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
        self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)

        # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
        for idx, url in enumerate(input_data[:batch_size]):
            batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]

        out_dict = processor.apply_chat_template(
            batch_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors=return_tensors,
            num_frames=2,  # by default no more than 2 frames, otherwise too slow
        )
        # Resolve the attribute name to the actual output key without rebinding the parameter.
        mm_output_key = getattr(self, input_name)
        self.assertIn(mm_output_key, out_dict)
        self.assertEqual(len(out_dict["input_ids"]), batch_size)
        self.assertEqual(len(out_dict["attention_mask"]), batch_size)

        if modality == "video":
            # qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
            expected_video_token_count = 0
            for thw in out_dict["video_grid_thw"]:
                expected_video_token_count += thw[0] * thw[1] * thw[2]
            mm_len = expected_video_token_count
        else:
            mm_len = batch_size * 192
        self.assertEqual(len(out_dict[mm_output_key]), mm_len)

        # Every returned entry must use the container type matching `return_tensors`.
        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
        for k in out_dict:
            self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors])

    @require_av
    def test_apply_chat_template_video_frame_sampling(self):
        """Check video loading with ``num_frames``, ``fps``, both, neither, and pre-split frames."""
        processor = self.get_processor()

        if processor.chat_template is None:
            self.skipTest("Processor has no chat template")

        if "video_processor" not in self.processor_class.get_attributes():
            self.skipTest("Processor doesn't accept videos at input")

        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "video",
                            "url": url_to_local_path(
                                "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
                            ),
                        },
                        {"type": "text", "text": "What is shown in this video?"},
                    ],
                },
            ]
        ]

        # Load with `num_frames` arg
        num_frames = 3
        out_dict_with_video = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            num_frames=num_frames,
        )
        self.assertIn(self.videos_input_name, out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)

        # Load with `fps` arg
        fps = 1
        out_dict_with_video = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            fps=fps,
        )
        self.assertIn(self.videos_input_name, out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 360)

        # Load with `fps` and `num_frames` args, should raise an error
        with self.assertRaises(ValueError):
            processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                fps=fps,
                num_frames=num_frames,
            )

        # Load without any arg should load the whole video
        out_dict_with_video = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
        )
        self.assertIn(self.videos_input_name, out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1080)

        # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
        # because we assume they come from one video
        messages[0][0]["content"][0] = {
            "type": "video",
            "url": [
                url_to_local_path(
                    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
                ),
                url_to_local_path(
                    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
                ),
            ],
        }
        out_dict_with_video = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
        )
        self.assertIn(self.videos_input_name, out_dict_with_video)
        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)

        # When the inputs are frame URLs/paths we expect that those are already
        # sampled and will raise an error if asked to sample again.
        with self.assertRaisesRegex(
            ValueError, "Sampling frames from a list of images is not supported! Set `do_sample_frames=False`"
        ):
            processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                do_sample_frames=True,
            )

    def test_kwargs_overrides_custom_image_processor_kwargs(self):
        """Check that a call-time ``max_pixels`` changes the number of image rows produced."""
        processor = self.get_processor()
        self.skip_processor_without_typed_kwargs(processor)

        input_str = self.prepare_text_inputs()
        image_input = self.prepare_image_inputs()

        # Processor defaults first, then a larger pixel budget passed at call time.
        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
        self.assertEqual(inputs[self.images_input_name].shape[0], 100)
        inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
        self.assertEqual(inputs[self.images_input_name].shape[0], 612)

    def test_special_mm_token_truncation(self):
        """Tests that special vision tokens do not get truncated when `truncation=True` is set."""
        processor = self.get_processor()

        input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
        image_input = self.prepare_image_inputs(batch_size=2)

        # Without truncation the call must simply succeed.
        _ = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            truncation=None,
            padding=True,
        )

        # Truncating to 20 tokens is expected to be rejected with a ValueError.
        with self.assertRaises(ValueError):
            _ = processor(
                text=input_str,
                images=image_input,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=20,
            )

View File

@@ -0,0 +1,395 @@
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import tempfile
import unittest
import numpy as np
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers.image_utils import get_image_size
from transformers.models.qwen2_vl.video_processing_qwen2_vl import smart_resize
if is_torchvision_available():
from transformers import Qwen2VLVideoProcessor
class Qwen2VLVideoProcessingTester:
    """Configuration holder and input factory for the Qwen2-VL video processor tests.

    Stores the processor settings, builds the kwargs dict used to instantiate a
    video processor, creates random video inputs and predicts the shape of the
    processor output for a given batch of videos.
    """

    def __init__(
        self,
        parent,
        batch_size=5,
        num_frames=8,
        num_channels=3,
        min_resolution=30,
        max_resolution=80,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=OPENAI_CLIP_MEAN,
        image_std=OPENAI_CLIP_STD,
        do_convert_rgb=True,
        temporal_patch_size=2,
        patch_size=14,
        min_pixels=20 * 20,
        max_pixels=100 * 100,
        merge_size=2,
    ):
        self.parent = parent

        # shape of the random inputs
        self.batch_size = batch_size
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution

        # resizing / normalization settings
        self.do_resize = do_resize
        self.size = {"shortest_edge": 400, "longest_edge": 10000} if size is None else size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb

        # patching settings
        self.temporal_patch_size = temporal_patch_size
        self.patch_size = patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.merge_size = merge_size

    def prepare_video_processor_dict(self):
        """Return the keyword arguments used to build a video processor (``size`` is not included)."""
        exported_fields = (
            "do_resize",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
            "temporal_patch_size",
            "patch_size",
            "min_pixels",
            "max_pixels",
            "merge_size",
        )
        return {field: getattr(self, field) for field in exported_fields}

    @require_vision
    def expected_output_video_shape(self, videos, num_frames=None):
        """Return ``[seq_len, hidden_dim]`` expected from the processor for ``videos``.

        ``seq_len`` is the sum over all videos of ``grid_t * grid_h * grid_w``, where the
        spatial grid comes from ``smart_resize`` applied to each video's height/width.
        ``num_frames`` falls back to ``self.num_frames`` when it is ``None``.
        """
        if num_frames is None:
            num_frames = self.num_frames
        temporal_grid = num_frames // self.temporal_patch_size
        resize_factor = self.patch_size * self.merge_size

        total_patches = 0
        for video in videos:
            # A list of PIL frames is stacked into one array before its size is read.
            if isinstance(video[0], Image.Image):
                video = np.stack([np.array(frame) for frame in video])
            height, width = get_image_size(video)
            new_height, new_width = smart_resize(
                height,
                width,
                factor=resize_factor,
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
            )
            rows = new_height // self.patch_size
            cols = new_width // self.patch_size
            total_patches += temporal_grid * rows * cols

        feature_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
        return [total_patches, feature_dim]

    def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
        """Build video inputs from the stored batch/frame/channel/resolution settings."""
        return prepare_video_inputs(
            batch_size=self.batch_size,
            num_frames=self.num_frames,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            return_tensors=return_tensors,
        )
@require_torch
@require_vision
class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
fast_video_processing_class = Qwen2VLVideoProcessor if is_torchvision_available() else None
def setUp(self):
    # Run the parent-class setup first, then attach the tester that supplies
    # processor kwargs, random video inputs and expected output shapes.
    super().setUp()
    self.video_processor_tester = Qwen2VLVideoProcessingTester(self)
@property
def video_processor_dict(self):
    # Each access asks the tester for its processor kwargs dict.
    return self.video_processor_tester.prepare_video_processor_dict()
def test_video_processor_properties(self):
    """The instantiated video processor must expose all of its core configuration attributes."""
    video_processing = self.fast_video_processing_class(**self.video_processor_dict)
    required_attributes = (
        "do_resize",
        "size",
        "do_normalize",
        "image_mean",
        "image_std",
        "do_convert_rgb",
    )
    for attribute in required_attributes:
        self.assertTrue(hasattr(video_processing, attribute))
def test_video_processor_from_dict_with_kwargs(self):
    """Check how ``size`` resolves in ``from_dict`` with and without ``min_pixels``/``max_pixels``."""
    processor_cls = self.fast_video_processing_class

    # No explicit size: the resulting size reflects the configured pixel bounds.
    from_defaults = processor_cls.from_dict(self.video_processor_dict)
    self.assertEqual(from_defaults.size, {"shortest_edge": 400, "longest_edge": 10000})

    # min_pixels and max_pixels take precedence over size, like in the image processor.
    with_ignored_size = processor_cls.from_dict(
        self.video_processor_dict, size={"shortest_edge": 100, "longest_edge": 200}
    )
    self.assertEqual(with_ignored_size.size, {"shortest_edge": 400, "longest_edge": 10000})

    # Once the pixel bounds are dropped from the dict, the explicit size is used.
    processor_dict = self.video_processor_dict.copy()
    for pixel_key in ("min_pixels", "max_pixels"):
        processor_dict.pop(pixel_key)
    with_explicit_size = processor_cls.from_dict(processor_dict, size={"shortest_edge": 100, "longest_edge": 200})
    self.assertEqual(with_explicit_size.size, {"shortest_edge": 100, "longest_edge": 200})
def test_video_processor_to_json_string(self):
    """Serialised JSON must echo every init kwarg except ``min_pixels`` and ``max_pixels``."""
    unchecked_keys = ["min_pixels", "max_pixels"]
    for video_processing_class in self.video_processor_list:
        video_processor = video_processing_class(**self.video_processor_dict)
        serialised = json.loads(video_processor.to_json_string())
        for key, value in self.video_processor_dict.items():
            if key in unchecked_keys:
                continue
            self.assertEqual(serialised[key], value)
def test_call_pil(self):
    """Encode PIL-frame videos singly and batched; output shapes must match the tester's prediction."""
    tester = self.video_processor_tester
    for video_processing_class in self.video_processor_list:
        processor = video_processing_class(**self.video_processor_dict)
        videos = tester.prepare_video_inputs(equal_resolution=False, return_tensors="pil")

        # Each video is a list of PIL Images
        for frames in videos:
            self.assertIsInstance(frames[0], Image.Image)

        # Single, un-batched video
        single_output = processor(videos[0], return_tensors="pt")[self.input_name]
        self.assertEqual(list(single_output.shape), tester.expected_output_video_shape([videos[0]]))

        # Whole batch
        batch_output = processor(videos, return_tensors="pt")[self.input_name]
        self.assertEqual(list(batch_output.shape), tester.expected_output_video_shape(videos))
def test_call_numpy(self):
    """Encode NumPy videos singly and batched; output shapes must match the tester's prediction."""
    tester = self.video_processor_tester
    for video_processing_class in self.video_processor_list:
        processor = video_processing_class(**self.video_processor_dict)

        # Random videos delivered as numpy arrays
        videos = tester.prepare_video_inputs(equal_resolution=False, return_tensors="np")
        for array_video in videos:
            self.assertIsInstance(array_video, np.ndarray)

        # Single, un-batched video
        single_output = processor(videos[0], return_tensors="pt")[self.input_name]
        self.assertEqual(list(single_output.shape), tester.expected_output_video_shape([videos[0]]))

        # Whole batch
        batch_output = processor(videos, return_tensors="pt")[self.input_name]
        self.assertEqual(list(batch_output.shape), tester.expected_output_video_shape(videos))
def test_call_pytorch(self):
    """Encode torch-tensor videos singly and batched; output shapes must match the tester's prediction."""
    tester = self.video_processor_tester
    for video_processing_class in self.video_processor_list:
        processor = video_processing_class(**self.video_processor_dict)

        # Random videos delivered as PyTorch tensors
        videos = tester.prepare_video_inputs(equal_resolution=False, return_tensors="torch")
        for tensor_video in videos:
            self.assertIsInstance(tensor_video, torch.Tensor)

        # Single, un-batched video
        single_output = processor(videos[0], return_tensors="pt")[self.input_name]
        self.assertEqual(list(single_output.shape), tester.expected_output_video_shape([videos[0]]))

        # Whole batch (the expected shape is computed before encoding, as in the single case's inputs)
        batch_expected_shape = tester.expected_output_video_shape(videos)
        batch_output = processor(videos, return_tensors="pt")[self.input_name]
        self.assertEqual(list(batch_output.shape), batch_expected_shape)
def test_nested_input(self):
    """Tests that the processor can work with nested list where each video is a list of arrays"""
    tester = self.video_processor_tester
    for video_processing_class in self.video_processor_list:
        processor = video_processing_class(**self.video_processor_dict)
        videos = tester.prepare_video_inputs(equal_resolution=False, return_tensors="np")

        # Turn every array video into a plain list of per-frame arrays.
        nested_videos = [list(video) for video in videos]

        # Single, un-batched video (the expected shape is derived from the original array)
        single_output = processor(nested_videos[0], return_tensors="pt")[self.input_name]
        self.assertEqual(list(single_output.shape), tester.expected_output_video_shape([videos[0]]))

        # Whole batch
        batch_expected_shape = tester.expected_output_video_shape(videos)
        batch_output = processor(nested_videos, return_tensors="pt")[self.input_name]
        self.assertEqual(list(batch_output.shape), batch_expected_shape)
@unittest.skip("Skip for now, the test needs adjustment fo Qwen2VL")
def test_call_numpy_4_channels(self):
for video_processing_class in self.video_processor_list:
# Test that can process videos which have an arbitrary number of channels
# Initialize video_processing
video_processor = video_processing_class(**self.video_processor_dict)
# create random numpy tensors
self.video_processor_tester.num_channels = 4
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="np"
)
# Test not batched input
encoded_videos = video_processor(
video_inputs[0],
return_tensors="pt",
input_data_format="channels_last",
image_mean=(0.0, 0.0, 0.0, 0.0),
image_std=(1.0, 1.0, 1.0, 1.0),
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = video_processor(
video_inputs,
return_tensors="pt",
input_data_format="channels_last",
image_mean=(0.0, 0.0, 0.0, 0.0),
image_std=(1.0, 1.0, 1.0, 1.0),
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
def test_call_sample_frames(self):
for video_processing_class in self.video_processor_list:
video_processing = video_processing_class(**self.video_processor_dict)
prev_num_frames = self.video_processor_tester.num_frames
self.video_processor_tester.num_frames = 8
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False,
return_tensors="torch",
)
# Force set sampling to False. No sampling is expected even when `num_frames` exists
video_processing.do_sample_frames = False
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
video_processing.do_sample_frames = True
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=4)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=4)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
[video_inputs[0]], num_frames=4
)
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
video_inputs, num_frames=4
)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
batched_metadata = metadata * len(video_inputs)
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
self.input_name
]
encoded_videos_batched = video_processing(
video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
[video_inputs[0]], num_frames=6
)
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
video_inputs, num_frames=6
)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
# We should raise error when asked to sample more frames than there are in input video
with self.assertRaises(ValueError):
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
self.input_name
]
# Assign back the actual num frames in tester
self.video_processor_tester.num_frames = prev_num_frames
def test_num_frames_equal_temporal_patch_size_plus_two(self):
for video_processing_class in self.video_processor_list:
video_processor_dict = self.video_processor_dict.copy()
video_processor_dict["size"] = {"longest_edge": 5 * 28 * 28, "shortest_edge": 28 * 28}
video_processor_dict["do_sample_frames"] = False
temporal_patch_size = 3
video_processor_dict["temporal_patch_size"] = temporal_patch_size
video_processing = video_processing_class(**video_processor_dict)
n, w, h = 5, 28, 28
video_inputs = [(np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)) for _ in range(n)]
video_processed = video_processing(video_inputs, return_tensors="pt")
encoded_videos = video_processed[self.input_name]
self.assertEqual(list(encoded_videos.shape), [8, temporal_patch_size * 3 * 14 * 14])
video_grid_thw = video_processed["video_grid_thw"]
self.assertEqual(video_grid_thw.tolist(), [[2, 2, 2]])
def test_bc_min_max_pixels(self):
for video_processing_class in self.video_processor_list:
video_processing = video_processing_class(**self.video_processor_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
video_processing.save_pretrained(tmpdirname)
video_processing_loaded = video_processing_class.from_pretrained(
tmpdirname, max_pixels=56 * 56, min_pixels=28 * 28
)
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=True,
return_tensors="torch",
)
processed = video_processing_loaded(video_inputs, return_tensors="pt")
expected_output_video_shape = [320, 1176]
self.assertListEqual(list(processed.pixel_values_videos.shape), expected_output_video_shape)