Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
185 lines
7.7 KiB
Python
185 lines
7.7 KiB
Python
# Copyright 2026 HuggingFace Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import copy
|
|
import unittest
|
|
from inspect import signature
|
|
|
|
from .multimodal_tester import MultiModalModelTest, MultiModalModelTester
|
|
from .test_modeling_common import (
|
|
floats_tensor,
|
|
is_torch_available,
|
|
torch_device,
|
|
)
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
|
|
class VLMModelTester(MultiModalModelTester):
    """
    Model tester for vision-language models, layered on top of `MultiModalModelTester`.

    It fills in image-related defaults, builds the `vision_config` sub-config from
    `vision_config_class`, and contributes `pixel_values` plus image placeholder tokens
    through the hook methods defined below.
    """

    # Config class instantiated by `get_vision_config`; unset here and named in `_required_attributes`.
    vision_config_class = None
    _required_attributes = MultiModalModelTester._required_attributes + ("base_model_class", "vision_config_class")

    @property
    def pipeline_model_mapping(self):
        """Map the two pipeline task names to this tester's base and conditional-generation classes."""
        mapping = {}
        mapping["feature-extraction"] = self.base_model_class
        mapping["image-text-to-text"] = self.conditional_generation_class
        return mapping

    def __init__(self, parent, **kwargs):
        """
        Fill in every default the caller did not pass, then defer to the parent initializer.

        Args:
            parent: Forwarded unchanged to `super().__init__`.
            **kwargs: Tester settings. Keys supplied by the caller are never overwritten.
        """
        # Number of image tokens implied by the (possibly defaulted) image and patch sizes.
        grid_side = kwargs.get("image_size", 8) // kwargs.get("patch_size", 4)
        fallback_image_tokens = grid_side**2

        # Overrides of _TEXT_MODEL_TESTER_DEFAULTS
        # Default sequence length is 7 plus the number of image tokens.
        kwargs.setdefault("seq_length", 7 + kwargs.get("num_image_tokens", fallback_image_tokens))
        kwargs.setdefault("pad_token_id", 0)

        # VLM-specific defaults
        vlm_defaults = (
            ("use_token_type_ids", False),
            ("hidden_dropout_prob", 0.1),
            ("attention_probs_dropout_prob", 0.1),
            ("type_vocab_size", 16),
            ("type_sequence_label_size", 2),
            ("initializer_range", 0.02),
            ("num_labels", 3),
            ("num_choices", 4),
            ("image_token_id", 3),
            ("is_decoder", False),
            ("image_size", 8),
            ("patch_size", 4),
            ("num_channels", 3),
            ("projection_dim", 32),
            ("projector_hidden_act", "gelu"),
            ("vision_feature_select_strategy", "default"),
            ("vision_feature_layer", -1),
            ("tie_word_embeddings", False),
            # Same value as (image_size // patch_size) ** 2 computed from the defaults above.
            ("num_image_tokens", fallback_image_tokens),
        )
        for option, default in vlm_defaults:
            kwargs.setdefault(option, default)

        super().__init__(parent, **kwargs)

        # Computed default depending on base-class defaults for hidden_size / num_attention_heads.
        if hasattr(self, "head_dim"):
            return
        self.head_dim = self.hidden_size // self.num_attention_heads

    # -- Overridable VLM-specific hooks ------------------------------------------------------

    def create_pixel_values(self):
        """Call `floats_tensor` for a `[batch_size, num_channels, image_size, image_size]` shape, `scale=1.0`."""
        # Override to 5D for patch-based models
        shape = [self.batch_size, self.num_channels, self.image_size, self.image_size]
        return floats_tensor(shape, scale=1.0)

    def place_image_tokens(self, input_ids, config):
        """
        Return a copy of `input_ids` whose first `num_image_tokens` columns hold the image token.

        The image token id is read from `config.image_token_id`, falling back to `self.image_token_id`.
        The tensor passed in is not modified.
        """
        # Override if the image tokens shouldn't be placed at the start of the test sequence
        image_token_id = getattr(config, "image_token_id", self.image_token_id)
        tokens = input_ids.clone()
        # Clear any accidental image tokens first
        stray_image_positions = tokens == image_token_id
        tokens[stray_image_positions] = self.bos_token_id
        # Place image tokens at the start
        tokens[:, : self.num_image_tokens] = image_token_id
        return tokens

    # -- Hooks consumed by the shared base ---------------------------------------------------

    @property
    def _special_token_ids(self):
        """The parent's special token ids together with `image_token_id`."""
        inherited = super()._special_token_ids
        return inherited | {self.image_token_id}

    def _build_modality_sub_configs(self):
        """Return the sub-config mapping, which holds a single `vision_config` entry."""
        return dict(vision_config=self.get_vision_config())

    def _prepare_modality_inputs(self, input_ids, config):
        """Return `input_ids` with image tokens placed, and a dict carrying freshly created `pixel_values`."""
        # Pixel values are created before the token placement, as in the original call order.
        images = self.create_pixel_values()
        tokens = self.place_image_tokens(input_ids, config)
        return tokens, {"pixel_values": images}

    # -- Vision sub-config construction ------------------------------------------------------

    @property
    def vision_config_args(self):
        """Parameter names of `vision_config_class.__init__`, in signature order."""
        init_signature = signature(self.vision_config_class.__init__)
        return list(init_signature.parameters)

    def get_vision_config(self):
        """Instantiate `vision_config_class` with the kwargs gathered by `_collect_kwargs`."""
        config_cls = self.vision_config_class
        init_kwargs = self._collect_kwargs(self.vision_config_args, config_cls)
        return config_cls(**init_kwargs)
|
|
|
|
|
|
class VLMModelTest(MultiModalModelTest):
    """
    Base test class for Vision-Language Models.

    Subclasses should set:
    - `model_tester_class`: The tester class (subclass of VLMModelTester)

    Optional:
    - `all_model_classes`: Override if not using default from model_tester
    - `pipeline_model_mapping`: Override if not using default from model_tester
    """

    def test_mismatching_num_image_tokens(self):
        """
        Tests that VLMs raise a `ValueError` when the number of images doesn't match
        the number of image tokens in the text.
        Also covers a simulated multi-image case where two prompts carry image tokens.

        Optional inputs whose value is `None` are passed through untouched instead of being sliced.
        """
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config).to(torch_device)
            model.eval()
            curr_input_dict = copy.deepcopy(input_dict)
            _ = model(**curr_input_dict)  # successful forward with no modifications

            # Test 1: remove one image but leave the image token in text
            curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
            # `.get(...) is not None` skips both a missing key and a key explicitly set to None
            if curr_input_dict.get("image_sizes") is not None:
                curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:, ...]
            with self.assertRaises(ValueError):
                _ = model(**curr_input_dict)

            # Test 2: simulate multi-image case by concatenating inputs where each has exactly one image/image-token
            # First, take just the first item from each tensor. None values cannot be sliced, so they are
            # kept as-is; this keeps the `is not None` guard in the doubling loop below meaningful.
            curr_input_dict = {key: (val if val is None else val[:1]) for key, val in curr_input_dict.items()}

            # Double the batch size for all batch-dimension tensors except pixel_values
            # This simulates having 2 prompts (each with image tokens) but only 1 image
            batch_tensors_to_double = ["input_ids", "attention_mask", "token_type_ids"]
            for key in batch_tensors_to_double:
                if curr_input_dict.get(key) is not None:
                    curr_input_dict[key] = torch.cat([curr_input_dict[key], curr_input_dict[key]], dim=0)

            # one image and two image tokens raise an error
            with self.assertRaises(ValueError):
                _ = model(**curr_input_dict)

            # Test 3: two images and two image tokens don't raise an error
            curr_input_dict["pixel_values"] = torch.cat(
                [curr_input_dict["pixel_values"], curr_input_dict["pixel_values"]], dim=0
            )
            if curr_input_dict.get("image_sizes") is not None:
                curr_input_dict["image_sizes"] = torch.cat(
                    [curr_input_dict["image_sizes"], curr_input_dict["image_sizes"]], dim=0
                )
            _ = model(**curr_input_dict)

    @unittest.skip(
        "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. "
        "Can be tested as part of LLM test"
    )
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        # Intentionally skipped for VLMs; see the skip reason above.
        pass
|