first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/multimodal_tester.py
+++ b/tests/multimodal_tester.py
@@ -0,0 +1,254 @@
+# Copyright 2026 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from inspect import signature
+
+from transformers.testing_utils import _TEXT_MODEL_TESTER_DEFAULTS
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_common import (
+    GenerationTesterMixin,
+    ModelTesterMixin,
+    ids_tensor,
+    is_torch_available,
+    require_torch,
+    torch_device,
+)
+from .test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+
+class MultiModalModelTester:
+    """Shared tester base for VLM (vision-language) and ALM (audio-language) models.
+
+    Concrete subclasses (e.g. `VLMModelTester`, `ALMModelTester`) supply:
+      - the modality-specific sub-config class (`vision_config_class` for VLMs, `audio_config_class` for ALMs, ...),
+      - the modality-specific defaults and helper methods,
+      - the hooks `_build_modality_sub_configs` and `_prepare_modality_inputs`,
+      - optionally an extended `_special_token_ids` and `pipeline_model_mapping`.
+
+    This tester provides shared logic for evaluating and verifying models that combine text with other modalities,
+    centering on the needs of vision-language (VLM) and audio-language (ALM) models.
+    """
+
+    # If the model follows the standard naming conventions, only `base_model_class` needs to be set
+    # (the others are inferred from available public classes).
+    base_model_class = None
+    config_class = None
+    text_config_class = None
+    conditional_generation_class = None
+    sequence_classification_class = None
+
+    # Required attributes after the initialization phase of the tester. Subclasses extend.
+    _required_attributes = ("config_class", "text_config_class", "conditional_generation_class")
+
+    # Arguments that should be passed to the config class even if not in its signature
+    forced_config_args = ["pad_token_id"]
+
+    @property
+    def all_model_classes(self):
+        # Models that set `all_model_classes` in their `XXXModelTest` class must have a new class that doesn't fit
+        # any of the common classes.
+        return [
+            model_class
+            for model_class in (
+                self.base_model_class,
+                self.conditional_generation_class,
+                self.sequence_classification_class,
+            )
+            if model_class is not None
+        ]
+
+    def __init__(self, parent, **kwargs):
+        self.parent = parent
+
+        # Multimodal-specific overrides of shared defaults (applied before the shared
+        # defaults so they take precedence, but after any subclass setdefault calls).
+        kwargs.setdefault("batch_size", 3)
+        kwargs.setdefault("moe_intermediate_size", 12)
+
+        # Apply shared text-model defaults for anything not already set.
+        # Subclasses are expected to `setdefault` their modality-specific kwargs
+        # (and any differing values such as `pad_token_id`) *before* calling super.
+        for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items():
+            kwargs.setdefault(key, default)
+
+        kwargs.setdefault("ignore_index", -100)
+        kwargs.setdefault("scope", None)
+
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+        self._check_required_attributes()
+
+    def _check_required_attributes(self):
+        for required_attribute in self._required_attributes:
+            if getattr(self, required_attribute, None) is None:
+                raise ValueError(
+                    f"You have inherited from {type(self).__name__} but did not set the {required_attribute} attribute."
+                )
+
+    # -- Overridable modality hooks -----------------------------------------------------------
+
+    def create_attention_mask(self, input_ids):
+        """Default causal (lower-triangular) attention mask. Override for bidirectional models like Gemma3."""
+        return torch.tril(torch.ones_like(input_ids).to(torch_device))
+
+    def get_additional_inputs(self, config, input_ids, modality_inputs):
+        """Model-specific extra inputs (e.g. LlavaNext `image_sizes`, Qwen3VL `mm_token_type_ids`).
+
+        ``modality_inputs`` is the full dict returned by ``_prepare_modality_inputs``.
+        """
+        return {}
+
+    @property
+    def _special_token_ids(self):
+        """Special token ids that must never appear as random text tokens. Subclasses add modality tokens."""
+        return {self.pad_token_id, self.bos_token_id, self.eos_token_id}
+
+    def _build_modality_sub_configs(self):
+        """Return the {sub-config-key: sub-config-instance} entries for the main config constructor."""
+        raise NotImplementedError
+
+    def _prepare_modality_inputs(self, input_ids, config):
+        """Create modality features, place modality placeholder tokens in ``input_ids``, and return:
+
+        (input_ids_with_placeholders, modality_inputs_dict)
+        """
+        raise NotImplementedError
+
+    # -- End of overridable hooks -------------------------------------------------------------
+
+    def _safe_token_id(self):
+        """Smallest token ID that is not a special token. Used to scrub random ids_tensor outputs."""
+        special_tokens = self._special_token_ids
+        for i in range(self.vocab_size):
+            if i not in special_tokens:
+                return i
+        raise ValueError("vocab_size is too small and there is no token ID that is not a special token!")
+
+    def prepare_config_and_inputs_for_common(self):
+        config = self.get_config()
+
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        # Avoid flaky tests by scrubbing any accidental special tokens produced by ids_tensor.
+        # Modality placeholder tokens are scrubbed and placed by `_prepare_modality_inputs`.
+        safe_token_id = self._safe_token_id()
+        for token_id in self._special_token_ids:
+            input_ids[input_ids == token_id] = safe_token_id
+
+        input_ids, modality_inputs = self._prepare_modality_inputs(input_ids, config)
+
+        # Create attention mask with final input_ids (after modality placeholders are placed) — important
+        # for models that derive padding from token values.
+        attention_mask = self.create_attention_mask(input_ids) if self.use_input_mask else None
+
+        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
+        inputs_dict.update(modality_inputs)
+        inputs_dict.update(self.get_additional_inputs(config, input_ids, modality_inputs))
+        return config, inputs_dict
+
+    # -- Config construction helpers ----------------------------------------------------------
+
+    @property
+    def config_args(self):
+        return list(signature(self.config_class.__init__).parameters.keys())
+
+    @property
+    def text_config_args(self):
+        args = list(signature(self.text_config_class.__init__).parameters.keys())
+        for token_arg in ["pad_token_id", "bos_token_id", "eos_token_id"]:  # Not always explicitly in the sig
+            if token_arg not in args:
+                args.append(token_arg)
+        return args
+
+    def _collect_kwargs(self, sig_keys, config_class):
+        """Collect kwargs for ``config_class`` by matching ``sig_keys`` (and its ``attribute_map``) against ``self``."""
+        attribute_map = getattr(config_class, "attribute_map", {})
+        model_name_to_common_name = {v: k for k, v in attribute_map.items()}
+        kwargs = {}
+        for k in sig_keys:
+            if hasattr(self, k) and k != "self":
+                kwargs[k] = getattr(self, k)
+            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
+                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        return kwargs
+
+    def get_config(self):
+        kwargs = self._collect_kwargs(self.config_args + self.forced_config_args, self.config_class)
+        kwargs["text_config"] = self.get_text_config()
+        kwargs.update(self._build_modality_sub_configs())
+        return self.config_class(**kwargs)
+
+    def get_text_config(self):
+        kwargs = self._collect_kwargs(self.text_config_args, self.text_config_class)
+        return self.text_config_class(**kwargs)
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = self.base_model_class(config=config)
+        model.to(torch_device)
+        model.eval()
+        model(input_ids, attention_mask=input_mask)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+
+@require_torch
+class MultiModalModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin):
+    """Shared test-class base for multimodal model families.
+
+    Subclasses must set:
+      - ``model_tester_class``: The tester class (subclass of ``MultiModalModelTester``)
+
+    Optional:
+      - ``all_model_classes``: override if not using the default from the model tester
+      - ``pipeline_model_mapping``: override if not using the default from the model tester
+    """
+
+    model_tester_class = None
+    all_model_classes = None
+    pipeline_model_mapping = None
+
+    # Multimodal models are always composite
+    _is_composite = True
+
+    def setUp(self):
+        if self.model_tester_class is None:
+            raise ValueError(
+                f"You have inherited from {type(self).__name__} but did not set the model_tester_class attribute."
+            )
+        self.model_tester = self.model_tester_class(self)
+        self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, has_text_modality=False)
+
+        if self.pipeline_model_mapping is None:
+            if self.all_model_classes is not None:
+                raise ValueError(
+                    f"Tests that inherit from `{type(self).__name__}` and set `all_model_classes` must manually set "
+                    "`pipeline_model_mapping`."
+                )
+            else:
+                self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping
+
+        if self.all_model_classes is None:
+            self.all_model_classes = self.model_tester.all_model_classes
+
+    def test_config(self):
+        """Test config common functionality."""
+        self.config_tester.run_common_tests()