# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch ColModernVBert model."""

import unittest
from typing import ClassVar

from huggingface_hub import hf_hub_download
from PIL import Image

from tests.test_configuration_common import ConfigTester
from tests.test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from transformers import (
    is_torch_available,
)
from transformers.models.colmodernvbert.configuration_colmodernvbert import ColModernVBertConfig
from transformers.models.colmodernvbert.modeling_colmodernvbert import (
    ColModernVBertForRetrieval,
    ColModernVBertForRetrievalOutput,
)
from transformers.models.colmodernvbert.processing_colmodernvbert import ColModernVBertProcessor
from transformers.testing_utils import (
    cleanup,
    require_torch,
    require_vision,
    slow,
    torch_device,
)


if is_torch_available():
    import torch


class ColModernVBertForRetrievalModelTester:
    """
    Helper that builds a small `ColModernVBertConfig` and matching random inputs for the model tests.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        num_images=2,
        seq_length=7,
        ignore_index=-100,
        text_config=None,
        is_training=False,
        vision_config=None,
        pixel_shuffle_factor=2,
        embedding_dim=64,
    ):
        """
        Store the test hyper-parameters and derive the sequence lengths used to build inputs.

        Args:
            parent: The test case that owns this tester.
            batch_size: Number of samples in the generated batch.
            num_images: Number of images per sample.
            seq_length: Number of text tokens; the image tokens are added on top of it.
            ignore_index: Stored as `self.ignore_index`.
            text_config: Text backbone settings; a small default dict is used when `None`.
            is_training: Stored as `self.is_training`.
            vision_config: Vision backbone settings; a small default dict is used when `None`.
            pixel_shuffle_factor: Factor used to derive the number of image tokens per image.
            embedding_dim: Value forwarded to `ColModernVBertConfig(embedding_dim=...)`.
        """
        if text_config is None:
            text_config = {
                "vocab_size": 99,
                "pad_token_id": 0,
                "hidden_size": 32,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "intermediate_size": 64,
                "hidden_activation": "gelu",
                "mlp_dropout": 0.1,
                "embedding_dropout": 0.1,
                "classifier_dropout": 0.1,
                "max_position_embeddings": 512,
                "type_vocab_size": 2,
                "is_decoder": False,
                "initializer_range": 0.02,
                "reference_compile": False,
            }

        if vision_config is None:
            vision_config = {
                "image_size": 16,
                "patch_size": 4,
                "hidden_size": 64,
                "num_hidden_layers": 2,
                "num_attention_heads": 4,
                "intermediate_size": 32,
                "dropout": 0.1,
                "attention_dropout": 0.1,
                "initializer_range": 0.02,
                "vision_use_head": False,
            }

        self.is_training = is_training
        self.parent = parent
        self.batch_size = batch_size
        self.text_config = text_config
        self.vision_config = vision_config
        self.num_images = num_images
        self.image_size = vision_config["image_size"]
        self.pixel_shuffle_factor = pixel_shuffle_factor
        # The last vocabulary id is reserved as the image placeholder token.
        self.image_token_id = self.text_config["vocab_size"] - 1
        self.pad_token_id = text_config["pad_token_id"]
        # Patches per image, reduced by the squared pixel-shuffle factor, times the images per sample.
        self.image_seq_length = (
            int(((vision_config["image_size"] // vision_config["patch_size"]) ** 2) / (pixel_shuffle_factor**2))
            * self.num_images
        )
        self.seq_length = seq_length + self.image_seq_length
        self.hidden_size = text_config["hidden_size"]
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.ignore_index = ignore_index
        self.embedding_dim = embedding_dim
        self.vlm_config = {
            "model_type": "modernvbert",
            "text_config": self.text_config,
            "vision_config": self.vision_config,
            "image_token_id": self.image_token_id,
            "pixel_shuffle_factor": self.pixel_shuffle_factor,
        }

    def get_config(self):
        """Return a `ColModernVBertConfig` built from the stored VLM settings and embedding dimension."""
        config = ColModernVBertConfig(
            vlm_config=self.vlm_config,
            embedding_dim=self.embedding_dim,
        )
        return config

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values)` with random pixel values for `batch_size` x `num_images` images."""
        pixel_values = floats_tensor([self.batch_size, self.num_images, 3, self.image_size, self.image_size])
        config = self.get_config()

        return config, pixel_values

    def prepare_config_and_inputs_for_common(self):
        """
        Return `(config, inputs_dict)` where `inputs_dict` holds `pixel_values`, `input_ids` and
        `attention_mask`.
        """
        config, pixel_values = self.prepare_config_and_inputs()
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.vlm_config.text_config.vocab_size)

        # For simplicity just set the first n tokens to the image token
        input_ids[input_ids == self.image_token_id] = self.pad_token_id
        input_ids[:, : self.image_seq_length] = self.image_token_id
        # NOTE(review): the mask hides token id 1 while `pad_token_id` is 0 in the default text config;
        # confirm whether `self.pad_token_id` was intended here.
        attention_mask = input_ids.ne(1).to(torch_device)
        inputs_dict = {
            "pixel_values": pixel_values,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict


@require_torch
class ColModernVBertForRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
    """
    Model tester for `ColModernVBertForRetrieval`.
    """

    all_model_classes = (ColModernVBertForRetrieval,) if is_torch_available() else ()
    test_resize_embeddings = True
    test_missing_keys = False
    model_split_percents = [0.5, 0.8, 0.9]

    def setUp(self):
        """Create the model tester and the config tester used by the test methods."""
        self.model_tester = ColModernVBertForRetrievalModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ColModernVBertConfig, has_text_modality=False)

    @require_vision
    def test_colmodernvbert_forward_inputs(self):
        """Check that a forward pass on the prepared inputs returns a `ColModernVBertForRetrievalOutput`."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            inputs = self._prepare_for_class(inputs_dict, model_class)

            with torch.no_grad():
                outputs = model(**inputs, return_dict=True)

            self.assertIsInstance(outputs, ColModernVBertForRetrievalOutput)

    @unittest.skip(reason="Error related to ModernBERT model parallelism: self.dtype is broken.")
    def test_multi_gpu_data_parallel_forward(self):
        pass


@require_torch
class ColModernVBertModelIntegrationTest(unittest.TestCase):
    """Integration test that runs the pretrained checkpoint named by `model_name`."""

    model_name: ClassVar[str] = "paultltc/colmodernvbert_hf"

    def setUp(self):
        """Load the processor and the model (float32, eval mode) onto `torch_device`."""
        self.model_dtype = torch.float32

        self.processor = ColModernVBertProcessor.from_pretrained(self.model_name)
        self.model = (
            ColModernVBertForRetrieval.from_pretrained(
                self.model_name,
                dtype=self.model_dtype,
            )
            .to(torch_device)
            .eval()
        )

    def tearDown(self):
        """Release device memory held by the test."""
        cleanup(torch_device, gc_collect=True)

    @slow
    def test_model_integration_test(self):
        """
        Test if the model is able to retrieve the correct pages for a small and easy dataset.
        """
        # Load the test dataset
        queries = [
            "A paint on the wall",
            "ColModernVBERT matches the performance of models nearly 10x larger on visual document benchmarks.",
        ]
        images = [
            Image.open(hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/rococo.jpg", repo_type="space")),
            Image.open(hf_hub_download("ModernVBERT/colmodernvbert", "table.png", repo_type="model")),
        ]
        for image in images:
            # Close the image files once the test is over, whether it passes or fails.
            self.addCleanup(image.close)

        # Preprocess the examples
        batch_queries = self.processor.process_queries(text=queries).to(torch_device)
        batch_images = self.processor.process_images(images=images).to(torch_device)

        # Run inference
        with torch.inference_mode():
            image_embeddings = self.model(**batch_images).embeddings
            query_embeddings = self.model(**batch_queries).embeddings

        # Compute retrieval scores
        scores = self.processor.score_retrieval(
            query_embeddings=query_embeddings,
            passage_embeddings=image_embeddings,
        )  # (num_queries, num_passages)
        scores = torch.softmax(scores, dim=-1)

        expected_shape = (len(queries), len(images))
        self.assertEqual(scores.ndim, 2, f"Expected 2D tensor, got {scores.ndim}")
        # The message is passed as the `msg` argument so it is reported when the check fails.
        self.assertTrue(
            scores.shape == expected_shape,
            f"Expected shape {expected_shape}, got {scores.shape}",
        )

        # Check if the maximum scores per row are in the diagonal of the matrix score
        self.assertTrue((scores.argmax(axis=1) == torch.arange(len(queries), device=scores.device)).all())

        # Further validation: fine-grained check, with a hardcoded score from the original implementation
        expected_scores = torch.tensor(
            [[0.95181, 0.048189], [0.00057251, 0.99943]],
            dtype=scores.dtype,
            device=scores.device,
        )
        self.assertTrue(
            torch.allclose(scores, expected_scores, atol=1e-2),
            f"Expected scores {expected_scores}, got {scores}",
        )