transformers/tests/models/pe_video/test_modeling_pe_video.py

# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers import PeVideoConfig, PeVideoEncoderConfig
from transformers.testing_utils import (
    require_torch,
    slow,
    torch_device,
)
from transformers.utils import is_torch_available

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
    ModelTesterMixin,
    floats_tensor,
    ids_tensor,
    random_attention_mask,
    require_torch_gpu,
)


if is_torch_available():
    import torch

    from transformers import (
        ModernBertConfig,
        PeVideoEncoder,
        PeVideoModel,
    )


class PeVideoEncoderTester:
    """
    Builds a tiny `PeVideoEncoderConfig` together with random video inputs for the `PeVideoEncoder` tests.

    Args:
        parent: The `unittest.TestCase` whose assertion helpers are used by `create_and_check_model`.
        config_kwargs (`dict`, *optional*): Keyword arguments forwarded to `PeVideoEncoderConfig`. Every entry is
            also exposed as an attribute of the tester. When `None`, a fresh copy of the default tiny
            configuration is built for this instance.
        batch_size (`int`): Number of videos in the generated batch.
        num_frames (`int`): Number of frames per generated video.
        num_channels (`int`): Number of channels per generated frame.
        is_training (`bool`): Stored on the tester as `is_training`.
    """

    def __init__(
        self,
        parent,
        config_kwargs=None,
        batch_size=4,
        num_frames=8,
        num_channels=3,
        is_training=True,
    ):
        self.parent = parent

        # The defaults are built per instance: a dict literal in the signature would be evaluated once and then
        # shared (nested dicts included) by every tester, so a mutation in one test would leak into the others.
        if config_kwargs is None:
            config_kwargs = self._default_config_kwargs()
        self.config_kwargs = config_kwargs
        for key, value in config_kwargs.items():
            setattr(self, key, value)

        self.batch_size = batch_size
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.is_training = is_training

    @staticmethod
    def _default_config_kwargs():
        """Return a new dict holding the default tiny encoder configuration."""
        return {
            "vision_config": {
                "architecture": "vit_pe_core_large_patch14_336",
                "model_args": {
                    "embed_dim": 64,
                    "img_size": (14, 14),
                    "depth": 2,
                },
                "num_classes": 4,
            },
            "hidden_size": 32,
            "intermediate_size": 37,
            "num_hidden_layers": 2,
            "num_attention_heads": 2,
            "num_key_value_heads": 2,
            "head_dim": 16,
            "hidden_act": "silu",
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "rms_norm_eps": 1e-5,
            "use_cache": True,
            "rope_theta": 20000,
            "rope_scaling": None,
            "attention_bias": False,
            "max_window_layers": 28,
            "attention_dropout": 0.0,
        }

    @property
    def seq_length(self):
        # seq_length is what gets fed to the transformer
        # we add 1 because we add the class token
        return self.num_frames + 1

    def prepare_config_and_inputs(self):
        """
        Create a config and a random batch of videos with a matching padding mask.

        Returns:
            `tuple`: `(config, pixel_values_videos, padding_mask_videos)` where `pixel_values_videos` is requested
            with shape `(batch_size, num_frames, num_channels, img_size[0], img_size[1])` and
            `padding_mask_videos` is an integer mask of shape `(batch_size, num_frames)` holding 1 for the first
            `valid_length` frames of each sample and 0 afterwards.
        """
        pixel_values_videos = floats_tensor(
            [
                self.batch_size,
                self.num_frames,
                self.num_channels,
                self.config_kwargs["vision_config"]["model_args"]["img_size"][0],
                self.config_kwargs["vision_config"]["model_args"]["img_size"][1],
            ]
        )
        # Draw a per-sample number of valid frames; the `+ 1` guarantees at least one valid frame.
        # NOTE(review): the upper bound depends on how `ids_tensor` treats its second argument. If it is an
        # exclusive bound, lengths top out at `num_frames - 1` (never a fully valid video) - confirm intent.
        valid_lengths = ids_tensor([self.batch_size], self.num_frames - 1) + 1
        padding_mask_videos = torch.arange(self.num_frames, device=torch_device).unsqueeze(0) < valid_lengths[:, None]
        padding_mask_videos = padding_mask_videos.int()
        config = self.get_config()

        return config, pixel_values_videos, padding_mask_videos

    def get_config(self):
        """Instantiate a `PeVideoEncoderConfig` from the tester's `config_kwargs`."""
        return PeVideoEncoderConfig(**self.config_kwargs)

    def create_and_check_model(self, config, pixel_values_videos, padding_mask_videos):
        """Run a no-grad forward pass in eval mode and check the shape of `pooler_output`."""
        model = PeVideoEncoder(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values_videos, padding_mask_videos=padding_mask_videos)
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with the inputs keyed by name, as consumed by the common tests."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values_videos, padding_mask_videos = config_and_inputs
        inputs_dict = {"pixel_values_videos": pixel_values_videos, "padding_mask_videos": padding_mask_videos}
        return config, inputs_dict


@require_torch
class PeVideoEncoderTest(ModelTesterMixin, unittest.TestCase):
    """
    Runs the common `ModelTesterMixin` suite against `PeVideoEncoder`.

    Common tests that do not apply to this model are overridden below and skipped with an explicit reason.
    """

    all_model_classes = (PeVideoEncoder,)
    test_resize_embeddings = False
    _is_composite = True

    def setUp(self):
        self.model_tester = PeVideoEncoderTester(self)
        self.config_tester = ConfigTester(
            self, config_class=PeVideoEncoderConfig, has_text_modality=False, hidden_size=37
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="The model has TimmWrapper backbone but doesn't apply any conversion")
    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
        pass

    @unittest.skip(reason="Timm Eva (PE) weights cannot be fully constructed in _init_weights")
    def test_can_init_all_missing_weights(self):
        pass

    @unittest.skip(reason="PeVideoEncoder does not have usual input embeddings")
    def test_model_get_set_embeddings(self):
        pass

    @unittest.skip(reason="Cannot set `output_attentions` for timm models.")
    def test_attention_outputs(self):
        pass

    @unittest.skip(reason="TimmWrapperModel cannot be tested with meta device")
    def test_can_be_initialized_on_meta(self):
        pass

    @unittest.skip(reason="TimmWrapperModel cannot be tested with meta device")
    def test_can_load_with_meta_device_context_manager(self):
        pass

    @unittest.skip(reason="Cannot set `output_attentions` for timm models.")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip(reason="PeVideoEncoder does not support feedforward chunking yet")
    def test_feed_forward_chunking(self):
        pass

    # The reason previously named `PeAudioModel` (copy-paste leftover); this suite tests `PeVideoEncoder`.
    @unittest.skip(reason="PeVideoEncoder uses some timm stuff not compatible")
    def test_save_load(self):
        pass

    @unittest.skip(reason="@eustlb this is not really expected")
    def test_batching_equivalence(self):
        pass


class PeVideoTextModelTester:
    """
    Only a ModelTester and no PeVideoTextModelTest since text model is ModernBertModel that is already tested.

    Args:
        parent: The `unittest.TestCase` that owns this tester.
        config_kwargs (`dict`, *optional*): Keyword arguments forwarded to `ModernBertConfig`. Every entry is also
            exposed as an attribute of the tester. When `None`, a fresh copy of the default tiny configuration is
            built for this instance.
        batch_size (`int`): Number of text sequences in the generated batch.
        seq_length (`int`): Length of each generated text sequence.
        is_training (`bool`): Stored on the tester as `is_training`.
        use_input_mask (`bool`): Whether `prepare_config_and_inputs` generates a random attention mask.
        use_labels (`bool`): Stored on the tester as `use_labels`.
    """

    def __init__(
        self,
        parent,
        config_kwargs=None,
        batch_size=4,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_labels=True,
    ):
        self.parent = parent

        # The defaults are built per instance: a dict literal in the signature would be evaluated once and then
        # shared by every tester, so a mutation in one test would leak into the others.
        if config_kwargs is None:
            config_kwargs = self._default_config_kwargs()
        self.config_kwargs = config_kwargs
        for key, value in config_kwargs.items():
            setattr(self, key, value)

        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels

    @staticmethod
    def _default_config_kwargs():
        """Return a new dict holding the default tiny text configuration."""
        return {
            "vocab_size": 99,
            "pad_token_id": 0,
            "hidden_size": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "intermediate_size": 37,
            "hidden_activation": "gelu",
            "mlp_dropout": 0.0,
            "attention_dropout": 0.0,
            "embedding_dropout": 0.0,
            "classifier_dropout": 0.0,
            "max_position_embeddings": 512,
            "type_vocab_size": 16,
            "is_decoder": False,
            "initializer_range": 0.02,
        }

    def prepare_config_and_inputs(self):
        """
        Create a config and random text inputs.

        Returns:
            `tuple`: `(config, input_ids, input_mask)`; `input_ids` is requested with shape
            `(batch_size, seq_length)` and `input_mask` is `None` when `use_input_mask` is falsy.
        """
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        config = self.get_config()

        return config, input_ids, input_mask

    def get_config(self):
        """Instantiate a `ModernBertConfig` from the tester's `config_kwargs`."""
        return ModernBertConfig(**self.config_kwargs)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with the inputs keyed by name, as consumed by the common tests."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, input_ids, input_mask = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
        return config, inputs_dict


class PeVideoModelTester:
    """
    Combines `PeVideoTextModelTester` and `PeVideoEncoderTester` to build a `PeVideoConfig` and joint text/video
    inputs for `PeVideoModel`.
    """

    def __init__(self, parent, text_kwargs=None, video_kwargs=None, is_training=True):
        self.parent = parent
        self.is_training = is_training

        # `None` means "let the sub-tester use its own defaults"
        text_kwargs = {} if text_kwargs is None else text_kwargs
        video_kwargs = {} if video_kwargs is None else video_kwargs
        self.text_model_tester = PeVideoTextModelTester(parent, **text_kwargs)
        self.video_model_tester = PeVideoEncoderTester(parent, **video_kwargs)

        # need bs for batching_equivalence test
        self.batch_size = self.text_model_tester.batch_size

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, attention_mask, pixel_values_videos, padding_mask_videos)`."""
        # The configs produced by the sub-testers are dropped; only their inputs are reused.
        text_outputs = self.text_model_tester.prepare_config_and_inputs()
        video_outputs = self.video_model_tester.prepare_config_and_inputs()
        input_ids, attention_mask = text_outputs[1:]
        pixel_values_videos, padding_mask_videos = video_outputs[1:]

        return self.get_config(), input_ids, attention_mask, pixel_values_videos, padding_mask_videos

    def get_config(self):
        """Assemble a `PeVideoConfig` from the text and video sub-configs (passed as plain dicts)."""
        text_config_dict = self.text_model_tester.get_config().to_dict()
        video_config_dict = self.video_model_tester.get_config().to_dict()
        return PeVideoConfig(text_config=text_config_dict, video_config=video_config_dict, projection_dim=32)

    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values_videos, padding_mask_videos):
        """Run a no-grad forward pass in eval mode; the output is not inspected for now."""
        model = PeVideoModel(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            model(input_ids, pixel_values_videos, attention_mask, padding_mask_videos)

        # TODO: there is no logits per video for now
        # self.parent.assertEqual(result.logits_per_video.shape, (self.video_model_tester.batch_size, self.text_model_tester.batch_size))
        # self.parent.assertEqual(result.logits_per_text.shape, (self.text_model_tester.batch_size, self.video_model_tester.batch_size))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with the inputs keyed by name, as consumed by the common tests."""
        config, *tensors = self.prepare_config_and_inputs()
        input_names = ("input_ids", "attention_mask", "pixel_values_videos", "padding_mask_videos")
        return config, dict(zip(input_names, tensors))


@require_torch
class PeVideoModelTest(ModelTesterMixin, unittest.TestCase):
    """
    Runs the common `ModelTesterMixin` suite against `PeVideoModel`.

    Common tests that do not apply to this model are overridden below and skipped with an explicit reason.
    """

    # TODO: add PipelineTesterMixin
    all_model_classes = (PeVideoModel,)
    additional_model_inputs = ["pixel_values_videos", "padding_mask_videos"]
    test_resize_embeddings = False
    has_attentions = False
    _is_composite = True

    def setUp(self):
        self.model_tester = PeVideoModelTester(self)
        self.config_tester = ConfigTester(
            self,
            config_class=PeVideoConfig,
            has_text_modality=False,
            common_properties=[],
            hidden_size=37,
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        tester = self.model_tester
        tester.create_and_check_model(*tester.prepare_config_and_inputs())

    @unittest.skip(reason="The model has TimmWrapper backbone but doesn't apply any conversion")
    def test_reverse_loading_mapping(self, check_keys_were_modified=True):
        pass

    @unittest.skip(reason="PeVideoModel does not have usual input embeddings")
    def test_model_get_set_embeddings(self):
        pass

    @unittest.skip(
        reason="TimmWrapperForImageClassification does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet."
    )
    def test_can_set_attention_dynamically_composite_model(self):
        pass

    @unittest.skip(reason="Hidden_states is tested in individual model tests")
    def test_hidden_states_output(self):
        pass

    @unittest.skip(reason="Retain_grad is tested in individual model tests")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip(reason="PeVideoModel does not support feed forward chunking yet")
    def test_feed_forward_chunking(self):
        pass

    @unittest.skip(reason="#TODO @eustlb this should be fixed tho")
    def test_save_load(self):
        pass

    @unittest.skip(reason="@eustlb this is not really expected")
    def test_batching_equivalence(self):
        pass

    @unittest.skip(reason="@eustlb this is not really expected")
    def test_can_init_all_missing_weights(self):
        pass

    # pe-video contains triton code which cannot run on CPU, so we only test on GPU
    @require_torch_gpu
    def test_all_tensors_are_parameter_or_buffer(self):
        super().test_all_tensors_are_parameter_or_buffer()


@require_torch
class PeVideoIntegrationTest(unittest.TestCase):
    """Placeholder for integration tests of the PeVideo model; nothing is exercised yet."""

    @slow
    def test_inference(self):
        # TODO: Add integration test when pretrained model is available
        # Skip explicitly instead of passing vacuously, so slow-test reports show this placeholder as
        # "skipped" rather than counting it as a successful integration check.
        self.skipTest("Integration test not implemented yet: waiting for a pretrained model to be available")