# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers import PeAudioConfig, PeAudioEncoderConfig
from transformers.audio_utils import load_audio
from transformers.testing_utils import (
    require_torch,
    require_torch_gpu,
    slow,
    torch_device,
)
from transformers.utils import is_torch_available

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
    ModelTesterMixin,
    floats_tensor,
    ids_tensor,
    random_attention_mask,
)


if is_torch_available():
    import torch

    from transformers import (
        ModernBertConfig,
        PeAudioEncoder,
        PeAudioFrameLevelModel,
        PeAudioModel,
    )


class PeAudioEncoderTester:
    """Builds a tiny `PeAudioEncoderConfig` plus random inputs for the `PeAudioEncoder` tests.

    Args:
        parent: Test case whose assertion helpers are used by `create_and_check_model`.
        config_kwargs: Keyword arguments forwarded to `PeAudioEncoderConfig`; every entry is also mirrored as an
            attribute of the tester. `None` (the default) selects a freshly built set of default settings.
        batch_size: First dimension of the generated `input_values`.
        num_channels: Second dimension of the generated `input_values`.
        audio_seq_length: Last dimension of the generated `input_values` and width of the padding mask.
        is_training: Stored on the tester unchanged.
    """

    @staticmethod
    def _default_config_kwargs():
        """Return a brand-new dict of default config settings (nested `dac_config` included)."""
        return {
            "dac_config": {
                "encoder_hidden_size": 16,
                "downsampling_ratios": [2, 4, 4],
                "decoder_hidden_size": 16,
                "n_codebooks": 6,
                "codebook_size": 512,
                "codebook_dim": 32,
                "quantizer_dropout": 0.0,
                "commitment_loss_weight": 0.25,
                "codebook_loss_weight": 1.0,
            },
            "hidden_size": 32,
            "intermediate_size": 37,
            "num_hidden_layers": 2,
            "num_attention_heads": 2,
            "num_key_value_heads": 2,
            "head_dim": 128,
            "hidden_act": "silu",
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "rms_norm_eps": 1e-5,
            "use_cache": True,
            "rope_theta": 20000,
            "rope_scaling": None,
            "attention_bias": False,
            "max_window_layers": 28,
            "attention_dropout": 0.0,
        }

    def __init__(
        self,
        parent,
        config_kwargs=None,
        batch_size=12,
        num_channels=1,
        audio_seq_length=160,
        is_training=True,
    ):
        self.parent = parent

        # A dict literal in the signature is created once and shared by every tester, so a mutation made through
        # one instance would leak into all later ones. Build the defaults per instance instead.
        if config_kwargs is None:
            config_kwargs = self._default_config_kwargs()
        self.config_kwargs = config_kwargs
        # Expose each config entry as an attribute (e.g. `self.hidden_size`).
        for key, value in config_kwargs.items():
            setattr(self, key, value)

        self.batch_size = batch_size
        self.num_channels = num_channels
        self.audio_seq_length = audio_seq_length
        self.is_training = is_training

    @property
    def seq_length(self):
        """Sequence length seen by the transformer for inputs of `audio_seq_length` samples."""
        config = self.get_config()
        # seq_length is what gets fed to the transformer
        # we first have to divide by hop_length to get the number of frames
        # then we add 1 because we add the class token
        return self.audio_seq_length // config.dac_config.hop_length + 1

    def prepare_config_and_inputs(self):
        """Return `(config, input_values, padding_mask)` with randomly generated inputs."""
        input_values = floats_tensor([self.batch_size, self.num_channels, self.audio_seq_length])
        # Shift the sampled lengths by one so that every sample keeps at least one valid position.
        valid_lengths = ids_tensor([self.batch_size], self.audio_seq_length - 1) + 1
        # Position `i` of a sample is marked valid (1) while `i < valid_length`, padded (0) afterwards.
        padding_mask = torch.arange(self.audio_seq_length, device=torch_device)[None, :] < valid_lengths[:, None]
        padding_mask = padding_mask.int()
        config = self.get_config()

        return config, input_values, padding_mask

    def get_config(self):
        """Return the `PeAudioEncoderConfig` for this tester, built on first use and then reused."""
        if not hasattr(self, "_config"):
            self._config = PeAudioEncoderConfig(**self.config_kwargs)
        return self._config

    def create_and_check_model(self, config, input_values, padding_mask):
        """Run a forward pass in eval mode and check the shape of `pooler_output`."""
        model = PeAudioEncoder(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(input_values, padding_mask=padding_mask)
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form expected by the common model tests."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, input_values, padding_mask = config_and_inputs
        inputs_dict = {"input_values": input_values, "padding_mask": padding_mask}
        return config, inputs_dict


@require_torch
class PeAudioEncoderTest(ModelTesterMixin, unittest.TestCase):
    """Runs the common `ModelTesterMixin` suite on `PeAudioEncoder`."""

    # `PeAudioEncoder` is only imported when torch is installed. Guard the reference so that merely importing this
    # module without torch does not raise a `NameError` before `require_torch` gets a chance to skip the class.
    all_model_classes = (PeAudioEncoder,) if is_torch_available() else ()
    test_resize_embeddings = False
    _is_composite = True

    def setUp(self):
        # A fresh tester pair is created for every test method.
        self.model_tester = PeAudioEncoderTester(self)
        self.config_tester = ConfigTester(
            self, config_class=PeAudioEncoderConfig, has_text_modality=False, hidden_size=37
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        # Forward pass on random inputs followed by a shape check on the pooled output.
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="PeAudioEncoder does not have usual input embeddings")
    def test_model_get_set_embeddings(self):
        pass

    @unittest.skip(reason="PeAudioEncoder does not support feed forward chunking")
    def test_feed_forward_chunking(self):
        pass

    @unittest.skip(reason="SDPA can't dispatch on flash with not None `attention_mask`")
    def test_sdpa_can_dispatch_on_flash(self):
        pass


class PeAudioTextModelTester:
    """
    Only a ModelTester and no PeAudioTextModelTest since text model is ModernBertModel that is already tested.

    Args:
        parent: Test case that owns this tester.
        config_kwargs: Keyword arguments forwarded to `ModernBertConfig`; every entry is also mirrored as an
            attribute of the tester. `None` (the default) selects a freshly built set of default settings.
        batch_size: First dimension of the generated `input_ids`.
        seq_length: Second dimension of the generated `input_ids`.
        is_training: Stored on the tester unchanged.
        use_input_mask: Whether `prepare_config_and_inputs` generates a random attention mask (otherwise `None`).
        use_labels: Stored on the tester unchanged.
    """

    @staticmethod
    def _default_config_kwargs():
        """Return a brand-new dict of default text-config settings."""
        return {
            "vocab_size": 99,
            "pad_token_id": 0,
            "hidden_size": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "intermediate_size": 37,
            "hidden_activation": "gelu",
            "mlp_dropout": 0.0,
            "attention_dropout": 0.0,
            "embedding_dropout": 0.0,
            "classifier_dropout": 0.0,
            "max_position_embeddings": 512,
            "type_vocab_size": 16,
            "is_decoder": False,
            "initializer_range": 0.02,
        }

    def __init__(
        self,
        parent,
        config_kwargs=None,
        batch_size=12,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_labels=True,  # TODO: to check
    ):
        self.parent = parent

        # A dict literal in the signature is created once and shared by every tester, so a mutation made through
        # one instance would leak into all later ones. Build the defaults per instance instead.
        if config_kwargs is None:
            config_kwargs = self._default_config_kwargs()
        self.config_kwargs = config_kwargs
        # Expose each config entry as an attribute (e.g. `self.vocab_size`).
        for key, value in config_kwargs.items():
            setattr(self, key, value)

        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, input_mask)`; `input_mask` is `None` when `use_input_mask` is off."""
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        config = self.get_config()

        return config, input_ids, input_mask

    def get_config(self):
        """Build a new `ModernBertConfig` from `config_kwargs`."""
        return ModernBertConfig(**self.config_kwargs)

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form expected by the common model tests."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, input_ids, input_mask = config_and_inputs
        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
        return config, inputs_dict


class PeAudioModelTester:
    """Combines the text and audio testers to produce configs and inputs for `PeAudioModel`."""

    def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True):
        self.parent = parent
        self.is_training = is_training

        # `None` means "let the sub-tester use its own defaults".
        text_tester_kwargs = {} if text_kwargs is None else text_kwargs
        audio_tester_kwargs = {} if audio_kwargs is None else audio_kwargs
        self.text_model_tester = PeAudioTextModelTester(parent, **text_tester_kwargs)
        self.audio_model_tester = PeAudioEncoderTester(parent, **audio_tester_kwargs)

        # need bs for batching_equivalence test
        self.batch_size = self.text_model_tester.batch_size

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, attention_mask, input_values, padding_mask)`."""
        text_outputs = self.text_model_tester.prepare_config_and_inputs()
        audio_outputs = self.audio_model_tester.prepare_config_and_inputs()

        # The sub-testers' own configs (first element) are dropped; the combined config comes from `get_config`.
        return (self.get_config(), *text_outputs[1:], *audio_outputs[1:])

    def get_config(self):
        """Build a `PeAudioConfig` from the dict form of the text and audio sub-configs."""
        sub_configs = {
            "text_config": self.text_model_tester.get_config().to_dict(),
            "audio_config": self.audio_model_tester.get_config().to_dict(),
        }
        return PeAudioConfig(projection_dim=32, **sub_configs)

    def create_and_check_model(self, config, input_ids, attention_mask, input_values, padding_mask):
        """Run one forward pass of `PeAudioModel` in eval mode; the output is not inspected yet."""
        model = PeAudioModel(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            model(input_ids, input_values, attention_mask, padding_mask)

        # TODO: there is no logits per audio for now
        # self.parent.assertEqual(result.logits_per_audio.shape, (self.audio_model_tester.batch_size, self.text_model_tester.batch_size))
        # self.parent.assertEqual(result.logits_per_text.shape, (self.text_model_tester.batch_size, self.audio_model_tester.batch_size))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form expected by the common model tests."""
        config, *tensors = self.prepare_config_and_inputs()
        input_names = ("input_ids", "attention_mask", "input_values", "padding_mask")
        return config, dict(zip(input_names, tensors))


@require_torch
class PeAudioModelTest(ModelTesterMixin, unittest.TestCase):
    """Runs the common `ModelTesterMixin` suite on `PeAudioModel`."""

    # TODO: add PipelineTesterMixin
    # `PeAudioModel` is only imported when torch is installed. Guard the reference so that merely importing this
    # module without torch does not raise a `NameError` before `require_torch` gets a chance to skip the class.
    all_model_classes = (PeAudioModel,) if is_torch_available() else ()
    additional_model_inputs = ["input_values", "padding_mask"]
    test_resize_embeddings = False
    has_attentions = False
    _is_composite = True

    def setUp(self):
        # A fresh tester pair is created for every test method.
        self.model_tester = PeAudioModelTester(self)
        self.config_tester = ConfigTester(
            self, config_class=PeAudioConfig, has_text_modality=False, common_properties=[], hidden_size=37
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        # Forward pass on random text + audio inputs; only checks that the call succeeds.
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="PeAudioModel does not have usual input embeddings")
    def test_model_get_set_embeddings(self):
        pass

    @unittest.skip(reason="Hidden_states is tested in individual model tests")
    def test_hidden_states_output(self):
        pass

    @unittest.skip(reason="Retain_grad is tested in individual model tests")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip(reason="PeAudioModel does not support feed forward chunking yet")
    def test_feed_forward_chunking(self):
        pass

    @unittest.skip(reason="PeAudioModel uses some timm stuff not compatible")
    def test_save_load(self):
        pass

    @unittest.skip(reason="@eustlb this is not really expected")
    def test_batching_equivalence(self):
        pass

    @unittest.skip(reason="@eustlb this is not really expected")
    def test_can_init_all_missing_weights(self):
        pass

    @require_torch_gpu  # pe-audio contains triton code which cannot run on CPU, so we only test on GPU
    def test_all_tensors_are_parameter_or_buffer(self):
        super().test_all_tensors_are_parameter_or_buffer()


@require_torch
class PeAudioIntegrationTest(unittest.TestCase):
    def setUp(self):
        self.checkpoint_name = "/raid/eustache/sam-audio/pe-a-frame-small"
        self.dtype = torch.float32

    @slow
    @unittest.skip(reason="TODO when released")
    def test_inference(self):
        checkpoint_name = "/raid/eustache/sam-audio/pe-av-small"
        descriptions = ["glass breaking", "somebody speaking"]
        audio_file = "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/glass_breaking.mp3"

        # processor = PeAudioProcessor.from_pretrained(checkpoint_name)
        model = PeAudioModel.from_pretrained(checkpoint_name, dtype=self.dtype, device_map=torch_device)

        inputs = self.processor(
            text=descriptions,
            audio=[load_audio(audio_file, self.processor.feature_extractor.sampling_rate)],
            return_tensors="pt",
            padding=True,
        )
        inputs = inputs.to(torch_device, dtype=self.dtype)
        model(**inputs)

    @slow
    @unittest.skip(reason="TODO when released")
    def test_inference_frame_level(self):
        checkpoint_name = "/raid/eustache/sam-audio/pe-a-frame-small"
        descriptions = ["glass breaking", "somebody speaking"]
        audio_file = "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/glass_breaking.mp3"

        # processor = PeAudioProcessor.from_pretrained(checkpoint_name)
        model = PeAudioFrameLevelModel.from_pretrained(checkpoint_name, dtype=self.dtype, device_map=torch_device)

        inputs = self.processor(
            text=descriptions,
            audio=[load_audio(audio_file, self.processor.feature_extractor.sampling_rate)],
            return_tensors="pt",
            padding=True,
        )
        inputs = inputs.to(torch_device, dtype=self.dtype)

        outputs = model(**inputs)
        #
        # TODO: this should be incorporated into the `forward` pass itself
        threshold = 0.3
        logits_per_audio = outputs.logits_per_audio
        probs_per_audio = logits_per_audio.sigmoid()
        preds = probs_per_audio > threshold

        # fmt: off
        EXPECTED = torch.tensor([
            [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True],
            [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True]
        ])
        # fmt: on
        torch.testing.assert_close(preds, EXPECTED)