# Copyright 2024-2025 NVIDIA Corporation and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch NemotronH model."""

import tempfile
import unittest

import pytest
from huggingface_hub.errors import StrictDataclassClassValidationError

from transformers import AutoTokenizer, NemotronHConfig, NemotronHForCausalLM, is_torch_available
from transformers.testing_utils import (
    require_bitsandbytes,
    require_flash_attn,
    require_torch,
    require_torch_accelerator,
    slow,
    torch_device,
)
from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch

    from transformers import DynamicCache, NemotronHForCausalLM, NemotronHModel


class NemotronHModelTester:
    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=32,
        layers_block_type=None,
        num_attention_heads=4,
        num_key_value_heads=2,
        head_dim=32,
        intermediate_size=40,
        moe_intermediate_size=40,
        moe_shared_expert_intermediate_size=40,
        mlp_hidden_act="relu2",
        mamba_hidden_act="silu",
        max_position_embeddings=512,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        num_choices=4,
        # Mamba-specific params
        ssm_state_size=16,
        mamba_num_heads=8,
        mamba_n_groups=8,
        mamba_head_dim=16,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_chunk_size=64,
        # MoE params
        n_routed_experts=8,
        n_shared_experts=1,
        num_experts_per_tok=2,
    ):
        """
        Store the hyper-parameters used to build small NemotronH configs and random test inputs.

        Args:
            parent: Object whose assertion/skip helpers are called by the `create_and_check_*` methods.
            layers_block_type: Sequence of per-layer block type names. When `None` (the default),
                `["mamba", "moe", "mamba", "attention", "moe"]` is used. The value is copied into a fresh
                list, so instances never share state with each other or with the caller.

        Every other argument is stored unchanged as an attribute of the same name. `num_hidden_layers`
        is not an argument: it is set to the length of `layers_block_type`.
        """
        # A list literal in the signature would be one object shared by every instance; build it per call.
        if layers_block_type is None:
            layers_block_type = ["mamba", "moe", "mamba", "attention", "moe"]

        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.layers_block_type = list(layers_block_type)
        # num_hidden_layers is derived from the layers_block_type length
        self.num_hidden_layers = len(self.layers_block_type)
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
        self.mlp_hidden_act = mlp_hidden_act
        self.mamba_hidden_act = mamba_hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.num_choices = num_choices

        # Mamba params
        self.ssm_state_size = ssm_state_size
        self.mamba_num_heads = mamba_num_heads
        self.mamba_n_groups = mamba_n_groups
        self.mamba_head_dim = mamba_head_dim
        self.mamba_d_conv = mamba_d_conv
        self.mamba_expand = mamba_expand
        self.mamba_chunk_size = mamba_chunk_size

        # MoE params
        self.n_routed_experts = n_routed_experts
        self.n_shared_experts = n_shared_experts
        self.num_experts_per_tok = num_experts_per_tok

    def prepare_config_and_inputs(self):
        """
        Create a config together with random input ids, an optional attention mask and optional labels.

        Returns:
            A 6-tuple `(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels)`.
            `input_mask` is `None` when `use_input_mask` is false; the three label entries are `None`
            when `use_labels` is false.
        """
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_mask = random_attention_mask([self.batch_size, self.seq_length]) if self.use_input_mask else None

        sequence_labels = token_labels = choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        # The config is built last, after all random tensors have been drawn.
        return self.get_config(), input_ids, input_mask, sequence_labels, token_labels, choice_labels

    def get_config(self):
        """
        Build a `NemotronHConfig` from the attributes stored on this tester.

        `is_decoder` is always `True` and `use_mamba_kernels` is always `False`; every other keyword is
        read from the tester attribute of the same name.
        """
        config_kwargs = {}

        # Keywords are inserted in the same order as they would be written in a direct call.
        for name in (
            "vocab_size",
            "hidden_size",
            "layers_block_type",
            "num_attention_heads",
            "num_key_value_heads",
            "head_dim",
            "intermediate_size",
            "moe_intermediate_size",
            "moe_shared_expert_intermediate_size",
            "mlp_hidden_act",
            "mamba_hidden_act",
            "max_position_embeddings",
        ):
            config_kwargs[name] = getattr(self, name)

        config_kwargs["is_decoder"] = True
        config_kwargs["initializer_range"] = self.initializer_range
        config_kwargs["use_mamba_kernels"] = False

        for name in (
            "ssm_state_size",
            "mamba_num_heads",
            "mamba_n_groups",
            "mamba_head_dim",
            "mamba_d_conv",
            "mamba_expand",
            "mamba_chunk_size",
            "n_routed_experts",
            "n_shared_experts",
            "num_experts_per_tok",
        ):
            config_kwargs[name] = getattr(self, name)

        return NemotronHConfig(**config_kwargs)

    def prepare_config_and_inputs_for_decoder(self):
        """
        Same as `prepare_config_and_inputs`, but with `config.is_decoder` explicitly set to `True`.

        Returns:
            The 6-tuple `(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels)`.
        """
        prepared = self.prepare_config_and_inputs()
        config, input_ids, input_mask, sequence_labels, token_labels, choice_labels = prepared

        config.is_decoder = True

        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels

    def create_and_check_model(self, config, input_ids, input_mask, _sequence_labels, _token_labels, _choice_labels):
        """
        Run `NemotronHModel` with and without an attention mask and check the hidden-state shape.

        Only the output of the unmasked call is inspected: `last_hidden_state` must have shape
        `(batch_size, seq_length, hidden_size)`. The label arguments are unused.
        """
        model = NemotronHModel(config=config)
        model.to(torch_device)
        model.eval()

        # The masked call only needs to run without raising; its output is discarded.
        model(input_ids, attention_mask=input_mask)
        result = model(input_ids)

        expected_shape = (self.batch_size, self.seq_length, self.hidden_size)
        self.parent.assertEqual(result.last_hidden_state.shape, expected_shape)

    def create_and_check_for_causal_lm(
        self,
        config,
        input_ids,
        input_mask,
        _sequence_labels,
        token_labels,
        _choice_labels,
    ):
        """
        Run `NemotronHForCausalLM` with every mask/labels combination and check the logits shape.

        The four calls are made in order (mask + labels, mask only, labels only, neither); only the
        result of the last call is checked against `(batch_size, seq_length, vocab_size)`.
        """
        model = NemotronHForCausalLM(config=config)
        model.to(torch_device)
        model.eval()

        call_variants = (
            {"attention_mask": input_mask, "labels": token_labels},
            {"attention_mask": input_mask},
            {"labels": token_labels},
            {},
        )
        for extra_kwargs in call_variants:
            result = model(input_ids, **extra_kwargs)

        expected_shape = (self.batch_size, self.seq_length, self.vocab_size)
        self.parent.assertEqual(result.logits.shape, expected_shape)

    def create_and_check_decoder_model_past_large_inputs(
        self,
        config,
        input_ids,
        input_mask,
        _sequence_labels,
        _token_labels,
        _choice_labels,
    ):
        """
        Check that a cached single-token step agrees with a full forward pass over the extended sequence.

        A first pass with `use_cache=True` produces `past_key_values`. One random token (and mask bit) per
        sequence is then appended, and the first entry of `hidden_states` at the last position is compared
        between the full pass and the cached pass on a randomly chosen hidden index (`atol=1e-3`).
        """
        config.is_decoder = True
        config.add_cross_attention = False
        model = NemotronHForCausalLM(config=config)
        model.to(torch_device)
        model.eval()

        # First forward pass: fills the cache for the original inputs.
        prefill_outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
        past_key_values = prefill_outputs.past_key_values

        # Draw one hypothetical next token and one mask value per sequence.
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
        next_mask = ids_tensor((self.batch_size, 1), vocab_size=2)

        # Extend the original ids and mask with the new token.
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)

        full_outputs = model(
            next_input_ids,
            attention_mask=next_attention_mask,
            output_hidden_states=True,
        )
        output_from_no_past = full_outputs["hidden_states"][0]

        cached_outputs = model(
            next_tokens,
            attention_mask=next_attention_mask,
            past_key_values=past_key_values,
            output_hidden_states=True,
        )
        output_from_past = cached_outputs["hidden_states"][0]

        # Compare a single, randomly selected hidden index at the last position.
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1:, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])

        # The two slices must match within tolerance.
        slices_match = torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
        self.parent.assertTrue(slices_match)

    def create_and_check_mamba2_slow_vs_fast_forward(self, config, input_ids, *args):
        """
        Test that cuda_kernels_forward and torch_forward produce consistent outputs.
        This ensures that the optimized CUDA kernel path and the pure PyTorch path
        are equivalent.

        The test is skipped (through `self.parent.skipTest`) when the `mamba_ssm` / `causal_conv1d`
        packages are unavailable, when `torch_device` is not `"cuda"`, or when `config.layers_block_type`
        contains no `"mamba"` entry. All skip conditions are checked before the model is instantiated so
        that a skipped run does not pay for building it.

        Args:
            config: Model configuration used to build a `NemotronHModel`.
            input_ids: Token ids fed to the embedding layer.
            *args: Remaining prepared inputs; unused.
        """
        if not (is_mamba_ssm_available() and is_causal_conv1d_available()):
            self.parent.skipTest(
                "This test needs the Mamba2 fast path. Skipping as the necessary packages have not been found."
            )
        if torch_device != "cuda":
            self.parent.skipTest("This test needs the Mamba2 fast path. Skipping as we need a cuda capable device.")

        # Index of the first mamba layer, or None when the config has none.
        mamba_layer_idx = next(
            (idx for idx, layer_type in enumerate(config.layers_block_type) if layer_type == "mamba"),
            None,
        )
        if mamba_layer_idx is None:
            self.parent.skipTest("No mamba layer found in the model configuration.")

        model = NemotronHModel(config)
        model.eval()
        model.to(torch_device)

        # Get embeddings
        token_emb = model.embeddings(input_ids.to(torch_device))

        # Get the mamba mixer from the first mamba block
        mamba_mixer = model.layers[mamba_layer_idx].mixer

        # Test without cache
        outputs_fast = mamba_mixer.cuda_kernels_forward(token_emb)
        outputs_slow = mamba_mixer.torch_forward(token_emb)

        self.parent.assertTrue(torch.allclose(outputs_fast, outputs_slow, atol=1e-3, rtol=1e-3))

        # Test with cache; each path gets its own fresh cache so neither sees the other's state.
        cache_params = DynamicCache(config=config)
        outputs_fast_cached = mamba_mixer.cuda_kernels_forward(token_emb, cache_params=cache_params)

        cache_params_slow = DynamicCache(config=config)
        outputs_slow_cached = mamba_mixer.torch_forward(token_emb, cache_params=cache_params_slow)

        self.parent.assertTrue(torch.allclose(outputs_fast_cached, outputs_slow_cached, atol=1e-3, rtol=1e-3))

    def prepare_config_and_inputs_for_common(self):
        """
        Return `(config, inputs_dict)` for the shared test mixins.

        `inputs_dict` holds only `"input_ids"` and `"attention_mask"`; the label tensors produced by
        `prepare_config_and_inputs` are dropped.
        """
        config, input_ids, input_mask, _, _, _ = self.prepare_config_and_inputs()
        return config, {"input_ids": input_ids, "attention_mask": input_mask}


@require_torch
class NemotronHModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
    # Model classes iterated by the tests of this class; empty when torch is not available.
    all_model_classes = (NemotronHModel, NemotronHForCausalLM) if is_torch_available() else ()
    # Pipeline task name -> model class; empty when torch is not available.
    pipeline_model_mapping = (
        {"feature-extraction": NemotronHModel, "text-generation": NemotronHForCausalLM}
        if is_torch_available()
        else {}
    )

    def _get_conv_state_shape(self, batch_size: int, config):
        intermediate_size = config.mamba_num_heads * config.mamba_head_dim
        conv_shape = (
            batch_size,
            intermediate_size + 2 * config.n_groups * config.ssm_state_size,
            config.conv_kernel,
        )
        return conv_shape

    def _get_recurrent_state_shape(self, batch_size: int, config):
        return (batch_size, config.mamba_num_heads, config.mamba_head_dim, config.ssm_state_size)

    def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_length, config):
        """
        Check the per-layer cache contents of a `DynamicCache` returned by generation.

        Attention layers must hold keys/values of shape `(batch, kv_heads, seq_length, head_dim)`, mamba
        layers must hold conv/recurrent states of the shapes given by `_get_conv_state_shape` and
        `_get_recurrent_state_shape`, and moe layers must hold `None` for both states.

        Raises:
            ValueError: If `past_key_values` is not a `DynamicCache` or a layer type is not recognised.
        """
        # Raise a useful error, asking to explicitly override the method
        if not isinstance(past_key_values, DynamicCache):
            raise ValueError("The cache does not use the correct Cache")

        # Use the correct config
        text_config = config.get_text_config(decoder=True)

        # (batch, kv heads, seq_length, head_dim)
        # Only pure mamba models do not have num_attention_heads defined in config, so it can never be 1 in
        # practice for attention models
        num_attention_heads = getattr(text_config, "num_attention_heads", 1)
        num_kv_heads = getattr(text_config, "num_key_value_heads", num_attention_heads)
        hidden_size = getattr(text_config, "d_model", text_config.hidden_size)
        head_dim = getattr(text_config, "head_dim", hidden_size // num_attention_heads)

        expected_attention_shape = (batch_size, num_kv_heads, seq_length, head_dim)
        expected_conv_shape = self._get_conv_state_shape(batch_size, text_config)
        expected_recurrent_shape = self._get_recurrent_state_shape(batch_size, text_config)

        # Check each layer has the correct shape
        for layer_cache, layer_type in zip(past_key_values.layers, text_config.layer_types):
            if layer_type == "attention":
                self.assertEqual(layer_cache.keys.shape, expected_attention_shape)
                self.assertEqual(layer_cache.values.shape, expected_attention_shape)
            elif layer_type == "mamba":
                self.assertEqual(layer_cache.conv_states.shape, expected_conv_shape)
                self.assertEqual(layer_cache.recurrent_states.shape, expected_recurrent_shape)
            elif layer_type == "moe":
                # Moe layers have a default mamba cache instantiated, but it stays empty as the layer does
                # not use it
                self.assertEqual(layer_cache.conv_states, None)
                self.assertEqual(layer_cache.recurrent_states, None)
            else:
                raise ValueError("Unknown layer type.")

    def setUp(self):
        """Create the model/config testers and switch torch to deterministic settings for the test."""
        self.model_tester = NemotronHModelTester(self)
        self.config_tester = ConfigTester(
            self, config_class=NemotronHConfig, common_properties=["hidden_size", "num_attention_heads"]
        )

        cudnn = torch.backends.cudnn

        # Remember the current global settings so tearDown can put them back.
        self._original_deterministic = torch.are_deterministic_algorithms_enabled()
        self._original_cudnn_deterministic = cudnn.deterministic
        self._original_cudnn_benchmark = cudnn.benchmark

        # Deterministic settings for NemotronH tests.
        torch.use_deterministic_algorithms(True)
        cudnn.deterministic = True
        cudnn.benchmark = False

    def tearDown(self):
        # Restore original settings
        torch.use_deterministic_algorithms(self._original_deterministic)
        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic
        torch.backends.cudnn.benchmark = self._original_cudnn_benchmark

    @unittest.skip(reason="NemotronH needs at least 3 layers to test (mamba, moe, attention)")
    def test_num_layers_is_small(self):
        pass

    @unittest.skip("position_ids cannot be used to pad due to Mamba2 layers")
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass

    @unittest.skip(reason="NemotronH has hybrid cache.")
    def test_generate_continue_from_inputs_embeds(self):
        pass

    @unittest.skip(reason="A large nemotron3 would be necessary (and costly) for that")
    def test_multi_gpu_data_parallel_forward(self):
        pass

    def test_reverse_loading_mapping(self):
        """Run the inherited `test_reverse_loading_mapping` with `skip_base_model=True`."""
        # NOTE(review): presumably the base-model variant of the check does not apply to NemotronH — confirm.
        super().test_reverse_loading_mapping(skip_base_model=True)

    # TODO(liding): re-enable `test_config` once the following three checks from
    # test_configuration_common.py stop failing:
    #   - create_and_test_config_to_json_file
    #   - create_and_test_config_from_and_save_pretrained
    #   - create_and_test_config_from_and_save_pretrained_composite
    # def test_config(self):
    #     self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_causal_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)

    def test_decoder_model_past_with_large_inputs(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)

    def test_mamba2_slow_vs_fast_forward(self):
        """
        Test that cuda_kernels_forward and torch_forward produce consistent outputs.
        """
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs)

    def test_attention_outputs(self):
        r"""
        Check the number and shape of the attention maps returned by each model class.

        Overrides the common test because NemotronH outputs attentions only for its attention layers: the
        expected count is the number of `*` characters in `config.hybrid_override_pattern`.
        """
        tester = self.model_tester
        config, inputs_dict = tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        seq_len = getattr(tester, "seq_length", None)
        encoder_seq_length = getattr(tester, "encoder_seq_length", seq_len)
        encoder_key_length = getattr(tester, "key_length", encoder_seq_length)

        # Count attention layers from hybrid pattern
        num_attention_layers = config.hybrid_override_pattern.count("*")

        def forward_in_eval_mode(model, model_class):
            # Runs `model` on the *current* content of `inputs_dict`, without tracking gradients.
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                return model(**self._prepare_for_class(inputs_dict, model_class))

        def check_first_attention_shape(attentions):
            # Only meaningful when the pattern contains at least one attention layer.
            if num_attention_layers > 0:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

        for model_class in self.all_model_classes:
            print(f"Testing model class: {model_class}")

            # 1) output_attentions requested through the call arguments
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class._from_config(config, attn_implementation="eager")
            config = model.config
            outputs = forward_in_eval_mode(model, model_class)
            self.assertEqual(len(outputs.attentions), num_attention_layers)

            # 2) check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            outputs = forward_in_eval_mode(model_class(config), model_class)
            attentions = outputs.attentions
            self.assertEqual(len(attentions), num_attention_layers)
            check_first_attention_shape(attentions)

            out_len = len(outputs)

            # 3) Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            outputs = forward_in_eval_mode(model_class(config), model_class)

            added_hidden_states = 1
            self.assertEqual(out_len + added_hidden_states, len(outputs))

            self_attentions = outputs.attentions
            self.assertEqual(len(self_attentions), num_attention_layers)
            check_first_attention_shape(self_attentions)

    @require_flash_attn
    @require_torch_accelerator
    @require_bitsandbytes
    @pytest.mark.flash_attn_test
    @slow
    def test_flash_attn_2_fp32_ln(self):
        r"""
        Overriding the test_flash_attn_2_fp32_ln test as the NemotronH model, like Zamba2, doesn't support
        right padding + use cache with FA2.

        Each generative model class is saved, reloaded in fp16 with flash-attention-2 and 4-bit
        quantization, has its half-precision parameters upcast to fp32, and is then run with and without
        an attention mask. The test passes if no call raises.
        """
        from transformers import BitsAndBytesConfig

        half_precision_dtypes = (torch.float16, torch.bfloat16)

        for model_class in self.all_generative_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)

                dummy_input = inputs_dict[model.main_input_name]
                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
                # NOTE: NemotronH does not support right padding + use_cache with FA2.
                dummy_attention_mask[:, -1] = 1

                model = model_class.from_pretrained(
                    tmpdirname,
                    dtype=torch.float16,
                    attn_implementation="flash_attention_2",
                    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
                )

                # Upcast every parameter that is still stored in half precision.
                for param in model.parameters():
                    if param.dtype in half_precision_dtypes:
                        param.data = param.data.to(torch.float32)

                # without attention mask
                _ = model(dummy_input)
                # with attention mask
                _ = model(dummy_input, attention_mask=dummy_attention_mask)

    @require_torch_accelerator
    def test_flex_attention_with_grads(self):
        """
        Run every model class once with the `flex_attention` implementation.

        Overwriting as the base hidden size is big enough for compile.
        Manipulation of dims causes issues due to other constraints not being satisfied anymore.
        """
        for model_class in self.all_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config._attn_implementation = "flex_attention"

            model = model_class(config).to(device=torch_device)
            self.assertTrue(model.config._attn_implementation == "flex_attention")

            # Elaborate workaround for encoder-decoder models as some do not specify their main input
            input_names = [model.main_input_name]
            if config.is_encoder_decoder:
                input_names += ["decoder_input_ids", "decoder_attention_mask"]
            dummy_inputs = {name: inputs_dict[name].to(torch_device) for name in input_names}

            # If this does not raise an error, the test passes (see https://github.com/huggingface/transformers/pull/35605)
            _ = model(**dummy_inputs)

    def test_layers_block_type_validation(self):
        """Test that a valid `layers_block_type` is accepted and an unknown layer type is rejected."""
        base_kwargs = {"vocab_size": 100, "hidden_size": 32}

        # A list made only of known layer types must be accepted.
        valid_config = NemotronHConfig(**base_kwargs, layers_block_type=["mamba", "moe", "attention", "moe"])
        self.assertEqual(len(valid_config.layers_block_type), 4)
        self.assertEqual(valid_config.num_hidden_layers, 4)

        # "invalid" is not a known layer type, so construction must fail.
        with self.assertRaises(StrictDataclassClassValidationError):
            NemotronHConfig(**base_kwargs, layers_block_type=["mamba", "moe", "attention", "invalid"])

    def test_layers_block_type(self):
        """Test `layers_block_type` access, the derived layer count and the legacy pattern property."""
        explicit_layers = ("mamba", "moe", "attention", "moe")
        config = NemotronHConfig(vocab_size=100, hidden_size=32, layers_block_type=list(explicit_layers))

        # Every entry must be readable by index.
        for position, layer_type in enumerate(explicit_layers):
            self.assertEqual(config.layers_block_type[position], layer_type)

        # num_hidden_layers is derived from the layers_block_type length.
        self.assertEqual(config.num_hidden_layers, 4)

        # Backward compatibility: the hybrid_override_pattern property.
        self.assertEqual(config.hybrid_override_pattern, "ME*E")

        # The config produced by the model tester uses its own five-layer layout.
        tester_layers = ("mamba", "moe", "mamba", "attention", "moe")
        config2 = self.model_tester.get_config()
        self.assertEqual(len(config2.layers_block_type), 5)
        for position, layer_type in enumerate(tester_layers):
            self.assertEqual(config2.layers_block_type[position], layer_type)

    def test_generate_with_and_without_cache(self):
        """Test that greedy generation yields identical token ids with `use_cache=True` and `use_cache=False`."""
        config = self.model_tester.prepare_config_and_inputs()[0]

        model = NemotronHForCausalLM(config=config)
        model.to(torch_device)
        model.eval()

        # Small prompt (batch_size=1, seq_len=5) to keep the test fast.
        input_ids = ids_tensor([1, 5], config.vocab_size).to(torch_device)

        def reset_seeds():
            # Same seed before each run so both start from the same RNG state.
            torch.manual_seed(0)
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(0)

        def greedy_generate(use_cache, banner):
            reset_seeds()
            with torch.no_grad():
                print(banner)
                return model.generate(
                    input_ids,
                    max_new_tokens=5,
                    do_sample=False,
                    use_cache=use_cache,
                )

        output_with_cache = greedy_generate(True, "running generate with cache")
        output_without_cache = greedy_generate(False, "running generate without cache")

        print(f"output_with_cache: {output_with_cache}")
        print(f"output_without_cache: {output_without_cache}")

        # Outputs should be identical
        outputs_match = torch.equal(output_with_cache, output_without_cache)
        self.assertTrue(
            outputs_match,
            msg=f"Outputs differ:\n  With cache: {output_with_cache}\n  Without cache: {output_without_cache}",
        )

    def test_legacy_hybrid_override_pattern(self):
        """Test that a legacy `hybrid_override_pattern` string is converted to `layers_block_type`."""
        short_config = NemotronHConfig(vocab_size=100, hidden_size=32, hybrid_override_pattern="ME*E")

        self.assertEqual(short_config.layers_block_type, ["mamba", "moe", "attention", "moe"])
        self.assertEqual(short_config.num_hidden_layers, 4)
        # The legacy property must still return the original pattern.
        self.assertEqual(short_config.hybrid_override_pattern, "ME*E")

        # Same conversion with a longer pattern.
        long_config = NemotronHConfig(vocab_size=100, hidden_size=32, hybrid_override_pattern="MEME*EME")
        expected_long_layers = ["mamba", "moe", "mamba", "moe", "attention", "moe", "mamba", "moe"]
        self.assertEqual(long_config.layers_block_type, expected_long_layers)
        self.assertEqual(long_config.num_hidden_layers, 8)

    def test_num_hidden_layers_deprecated(self):
        """Test that `num_hidden_layers` follows the `layers_block_type` length, even when passed explicitly."""
        six_layers = ["mamba", "moe", "attention", "moe", "mamba", "attention"]
        derived_config = NemotronHConfig(layers_block_type=six_layers)
        self.assertEqual(derived_config.num_hidden_layers, 6)

        # An explicit num_hidden_layers must be ignored when layers_block_type is provided.
        overridden_config = NemotronHConfig(layers_block_type=["mamba", "moe", "attention"], num_hidden_layers=10)
        self.assertEqual(overridden_config.num_hidden_layers, 3)

    def test_legacy_config_json_loading(self):
        """Test loading a legacy config.json that uses `hybrid_override_pattern` and `num_hidden_layers`."""
        import json

        legacy_pattern = "MEME*E"
        expected_layers = ["mamba", "moe", "mamba", "moe", "attention", "moe"]

        # Content of a config.json written in the legacy format.
        legacy_config = {
            "model_type": "nemotron_3",
            "vocab_size": 100,
            "hidden_size": 32,
            "num_hidden_layers": 6,
            "hybrid_override_pattern": legacy_pattern,
            "num_attention_heads": 4,
        }

        with tempfile.TemporaryDirectory() as tmpdir:
            config_path = f"{tmpdir}/config.json"
            with open(config_path, "w") as handle:
                handle.write(json.dumps(legacy_config))

            config = NemotronHConfig.from_json_file(config_path)

            # The pattern must have been converted to the list format and still round-trip.
            self.assertEqual(len(config.layers_block_type), 6)
            self.assertEqual(config.num_hidden_layers, 6)
            self.assertEqual(config.layers_block_type, expected_layers)
            self.assertEqual(config.hybrid_override_pattern, legacy_pattern)

    def test_mtp_backward_compatibility(self):
        """Test that a legacy `mtp_hybrid_override_pattern` is converted to `mtp_layers_block_type`."""
        config_kwargs = {
            "layers_block_type": ["mamba", "moe", "attention", "moe"],
            "num_nextn_predict_layers": 2,
            "mtp_hybrid_override_pattern": "*E",
        }
        config = NemotronHConfig(**config_kwargs)

        # List form after conversion, and the legacy pattern still readable.
        self.assertEqual(config.mtp_layers_block_type, ["attention", "moe"])
        self.assertEqual(config.mtp_hybrid_override_pattern, "*E")

    def test_config_roundtrip_save_load(self):
        """Test that a config survives a `save_pretrained` / `from_pretrained` round trip."""
        layer_layout = ("mamba", "attention", "moe", "attention")
        original = NemotronHConfig(vocab_size=100, hidden_size=32, layers_block_type=list(layer_layout))

        with tempfile.TemporaryDirectory() as tmpdir:
            original.save_pretrained(tmpdir)
            reloaded = NemotronHConfig.from_pretrained(tmpdir)

            self.assertEqual(reloaded.layers_block_type, list(layer_layout))
            self.assertEqual(reloaded.num_hidden_layers, 4)
            self.assertEqual(reloaded.vocab_size, 100)
            self.assertEqual(reloaded.hidden_size, 32)

    def test_pattern_conversion_methods(self):
        """Test `_pattern_to_list`, `_list_to_pattern` and their round trip."""
        to_list = NemotronHConfig._pattern_to_list
        to_pattern = NemotronHConfig._list_to_pattern

        # Pattern string -> list of layer types
        self.assertEqual(to_list("M*EME*"), ["mamba", "attention", "moe", "mamba", "moe", "attention"])

        # List of layer types -> pattern string
        self.assertEqual(to_pattern(["mamba", "moe", "attention", "moe"]), "ME*E")

        # Converting to a list and back must reproduce the input pattern.
        original_pattern = "ME*ME*E"
        self.assertEqual(original_pattern, to_pattern(to_list(original_pattern)))


@require_torch
class NemotronHModelIntegrationTest(unittest.TestCase):
    """Slow integration test that runs greedy generation on a pinned tiny NemotronH checkpoint."""

    model = None
    tokenizer = None

    @classmethod
    @slow
    def setUpClass(cls):
        """Load the model (in bfloat16) and the tokenizer once, both from the same pinned revision."""
        model_id = "dmax123/tiny-nemotron-dummy-weights"
        hub_kwargs = {"revision": "081dbac3061bb16c0c458c1798b1d9d7bc135c95"}
        cls.model = NemotronHForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, **hub_kwargs)
        cls.tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs)

    def setUp(self):
        """Save the global torch determinism settings, then switch to deterministic ones."""
        cudnn = torch.backends.cudnn

        # Remember the current settings so tearDown can restore them.
        self._original_deterministic = torch.are_deterministic_algorithms_enabled()
        self._original_cudnn_deterministic = cudnn.deterministic
        self._original_cudnn_benchmark = cudnn.benchmark

        # Deterministic settings for NemotronH tests.
        torch.use_deterministic_algorithms(True)
        cudnn.deterministic = True
        cudnn.benchmark = False

    def tearDown(self):
        """Restore the settings saved in `setUp`."""
        cudnn = torch.backends.cudnn
        torch.use_deterministic_algorithms(self._original_deterministic)
        cudnn.deterministic = self._original_cudnn_deterministic
        cudnn.benchmark = self._original_cudnn_benchmark

    @slow
    def test_simple_generate(self):
        """Greedy-generate 10 tokens for a fixed chat prompt and compare them with the recorded token ids."""
        self.model.to(torch_device)

        expected_token_ids = torch.tensor(
            [1045, 1429, 1073, 4525, 1605, 1261, 4249, 1044, 2081, 2224], dtype=torch.int32
        )

        chat = [{"role": "user", "content": "Hey how are you doing?"}]
        tokenized_chat = self.tokenizer.apply_chat_template(
            chat, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        )
        input_ids = tokenized_chat["input_ids"].to(torch_device)
        prompt_length = input_ids.shape[1]

        outputs = self.model.generate(input_ids, do_sample=False, max_new_tokens=10)

        # Keep only the newly generated part of the first (and only) sequence.
        generated_tokens = outputs[0][prompt_length:]
        self.assertTrue(torch.equal(generated_tokens.cpu(), expected_token_ids.cpu()))