first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/nemotron_h/test_modeling_nemotron_h.py
+++ b/tests/models/nemotron_h/test_modeling_nemotron_h.py
@@ -0,0 +1,857 @@
+# Copyright 2024-2025 NVIDIA Corporation and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch NemotronH model."""
+
+import tempfile
+import unittest
+
+import pytest
+from huggingface_hub.errors import StrictDataclassClassValidationError
+
+from transformers import AutoTokenizer, NemotronHConfig, NemotronHForCausalLM, is_torch_available
+from transformers.testing_utils import (
+    require_bitsandbytes,
+    require_flash_attn,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import DynamicCache, NemotronHForCausalLM, NemotronHModel
+
+
+class NemotronHModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        layers_block_type=["mamba", "moe", "mamba", "attention", "moe"],
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=32,
+        intermediate_size=40,
+        moe_intermediate_size=40,
+        moe_shared_expert_intermediate_size=40,
+        mlp_hidden_act="relu2",
+        mamba_hidden_act="silu",
+        max_position_embeddings=512,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        # Mamba-specific params
+        ssm_state_size=16,
+        mamba_num_heads=8,
+        mamba_n_groups=8,
+        mamba_head_dim=16,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_chunk_size=64,
+        # MoE params
+        n_routed_experts=8,
+        n_shared_experts=1,
+        num_experts_per_tok=2,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.layers_block_type = layers_block_type
+        # num_hidden_layers is now derived from layers_block_type length
+        self.num_hidden_layers = len(layers_block_type)
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
+        self.mlp_hidden_act = mlp_hidden_act
+        self.mamba_hidden_act = mamba_hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+
+        # Mamba params
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_chunk_size = mamba_chunk_size
+
+        # MoE params
+        self.n_routed_experts = n_routed_experts
+        self.n_shared_experts = n_shared_experts
+        self.num_experts_per_tok = num_experts_per_tok
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = self.get_config()
+
+        return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def get_config(self):
+        return NemotronHConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            layers_block_type=self.layers_block_type,
+            num_attention_heads=self.num_attention_heads,
+            num_key_value_heads=self.num_key_value_heads,
+            head_dim=self.head_dim,
+            intermediate_size=self.intermediate_size,
+            moe_intermediate_size=self.moe_intermediate_size,
+            moe_shared_expert_intermediate_size=self.moe_shared_expert_intermediate_size,
+            mlp_hidden_act=self.mlp_hidden_act,
+            mamba_hidden_act=self.mamba_hidden_act,
+            max_position_embeddings=self.max_position_embeddings,
+            is_decoder=True,
+            initializer_range=self.initializer_range,
+            use_mamba_kernels=False,
+            ssm_state_size=self.ssm_state_size,
+            mamba_num_heads=self.mamba_num_heads,
+            mamba_n_groups=self.mamba_n_groups,
+            mamba_head_dim=self.mamba_head_dim,
+            mamba_d_conv=self.mamba_d_conv,
+            mamba_expand=self.mamba_expand,
+            mamba_chunk_size=self.mamba_chunk_size,
+            n_routed_experts=self.n_routed_experts,
+            n_shared_experts=self.n_shared_experts,
+            num_experts_per_tok=self.num_experts_per_tok,
+        )
+
+    def prepare_config_and_inputs_for_decoder(self):
+        (
+            config,
+            input_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = self.prepare_config_and_inputs()
+
+        config.is_decoder = True
+
+        return (
+            config,
+            input_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        )
+
+    def create_and_check_model(self, config, input_ids, input_mask, _sequence_labels, _token_labels, _choice_labels):
+        model = NemotronHModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def create_and_check_for_causal_lm(
+        self,
+        config,
+        input_ids,
+        input_mask,
+        _sequence_labels,
+        token_labels,
+        _choice_labels,
+    ):
+        model = NemotronHForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+        result = model(input_ids, attention_mask=input_mask)
+        result = model(input_ids, labels=token_labels)
+        result = model(input_ids)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    def create_and_check_decoder_model_past_large_inputs(
+        self,
+        config,
+        input_ids,
+        input_mask,
+        _sequence_labels,
+        _token_labels,
+        _choice_labels,
+    ):
+        config.is_decoder = True
+        config.add_cross_attention = False
+        model = NemotronHForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # first forward pass
+        outputs = model(
+            input_ids,
+            attention_mask=input_mask,
+            use_cache=True,
+        )
+        past_key_values = outputs.past_key_values
+
+        # create hypothetical multiple next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+        next_mask = ids_tensor((self.batch_size, 1), vocab_size=2)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            next_input_ids,
+            attention_mask=next_attention_mask,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+        output_from_past = model(
+            next_tokens,
+            attention_mask=next_attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -1:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_mamba2_slow_vs_fast_forward(self, config, input_ids, *args):
+        """
+        Test that cuda_kernels_forward and torch_forward produce consistent outputs.
+        This ensures that the optimized CUDA kernel path and the pure PyTorch path
+        are equivalent.
+        """
+        model = NemotronHModel(config)
+        model.eval()
+
+        if not (is_mamba_ssm_available() and is_causal_conv1d_available()):
+            self.parent.skipTest(
+                "This test needs the Mamba2 fast path. Skipping as the necessary packages have not been found."
+            )
+        if torch_device != "cuda":
+            self.parent.skipTest("This test needs the Mamba2 fast path. Skipping as we need a cuda capable device.")
+
+        model.to(torch_device)
+
+        # Get the first mamba layer for testing
+        # Find the index of the first mamba layer
+        mamba_layer_idx = None
+        for idx, layer_type in enumerate(config.layers_block_type):
+            if layer_type == "mamba":
+                mamba_layer_idx = idx
+                break
+
+        if mamba_layer_idx is None:
+            self.parent.skipTest("No mamba layer found in the model configuration.")
+
+        # Get embeddings
+        token_emb = model.embeddings(input_ids.to(torch_device))
+
+        # Get the mamba mixer from the first mamba block
+        mamba_mixer = model.layers[mamba_layer_idx].mixer
+
+        # Test without cache
+        outputs_fast = mamba_mixer.cuda_kernels_forward(token_emb)
+        outputs_slow = mamba_mixer.torch_forward(token_emb)
+
+        self.parent.assertTrue(torch.allclose(outputs_fast, outputs_slow, atol=1e-3, rtol=1e-3))
+
+        # Test with cache
+        cache_params = DynamicCache(config=config)
+        outputs_fast_cached = mamba_mixer.cuda_kernels_forward(token_emb, cache_params=cache_params)
+
+        # Reset cache for fair comparison
+        cache_params_slow = DynamicCache(config=config)
+        outputs_slow_cached = mamba_mixer.torch_forward(token_emb, cache_params=cache_params_slow)
+
+        self.parent.assertTrue(torch.allclose(outputs_fast_cached, outputs_slow_cached, atol=1e-3, rtol=1e-3))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class NemotronHModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            NemotronHModel,
+            NemotronHForCausalLM,
+        )
+        if is_torch_available()
+        else ()
+    )
+    pipeline_model_mapping = (
+        {
+            "feature-extraction": NemotronHModel,
+            "text-generation": NemotronHForCausalLM,
+        }
+        if is_torch_available()
+        else {}
+    )
+
+    def _get_conv_state_shape(self, batch_size: int, config):
+        intermediate_size = config.mamba_num_heads * config.mamba_head_dim
+        conv_shape = (
+            batch_size,
+            intermediate_size + 2 * config.n_groups * config.ssm_state_size,
+            config.conv_kernel,
+        )
+        return conv_shape
+
+    def _get_recurrent_state_shape(self, batch_size: int, config):
+        return (batch_size, config.mamba_num_heads, config.mamba_head_dim, config.ssm_state_size)
+
+    def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_length, config):
+        # Raise a useful error, asking to explicitly override the method
+        if not isinstance(past_key_values, DynamicCache):
+            raise ValueError("The cache does not use the correct Cache")
+
+        # Use the correct config
+        config = config.get_text_config(decoder=True)
+
+        # (batch, kv heads, seq_length, head_dim)
+        # Only pure mamba models do not have num_attention_heads defined in config, so it can never be 1 in practice for attention models
+        num_attention_heads = getattr(config, "num_attention_heads", 1)
+        num_kv_heads = getattr(config, "num_key_value_heads", num_attention_heads)
+        hidden_size = getattr(config, "d_model", config.hidden_size)
+        head_dim = getattr(config, "head_dim", hidden_size // num_attention_heads)
+
+        # For cross attention cache, the seq_length depends on the model, so we remove that dim
+        attention_shape = (batch_size, num_kv_heads, seq_length, head_dim)
+        # For mamba layers
+        conv_shape = self._get_conv_state_shape(batch_size, config)
+        recurrent_shape = self._get_recurrent_state_shape(batch_size, config)
+
+        # Check each layer has the correct shape
+        for layer, layer_type in zip(past_key_values.layers, config.layer_types):
+            # Moe layers have a default mamba cache instantiated, but it stays empty as the layer does not use it
+            if layer_type == "moe":
+                self.assertEqual(layer.conv_states, None)
+                self.assertEqual(layer.recurrent_states, None)
+            # Attention layer cache
+            elif layer_type == "attention":
+                self.assertEqual(layer.keys.shape, attention_shape)
+                self.assertEqual(layer.values.shape, attention_shape)
+            # Mamba layer cache
+            elif layer_type == "mamba":
+                self.assertEqual(layer.conv_states.shape, conv_shape)
+                self.assertEqual(layer.recurrent_states.shape, recurrent_shape)
+            else:
+                raise ValueError("Unknown layer type.")
+
+    def setUp(self):
+        self.model_tester = NemotronHModelTester(self)
+        self.config_tester = ConfigTester(
+            self, config_class=NemotronHConfig, common_properties=["hidden_size", "num_attention_heads"]
+        )
+        # Save original settings
+        self._original_deterministic = torch.are_deterministic_algorithms_enabled()
+        self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
+        self._original_cudnn_benchmark = torch.backends.cudnn.benchmark
+        # Apply deterministic settings for NemotronH tests
+        torch.use_deterministic_algorithms(True)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+    def tearDown(self):
+        # Restore original settings
+        torch.use_deterministic_algorithms(self._original_deterministic)
+        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic
+        torch.backends.cudnn.benchmark = self._original_cudnn_benchmark
+
+    @unittest.skip(reason="NemotronH needs at least 3 layers to test (mamba, moe, attention)")
+    def test_num_layers_is_small(self):
+        pass
+
+    @unittest.skip("position_ids cannot be used to pad due to Mamba2 layers")
+    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
+        pass
+
+    @unittest.skip(reason="NemotronH has hybrid cache.")
+    def test_generate_continue_from_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="A large nemotron3 would be necessary (and costly) for that")
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    def test_reverse_loading_mapping(self):
+        super().test_reverse_loading_mapping(skip_base_model=True)
+
+    # TODO(liding):
+    # in test_configuration_common.py, three tests failed
+    # create_and_test_config_to_json_file
+    # create_and_test_config_from_and_save_pretrained
+    # create_and_test_config_from_and_save_pretrained_composite
+    # def test_config(self):
+    #     self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_for_causal_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
+
+    def test_decoder_model_past_with_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    def test_mamba2_slow_vs_fast_forward(self):
+        """
+        Test that cuda_kernels_forward and torch_forward produce consistent outputs.
+        """
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs)
+
+    def test_attention_outputs(self):
+        r"""
+        Overriding the test_attention_outputs test as the NemotronH model outputs attention only for its attention layers
+        """
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        seq_len = getattr(self.model_tester, "seq_length", None)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
+
+        # Count attention layers from hybrid pattern
+        num_attention_layers = config.hybrid_override_pattern.count("*")
+
+        for model_class in self.all_model_classes:
+            print(f"Testing model class: {model_class}")
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class._from_config(config, attn_implementation="eager")
+            config = model.config
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.attentions
+            self.assertEqual(len(attentions), num_attention_layers)
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.attentions
+            self.assertEqual(len(attentions), num_attention_layers)
+
+            if num_attention_layers > 0:
+                self.assertListEqual(
+                    list(attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                )
+
+            out_len = len(outputs)
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            added_hidden_states = 1
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+            self_attentions = outputs.attentions
+            self.assertEqual(len(self_attentions), num_attention_layers)
+
+            if num_attention_layers > 0:
+                self.assertListEqual(
+                    list(self_attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+                )
+
+    @require_flash_attn
+    @require_torch_accelerator
+    @require_bitsandbytes
+    @pytest.mark.flash_attn_test
+    @slow
+    def test_flash_attn_2_fp32_ln(self):
+        r"""
+        Overriding the test_flash_attn_2_fp32_ln test as the NemotronH model, like Zamba2, doesn't support
+        right padding + use cache with FA2
+        """
+        from transformers import BitsAndBytesConfig
+
+        for model_class in self.all_generative_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                dummy_input = inputs_dict[model.main_input_name]
+                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
+                # NOTE: NemotronH does not support right padding + use_cache with FA2.
+                dummy_attention_mask[:, -1] = 1
+
+                model = model_class.from_pretrained(
+                    tmpdirname,
+                    dtype=torch.float16,
+                    attn_implementation="flash_attention_2",
+                    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+                )
+
+                for _, param in model.named_parameters():
+                    # upcast only layer norms
+                    if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
+                        param.data = param.data.to(torch.float32)
+
+                _ = model(dummy_input)
+                # with attention mask
+                _ = model(dummy_input, attention_mask=dummy_attention_mask)
+
+    @require_torch_accelerator
+    def test_flex_attention_with_grads(self):
+        """
+        Overwriting as the base hidden size is big enough for compile.
+        Manipulation of dims causes issues due to other constraints not being satisfied anymore.
+        """
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            config._attn_implementation = "flex_attention"
+
+            model = model_class(config).to(device=torch_device)
+            self.assertTrue(model.config._attn_implementation == "flex_attention")
+
+            # Elaborate workaround for encoder-decoder models as some do not specify their main input
+            dummy_inputs = {model.main_input_name: inputs_dict[model.main_input_name].to(torch_device)}
+            if config.is_encoder_decoder:
+                dummy_inputs["decoder_input_ids"] = inputs_dict["decoder_input_ids"].to(torch_device)
+                dummy_inputs["decoder_attention_mask"] = inputs_dict["decoder_attention_mask"].to(torch_device)
+
+            # If this does not raise an error, the test passes (see https://github.com/huggingface/transformers/pull/35605)
+            _ = model(**dummy_inputs)
+
+    def test_layers_block_type_validation(self):
+        """Test that layers_block_type is validated correctly"""
+
+        # Valid list - should work
+        config = NemotronHConfig(
+            vocab_size=100, hidden_size=32, layers_block_type=["mamba", "moe", "attention", "moe"]
+        )
+        self.assertEqual(len(config.layers_block_type), 4)
+        self.assertEqual(config.num_hidden_layers, 4)
+
+        # Invalid layer type - should raise error
+        with self.assertRaises(StrictDataclassClassValidationError):
+            NemotronHConfig(
+                vocab_size=100,
+                hidden_size=32,
+                layers_block_type=["mamba", "moe", "attention", "invalid"],  # "invalid" is not valid
+            )
+
+    def test_layers_block_type(self):
+        """Test that layers_block_type works correctly and backward compatibility"""
+        # Create config with explicit list
+        config = NemotronHConfig(
+            vocab_size=100, hidden_size=32, layers_block_type=["mamba", "moe", "attention", "moe"]
+        )
+
+        # Test direct access to layers_block_type
+        self.assertEqual(config.layers_block_type[0], "mamba")
+        self.assertEqual(config.layers_block_type[1], "moe")
+        self.assertEqual(config.layers_block_type[2], "attention")
+        self.assertEqual(config.layers_block_type[3], "moe")
+
+        # Test that num_hidden_layers is derived from layers_block_type length
+        self.assertEqual(config.num_hidden_layers, 4)
+
+        # Test backward compatibility - hybrid_override_pattern property
+        self.assertEqual(config.hybrid_override_pattern, "ME*E")
+
+        # Test the model tester config
+        config2 = self.model_tester.get_config()
+        self.assertEqual(len(config2.layers_block_type), 5)
+        self.assertEqual(config2.layers_block_type[0], "mamba")
+        self.assertEqual(config2.layers_block_type[1], "moe")
+        self.assertEqual(config2.layers_block_type[2], "mamba")
+        self.assertEqual(config2.layers_block_type[3], "attention")
+        self.assertEqual(config2.layers_block_type[4], "moe")
+
+    def test_generate_with_and_without_cache(self):
+        """Test that generation with and without cache produces identical outputs"""
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        config = config_and_inputs[0]
+
+        # Create model
+        model = NemotronHForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # Create input for generation (smaller sequence for faster test)
+        input_ids = ids_tensor([1, 5], config.vocab_size)  # batch_size=1, seq_len=5
+        input_ids = input_ids.to(torch_device)
+
+        # Set seed for reproducibility
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+        # Generate with cache
+        with torch.no_grad():
+            print("running generate with cache")
+            output_with_cache = model.generate(
+                input_ids,
+                max_new_tokens=5,
+                do_sample=False,
+                use_cache=True,
+            )
+
+        # Reset seed
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+        # Generate without cache
+        with torch.no_grad():
+            print("running generate without cache")
+            output_without_cache = model.generate(
+                input_ids,
+                max_new_tokens=5,
+                do_sample=False,
+                use_cache=False,
+            )
+
+        print(f"output_with_cache: {output_with_cache}")
+        print(f"output_without_cache: {output_without_cache}")
+
+        # Outputs should be identical
+        self.assertTrue(
+            torch.equal(output_with_cache, output_without_cache),
+            msg=f"Outputs differ:\n  With cache: {output_with_cache}\n  Without cache: {output_without_cache}",
+        )
+
+    def test_legacy_hybrid_override_pattern(self):
+        """Test backward compatibility with legacy hybrid_override_pattern"""
+        # Create config using legacy hybrid_override_pattern
+        config = NemotronHConfig(vocab_size=100, hidden_size=32, hybrid_override_pattern="ME*E")
+
+        # Test that it's converted to layers_block_type
+        self.assertEqual(config.layers_block_type, ["mamba", "moe", "attention", "moe"])
+        self.assertEqual(config.num_hidden_layers, 4)
+        self.assertEqual(config.hybrid_override_pattern, "ME*E")
+
+        # Test with longer pattern
+        config2 = NemotronHConfig(vocab_size=100, hidden_size=32, hybrid_override_pattern="MEME*EME")
+        self.assertEqual(
+            config2.layers_block_type, ["mamba", "moe", "mamba", "moe", "attention", "moe", "mamba", "moe"]
+        )
+        self.assertEqual(config2.num_hidden_layers, 8)
+
+    def test_num_hidden_layers_deprecated(self):
+        """Test that num_hidden_layers is now derived from layers_block_type length"""
+        # Test that num_hidden_layers is derived from layers_block_type
+        config = NemotronHConfig(layers_block_type=["mamba", "moe", "attention", "moe", "mamba", "attention"])
+        self.assertEqual(config.num_hidden_layers, 6)
+
+        # Test that num_hidden_layers parameter is ignored when layers_block_type is provided
+        config2 = NemotronHConfig(
+            layers_block_type=["mamba", "moe", "attention"],
+            num_hidden_layers=10,  # This should be ignored
+        )
+        # Should use layers_block_type length, not the parameter
+        self.assertEqual(config2.num_hidden_layers, 3)
+
+    def test_legacy_config_json_loading(self):
+        """Test loading legacy config.json with hybrid_override_pattern and num_hidden_layers"""
+        import json
+
+        # Create a legacy config.json
+        legacy_config = {
+            "model_type": "nemotron_3",
+            "vocab_size": 100,
+            "hidden_size": 32,
+            "num_hidden_layers": 6,
+            "hybrid_override_pattern": "MEME*E",
+            "num_attention_heads": 4,
+        }
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config_path = f"{tmpdir}/config.json"
+            with open(config_path, "w") as f:
+                json.dump(legacy_config, f)
+
+            # Load the config
+            config = NemotronHConfig.from_json_file(config_path)
+
+            # Verify conversion
+            self.assertEqual(len(config.layers_block_type), 6)
+            self.assertEqual(config.num_hidden_layers, 6)
+            self.assertEqual(config.layers_block_type, ["mamba", "moe", "mamba", "moe", "attention", "moe"])
+            self.assertEqual(config.hybrid_override_pattern, "MEME*E")
+
+    def test_mtp_backward_compatibility(self):
+        """Test MTP backward compatibility with mtp_hybrid_override_pattern"""
+        config = NemotronHConfig(
+            layers_block_type=["mamba", "moe", "attention", "moe"],
+            num_nextn_predict_layers=2,
+            mtp_hybrid_override_pattern="*E",
+        )
+
+        # Verify conversion
+        self.assertEqual(config.mtp_layers_block_type, ["attention", "moe"])
+        self.assertEqual(config.mtp_hybrid_override_pattern, "*E")
+
+    def test_config_roundtrip_save_load(self):
+        """Test that config can be saved and loaded correctly"""
+        # Create config with new format
+        config1 = NemotronHConfig(
+            vocab_size=100, hidden_size=32, layers_block_type=["mamba", "attention", "moe", "attention"]
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Save
+            config1.save_pretrained(tmpdir)
+
+            # Load
+            config2 = NemotronHConfig.from_pretrained(tmpdir)
+
+            # Verify
+            self.assertEqual(config2.layers_block_type, ["mamba", "attention", "moe", "attention"])
+            self.assertEqual(config2.num_hidden_layers, 4)
+            self.assertEqual(config2.vocab_size, 100)
+            self.assertEqual(config2.hidden_size, 32)
+
+    def test_pattern_conversion_methods(self):
+        """Test the pattern conversion utility methods"""
+        # Test _pattern_to_list
+        pattern = "M*EME*"
+        layers_list = NemotronHConfig._pattern_to_list(pattern)
+        self.assertEqual(layers_list, ["mamba", "attention", "moe", "mamba", "moe", "attention"])
+
+        # Test _list_to_pattern
+        layers_list = ["mamba", "moe", "attention", "moe"]
+        pattern = NemotronHConfig._list_to_pattern(layers_list)
+        self.assertEqual(pattern, "ME*E")
+
+        # Test roundtrip
+        original_pattern = "ME*ME*E"
+        roundtrip_pattern = NemotronHConfig._list_to_pattern(NemotronHConfig._pattern_to_list(original_pattern))
+        self.assertEqual(original_pattern, roundtrip_pattern)
+
+
+@require_torch
+class NemotronHModelIntegrationTest(unittest.TestCase):
+    model = None
+    tokenizer = None
+
+    @classmethod
+    @slow
+    def setUpClass(cls):
+        model_id = "dmax123/tiny-nemotron-dummy-weights"
+        revision = "081dbac3061bb16c0c458c1798b1d9d7bc135c95"
+        cls.model = NemotronHForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, revision=revision)
+        cls.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+
+    def setUp(self):
+        # Save original settings
+        self._original_deterministic = torch.are_deterministic_algorithms_enabled()
+        self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
+        self._original_cudnn_benchmark = torch.backends.cudnn.benchmark
+        # Apply deterministic settings for NemotronH tests
+        torch.use_deterministic_algorithms(True)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+    def tearDown(self):
+        # Restore original settings
+        torch.use_deterministic_algorithms(self._original_deterministic)
+        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic
+        torch.backends.cudnn.benchmark = self._original_cudnn_benchmark
+
+    @slow
+    def test_simple_generate(self):
+        self.model.to(torch_device)
+
+        prompt = "Hey how are you doing?"
+        EXPECTED_TOKENS_IDS = torch.tensor(
+            [1045, 1429, 1073, 4525, 1605, 1261, 4249, 1044, 2081, 2224], dtype=torch.int32
+        )
+
+        messages = [{"role": "user", "content": prompt}]
+        tokenized_chat = self.tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+        )
+        input_ids = tokenized_chat["input_ids"].to(torch_device)
+        prompt_length = input_ids.shape[1]
+
+        outputs = self.model.generate(input_ids, do_sample=False, max_new_tokens=10)
+
+        generated_tokens = outputs[0][prompt_length:]
+        self.assertTrue(torch.equal(generated_tokens.cpu(), EXPECTED_TOKENS_IDS.cpu()))