first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/qwen3_next/test_modeling_qwen3_next.py
+++ b/tests/models/qwen3_next/test_modeling_qwen3_next.py
@@ -0,0 +1,294 @@
+# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import unittest
+
+from parameterized import parameterized
+
+from transformers import DataCollatorWithFlattening, is_torch_available
+from transformers.testing_utils import (
+    require_causal_conv1d,
+    require_flash_linear_attention,
+    require_torch,
+    require_torch_gpu,
+    require_torch_multi_gpu,
+    slow,
+    torch_device,
+)
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        DynamicCache,
+        Qwen3NextForCausalLM,
+        Qwen3NextModel,
+    )
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+from ...test_modeling_common import (
+    TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
+    _test_eager_matches_sdpa_inference,
+    ids_tensor,
+)
+
+
+class Qwen3NextModelTester(CausalLMModelTester):
+    if is_torch_available():
+        base_model_class = Qwen3NextModel
+
+    def __init__(self, parent):
+        super().__init__(parent=parent)
+        # NOTE(3outeille): must be 0.0 for TP backward tests. In train mode, non-zero dropout causes
+        # different RNG states between the non-TP and TP model forward passes (they run sequentially),
+        # leading to different dropout masks and mismatched losses.
+        self.attention_probs_dropout_prob = 0.0
+        self.hidden_act = "silu"
+        self.layer_types = ["linear_attention", "full_attention"]
+        self.linear_conv_kernel_dim = 2
+        self.linear_key_head_dim = 16
+        self.linear_value_head_dim = 16
+        self.linear_num_key_heads = 4
+        self.linear_num_value_heads = 8
+
+
+@require_torch
+class Qwen3NextModelTest(CausalLMModelTest, unittest.TestCase):
+    model_tester_class = Qwen3NextModelTester
+
+    def _get_conv_state_shape(self, batch_size: int, config):
+        num_v_heads = config.linear_num_value_heads
+        num_k_heads = config.linear_num_key_heads
+        head_k_dim = config.linear_key_head_dim
+        head_v_dim = config.linear_value_head_dim
+        intermediate_size = 2 * num_k_heads * head_k_dim + num_v_heads * head_v_dim
+
+        return (batch_size, intermediate_size, config.linear_conv_kernel_dim)
+
+    def _get_recurrent_state_shape(self, batch_size: int, config):
+        num_v_heads = config.linear_num_value_heads
+        head_k_dim = config.linear_key_head_dim
+        head_v_dim = config.linear_value_head_dim
+
+        return (batch_size, num_v_heads, head_k_dim, head_v_dim)
+
+    @unittest.skip("Qwen3-Next hybrid linear-attention cache is not compatible with quantized cache yet.")
+    def test_generate_with_quant_cache(self):
+        pass
+
+    def test_attention_outputs(self):
+        "Needs to be overwritten as Qwen3-Next alternates between attention layers and gated deltanet layers."
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+        # force eager attention to support output attentions
+        config._attn_implementation = "eager"
+        seq_len = getattr(self.model_tester, "seq_length", None)
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class._from_config(config, attn_implementation="eager")
+            config = model.config
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.attentions
+            self.assertEqual(len(attentions), sum(layer == "full_attention" for layer in config.layer_types))
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.attentions
+            self.assertEqual(len(attentions), sum(layer == "full_attention" for layer in config.layer_types))
+            self.assertListEqual(list(attentions[0].shape[-3:]), [config.num_attention_heads, seq_len, seq_len])
+            out_len = len(outputs)
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+                self_attentions = outputs.attentions
+
+            self.assertEqual(out_len + 1, len(outputs))
+            self.assertEqual(len(self_attentions), sum(layer == "full_attention" for layer in config.layer_types))
+            self.assertListEqual(list(self_attentions[0].shape[-3:]), [config.num_attention_heads, seq_len, seq_len])
+
+    def test_linear_attention_multi_token_cached_forward_matches_single_token(self):
+        """
+        Qwen3-Next's gated-delta-net layers must produce the same output for a token regardless of
+        whether it's fed as a single-token cached forward or as the first token of a multi-token chunk
+        after the cache has been populated (chunked-prefill continuation / speculative verification).
+        A causal LM's logits at position `i` cannot depend on tokens at positions > `i`, even across
+        separate forward calls with a shared cache.
+        """
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        config._attn_implementation = "eager"
+        # GatedDeltaNet's fused norm-gate kernel only supports silu/swish/sigmoid; the shared tester
+        # default `gelu` would raise before exercising the cache path.
+        config.hidden_act = "silu"
+        model = Qwen3NextModel._from_config(config)
+        model.to(torch_device)
+        model.eval()
+
+        prefill_len = 8
+        prompt = ids_tensor((1, prefill_len), config.vocab_size).to(torch_device)
+        next_token = ids_tensor((1, 1), config.vocab_size).to(torch_device)
+
+        # Reference: prefill, then forward the next token alone with the populated cache.
+        cache_single = DynamicCache(config=config)
+        with torch.no_grad():
+            model(input_ids=prompt, past_key_values=cache_single, use_cache=True)
+            single_out = model(input_ids=next_token, past_key_values=cache_single, use_cache=True)
+        ref_first = single_out.last_hidden_state[:, 0, :]
+
+        # Under test: prefill, then forward [next_token, *distractors] in one call. The first
+        # position must match the single-token forward exactly (causal attention).
+        distractors = ids_tensor((1, 7), config.vocab_size).to(torch_device)
+        multi_input = torch.cat([next_token, distractors], dim=1)
+        cache_multi = DynamicCache(config=config)
+        with torch.no_grad():
+            model(input_ids=prompt, past_key_values=cache_multi, use_cache=True)
+            multi_out = model(input_ids=multi_input, past_key_values=cache_multi, use_cache=True)
+        under_test_first = multi_out.last_hidden_state[:, 0, :]
+
+        torch.testing.assert_close(under_test_first, ref_first, rtol=1e-4, atol=1e-4)
+
+    @require_causal_conv1d
+    @require_flash_linear_attention
+    @require_torch_gpu
+    def test_padding_free_matches_padded_fast_path_regression(self):
+        torch.manual_seed(0)
+        config = self.model_tester.get_config()
+        model = Qwen3NextForCausalLM(config).to(torch_device).eval()
+
+        data_collator = DataCollatorWithFlattening(
+            return_tensors="pt", return_seq_idx=True, return_flash_attn_kwargs=True
+        )
+        test_cases = [
+            (
+                torch.tensor([[0, 0, 0, 1, 2, 3], [0, 0, 0, 0, 4, 5]], device=torch_device),
+                torch.tensor([[0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1]], dtype=torch.long, device=torch_device),
+                [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}],
+            ),
+            (
+                torch.tensor([[0, 1, 2, 3, 4, 5], [0, 0, 0, 0, 0, 6]], device=torch_device),
+                torch.tensor([[0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 1]], dtype=torch.long, device=torch_device),
+                [{"input_ids": [1, 2, 3, 4, 5]}, {"input_ids": [6]}],
+            ),
+        ]
+
+        for padded_input_ids, attention_mask, features in test_cases:
+            position_ids = ((attention_mask == 1).long().cumsum(dim=1) - 1) * (attention_mask == 1).long()
+            padding_free_batch = data_collator(features)
+            padding_free_batch = {
+                key: value.to(torch_device) if torch.is_tensor(value) else value
+                for key, value in padding_free_batch.items()
+            }
+
+            with torch.no_grad():
+                res_padded = model(
+                    input_ids=padded_input_ids,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    use_cache=False,
+                )
+                res_padfree = model(**padding_free_batch, use_cache=False)
+
+            logits_padded = res_padded.logits[attention_mask.bool()]
+            logits_padfree = res_padfree.logits[0]
+
+            torch.testing.assert_close(logits_padded, logits_padfree, atol=1e-5, rtol=1e-5)
+
+    @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
+    def test_eager_matches_sdpa_inference(
+        self,
+        name,
+        dtype,
+        padding_side,
+        use_attention_mask,
+        output_attentions,
+        enable_kernels,
+    ):
+        """
+        We need to overwrite this without the fp16 part of the dtype, because the slow path `torch_chunk_gated_delta_rule`
+        is not robust enough (flaky test) in fp16 due to upscaling in fp32 and then downscaling to fp16 at the end
+        """
+        if dtype == "fp16":
+            self.skipTest("Not robust in fp16")
+        _test_eager_matches_sdpa_inference(
+            self,
+            name,
+            dtype,
+            padding_side,
+            use_attention_mask,
+            output_attentions,
+            enable_kernels,
+        )
+
+    @unittest.skip("The specific cache format cannot be instantiated from dp/ddp data.")
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    @require_torch_multi_gpu
+    def test_can_use_device_map(self):
+        """
+        Test that this model can be dispatched on multiple gpus. It's not obvious as the Cache is not standard,
+        ant each layer need to use the correct device on which it reside (i.e. it needs to be lazy initialized).
+        """
+        for model_class in self.all_generative_model_classes:
+            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
+            inputs_dict = {k: v.to(0) if isinstance(v, torch.Tensor) else v for k, v in inputs_dict.items()}
+            # We want the linear attention layer to reside on device 1 with the device map (i.e. not the first/default device),
+            # to check if cache initialization is on the correct device
+            config.layer_types = ["full_attention", "linear_attention"]
+            model = model_class(config).eval()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                del model
+                model = model_class.from_pretrained(
+                    tmpdirname,
+                    device_map={
+                        "lm_head": 0,
+                        "model.embed_tokens": 0,
+                        "model.norm": 0,
+                        "model.layers.0": 0,
+                        "model.layers.1": 1,
+                    },
+                )
+
+                # Check that we indeed use 2 different devices for each layer
+                self.assertTrue({param.device for param in model.model.layers[0].parameters()} == {torch.device(0)})
+                self.assertTrue({param.device for param in model.model.layers[1].parameters()} == {torch.device(1)})
+
+                # This should not crash
+                _ = model.generate(**inputs_dict, max_new_tokens=5, min_new_tokens=5)
+
+
+@slow
+class Qwen3NextIntegrationTest(unittest.TestCase):
+    pass