Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
295 lines
13 KiB
Python
295 lines
13 KiB
Python
# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import tempfile
|
|
import unittest
|
|
|
|
from parameterized import parameterized
|
|
|
|
from transformers import DataCollatorWithFlattening, is_torch_available
|
|
from transformers.testing_utils import (
|
|
require_causal_conv1d,
|
|
require_flash_linear_attention,
|
|
require_torch,
|
|
require_torch_gpu,
|
|
require_torch_multi_gpu,
|
|
slow,
|
|
torch_device,
|
|
)
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
from transformers import (
|
|
DynamicCache,
|
|
Qwen3NextForCausalLM,
|
|
Qwen3NextModel,
|
|
)
|
|
|
|
from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
|
|
from ...test_modeling_common import (
|
|
TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
|
|
_test_eager_matches_sdpa_inference,
|
|
ids_tensor,
|
|
)
|
|
|
|
|
|
class Qwen3NextModelTester(CausalLMModelTester):
    """Tester overrides for Qwen3-Next: a hybrid linear/full attention layer pattern with small linear-attention dims."""

    if is_torch_available():
        base_model_class = Qwen3NextModel

    def __init__(self, parent):
        super().__init__(parent=parent)

        # NOTE(3outeille): must be 0.0 for TP backward tests. In train mode, non-zero dropout causes
        # different RNG states between the non-TP and TP model forward passes (they run sequentially),
        # leading to different dropout masks and mismatched losses.
        self.attention_probs_dropout_prob = 0.0
        self.hidden_act = "silu"

        # One layer of each kind, so both code paths are exercised by the common tests.
        self.layer_types = ["linear_attention", "full_attention"]

        # Linear-attention dimensions: twice as many value heads as key heads, identical head sizes.
        self.linear_num_key_heads = 4
        self.linear_num_value_heads = 8
        self.linear_key_head_dim = 16
        self.linear_value_head_dim = 16
        self.linear_conv_kernel_dim = 2
|
|
|
|
|
|
@require_torch
class Qwen3NextModelTest(CausalLMModelTest, unittest.TestCase):
    """Common causal-LM tests for Qwen3-Next, with overrides for its hybrid layer stack.

    The tester config mixes `linear_attention` and `full_attention` layers, so tests that assume
    every layer returns attention weights or uses a standard cache are overridden or skipped here.
    """

    model_tester_class = Qwen3NextModelTester

    def _get_conv_state_shape(self, batch_size: int, config) -> tuple[int, int, int]:
        """Return the expected conv-state shape: `(batch_size, 2 * key_dim + value_dim, conv_kernel_dim)`."""
        key_dim = config.linear_num_key_heads * config.linear_key_head_dim
        value_dim = config.linear_num_value_heads * config.linear_value_head_dim
        intermediate_size = 2 * key_dim + value_dim

        return (batch_size, intermediate_size, config.linear_conv_kernel_dim)

    def _get_recurrent_state_shape(self, batch_size: int, config) -> tuple[int, int, int, int]:
        """Return the expected recurrent-state shape: `(batch_size, num_v_heads, head_k_dim, head_v_dim)`."""
        num_v_heads = config.linear_num_value_heads
        head_k_dim = config.linear_key_head_dim
        head_v_dim = config.linear_value_head_dim

        return (batch_size, num_v_heads, head_k_dim, head_v_dim)

    @unittest.skip("Qwen3-Next hybrid linear-attention cache is not compatible with quantized cache yet.")
    def test_generate_with_quant_cache(self):
        pass

    def test_attention_outputs(self):
        """
        Needs to be overwritten as Qwen3-Next alternates between attention layers and gated deltanet layers:
        only the `full_attention` layers are expected to return attention weights.
        """
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True
        # force eager attention to support output attentions
        config._attn_implementation = "eager"
        seq_len = getattr(self.model_tester, "seq_length", None)

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class._from_config(config, attn_implementation="eager")
            config = model.config
            # Number of attention tensors every forward pass below is expected to return.
            num_full_attention_layers = sum(layer == "full_attention" for layer in config.layer_types)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            self.assertEqual(len(attentions), num_full_attention_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            self.assertEqual(len(attentions), num_full_attention_layers)
            self.assertListEqual(list(attentions[0].shape[-3:]), [config.num_attention_heads, seq_len, seq_len])
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            self_attentions = outputs.attentions

            # Requesting hidden states adds exactly one entry to the output tuple.
            self.assertEqual(out_len + 1, len(outputs))
            self.assertEqual(len(self_attentions), num_full_attention_layers)
            self.assertListEqual(list(self_attentions[0].shape[-3:]), [config.num_attention_heads, seq_len, seq_len])

    def test_linear_attention_multi_token_cached_forward_matches_single_token(self):
        """
        Qwen3-Next's gated-delta-net layers must produce the same output for a token regardless of
        whether it's fed as a single-token cached forward or as the first token of a multi-token chunk
        after the cache has been populated (chunked-prefill continuation / speculative verification).
        A causal LM's logits at position `i` cannot depend on tokens at positions > `i`, even across
        separate forward calls with a shared cache.
        """
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        config._attn_implementation = "eager"
        # GatedDeltaNet's fused norm-gate kernel only supports silu/swish/sigmoid; the shared tester
        # default `gelu` would raise before exercising the cache path.
        config.hidden_act = "silu"
        model = Qwen3NextModel._from_config(config)
        model.to(torch_device)
        model.eval()

        prefill_len = 8
        prompt = ids_tensor((1, prefill_len), config.vocab_size).to(torch_device)
        next_token = ids_tensor((1, 1), config.vocab_size).to(torch_device)

        # Reference: prefill, then forward the next token alone with the populated cache.
        cache_single = DynamicCache(config=config)
        with torch.no_grad():
            model(input_ids=prompt, past_key_values=cache_single, use_cache=True)
            single_out = model(input_ids=next_token, past_key_values=cache_single, use_cache=True)
        ref_first = single_out.last_hidden_state[:, 0, :]

        # Under test: prefill, then forward [next_token, *distractors] in one call. The first
        # position must match the single-token forward exactly (causal attention).
        distractors = ids_tensor((1, 7), config.vocab_size).to(torch_device)
        multi_input = torch.cat([next_token, distractors], dim=1)
        cache_multi = DynamicCache(config=config)
        with torch.no_grad():
            model(input_ids=prompt, past_key_values=cache_multi, use_cache=True)
            multi_out = model(input_ids=multi_input, past_key_values=cache_multi, use_cache=True)
        under_test_first = multi_out.last_hidden_state[:, 0, :]

        torch.testing.assert_close(under_test_first, ref_first, rtol=1e-4, atol=1e-4)

    @require_causal_conv1d
    @require_flash_linear_attention
    @require_torch_gpu
    def test_padding_free_matches_padded_fast_path_regression(self):
        """
        Logits of a left-padded batch (restricted to non-padded positions) must match the logits of the
        same sequences flattened into a single padding-free row by `DataCollatorWithFlattening`.
        """
        torch.manual_seed(0)
        config = self.model_tester.get_config()
        model = Qwen3NextForCausalLM(config).to(torch_device).eval()

        data_collator = DataCollatorWithFlattening(
            return_tensors="pt", return_seq_idx=True, return_flash_attn_kwargs=True
        )
        # Each case: (left-padded input ids, matching attention mask, the same sequences without padding).
        test_cases = [
            (
                torch.tensor([[0, 0, 0, 1, 2, 3], [0, 0, 0, 0, 4, 5]], device=torch_device),
                torch.tensor([[0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1]], dtype=torch.long, device=torch_device),
                [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}],
            ),
            (
                torch.tensor([[0, 1, 2, 3, 4, 5], [0, 0, 0, 0, 0, 6]], device=torch_device),
                torch.tensor([[0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 1]], dtype=torch.long, device=torch_device),
                [{"input_ids": [1, 2, 3, 4, 5]}, {"input_ids": [6]}],
            ),
        ]

        for padded_input_ids, attention_mask, features in test_cases:
            # subTest makes a failure report which case diverged instead of just the first assertion.
            with self.subTest(features=features):
                # Positions count from 0 at the first real token; padded positions are zeroed out.
                position_ids = ((attention_mask == 1).long().cumsum(dim=1) - 1) * (attention_mask == 1).long()
                padding_free_batch = data_collator(features)
                padding_free_batch = {
                    key: value.to(torch_device) if torch.is_tensor(value) else value
                    for key, value in padding_free_batch.items()
                }

                with torch.no_grad():
                    res_padded = model(
                        input_ids=padded_input_ids,
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        use_cache=False,
                    )
                    res_padfree = model(**padding_free_batch, use_cache=False)

                # Boolean-mask indexing flattens the real tokens row by row, matching the collator's order.
                logits_padded = res_padded.logits[attention_mask.bool()]
                logits_padfree = res_padfree.logits[0]

                torch.testing.assert_close(logits_padded, logits_padfree, atol=1e-5, rtol=1e-5)

    @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
    def test_eager_matches_sdpa_inference(
        self,
        name,
        dtype,
        padding_side,
        use_attention_mask,
        output_attentions,
        enable_kernels,
    ):
        """
        We need to overwrite this without the fp16 part of the dtype, because the slow path `torch_chunk_gated_delta_rule`
        is not robust enough (flaky test) in fp16 due to upscaling in fp32 and then downscaling to fp16 at the end
        """
        if dtype == "fp16":
            self.skipTest("Not robust in fp16")
        _test_eager_matches_sdpa_inference(
            self,
            name,
            dtype,
            padding_side,
            use_attention_mask,
            output_attentions,
            enable_kernels,
        )

    @unittest.skip("The specific cache format cannot be instantiated from dp/ddp data.")
    def test_multi_gpu_data_parallel_forward(self):
        pass

    @require_torch_multi_gpu
    def test_can_use_device_map(self):
        """
        Test that this model can be dispatched on multiple gpus. It's not obvious as the Cache is not standard,
        and each layer needs to use the correct device on which it resides (i.e. it needs to be lazy initialized).
        """
        for model_class in self.all_generative_model_classes:
            config, inputs_dict = self.prepare_config_and_inputs_for_generate()
            inputs_dict = {k: v.to(0) if isinstance(v, torch.Tensor) else v for k, v in inputs_dict.items()}
            # We want the linear attention layer to reside on device 1 with the device map (i.e. not the first/default device),
            # to check if cache initialization is on the correct device
            config.layer_types = ["full_attention", "linear_attention"]
            model = model_class(config).eval()

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                del model
                model = model_class.from_pretrained(
                    tmpdirname,
                    device_map={
                        "lm_head": 0,
                        "model.embed_tokens": 0,
                        "model.norm": 0,
                        "model.layers.0": 0,
                        "model.layers.1": 1,
                    },
                )

            # Check that we indeed use 2 different devices for each layer
            # (assertEqual rather than assertTrue(a == b) so a failure shows the actual device sets)
            self.assertEqual({param.device for param in model.model.layers[0].parameters()}, {torch.device(0)})
            self.assertEqual({param.device for param in model.model.layers[1].parameters()}, {torch.device(1)})

            # This should not crash
            _ = model.generate(**inputs_dict, max_new_tokens=5, min_new_tokens=5)
|
|
|
|
|
|
@slow
class Qwen3NextIntegrationTest(unittest.TestCase):
    """Placeholder for slow Qwen3-Next integration tests; no test methods are defined yet."""
|