# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
import tempfile
import unittest

from parameterized import parameterized

from transformers import is_torch_available
from transformers.testing_utils import (
    require_cuda_capability_at_least,
    require_torch,
    require_torch_accelerator,
    require_torch_large_accelerator,
    require_torch_n_accelerators,
    slow,
    torch_device,
)


if is_torch_available():
    import torch

    from transformers import (
        AutoConfig,
        AutoModelForCausalLM,
        AutoTokenizer,
        DeepseekV4Model,
        FineGrainedFP8Config,
    )

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


# Closing think tag appended after ``<｜Assistant｜>`` in chat mode. It is spelled as two
# concatenated pieces on purpose: markup-stripping tooling has silently dropped the tag
# from this file before, which left the template without its "skip reasoning" marker.
# NOTE(review): confirm the exact tag against ``encoding/encoding_dsv4.py`` on the model repo.
_END_OF_THINKING = "</" + "think>"


def _v4_chat(prompt: str) -> str:
    """Wrap ``prompt`` in the V4-Flash chat-mode template.

    The canonical form lives in ``encoding/encoding_dsv4.py`` on the model repo. The
    closing think tag (``_END_OF_THINKING``) placed right after ``<｜Assistant｜>`` skips
    the reasoning block and goes straight to the answer. Pair with
    ``add_special_tokens=False`` when tokenizing — the literal BOS is already in the
    template.

    Args:
        prompt: The user message, inserted verbatim (no escaping).

    Returns:
        The templated string, ready to be tokenized.
    """
    return f"<｜begin▁of▁sentence｜><｜User｜>{prompt}<｜Assistant｜>{_END_OF_THINKING}"


class DeepseekV4ModelTester(CausalLMModelTester):
    """Tiny-config tester for DeepSeek-V4: overrides the parent's defaults and adds the V4-only knobs."""

    if is_torch_available():
        base_model_class = DeepseekV4Model

    def __init__(self, parent, **kwargs):
        super().__init__(parent, **kwargs)
        # Standard CausalLMModelTester knobs — override the parent's positional defaults.
        self.hidden_size = 64
        self.num_attention_heads = 4
        self.num_key_value_heads = 1
        self.num_hidden_layers = 2
        self.num_experts_per_tok = 2
        self.moe_intermediate_size = 64
        self.max_position_embeddings = 64

        # V4-only knobs.
        self.head_dim = 32
        self.partial_rotary_factor = 8 / 32  # qk_rope_head_dim=8 / head_dim=32
        self.q_lora_rank = 32
        self.o_groups = 2
        self.o_lora_rank = 16
        self.n_routed_experts = 4
        self.n_shared_experts = 1
        # All "moe" (no "hash_moe") so inputs_embeds-only generation tests in
        # CausalLMModelTest exercise the model without hitting the hash router's
        # input_ids requirement. A dedicated test covers the hash path.
        self.mlp_layer_types = ["moe", "moe"]
        self.layer_types = ["heavily_compressed_attention", "compressed_sparse_attention"]
        self.sliding_window = 8
        self.hc_mult = 2
        self.hc_sinkhorn_iters = 3
        self.hc_eps = 1.0e-6
        self.index_n_heads = 2
        self.index_head_dim = 16
        self.index_topk = 2
        self.num_nextn_predict_layers = 0
        self.scoring_func = "sqrtsoftplus"
        self.routed_scaling_factor = 1.5
        self.swiglu_limit = 10.0
        self.rope_theta = 10000.0
        self.compress_rope_theta = 160000.0
        self.attention_bias = False
        self.attention_dropout = 0.0


@require_torch
class DeepseekV4ModelTest(CausalLMModelTest, unittest.TestCase):
    """Common causal-LM test-suite for DeepSeek-V4, with the checks that assume a vanilla decoder relaxed or skipped."""

    model_tester_class = DeepseekV4ModelTester
    # Indexer parameters only influence the argmax over compressed positions (``topk``),
    # which is non-differentiable — their gradients flow through a separate objective in
    # the upstream training recipe, not the main causal-LM loss.
    test_all_params_have_gradient = False

    # No SequenceClassification / TokenClassification / QA heads on V4.
    def is_pipeline_test_to_skip(self, *args, **kwargs):
        """Skip every pipeline test unconditionally."""
        return True

    @unittest.skip(
        "V4's `DeepseekV4GroupedLinear` uses `torch.bmm` for the per-group matmul; "
        "torchao's Float8Tensor only fast-paths `F.linear` (bmm needs the optional `mslk` "
        "kernel) so the quantized-TP path fails. A custom V4 FP8 path will land later."
    )
    def test_tp_generation_quantized(self):
        pass

    def _check_attentions_for_generate(
        self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values
    ):
        """Check generate-time attentions: one tuple per new token, each layer tensor batched with the right head count."""
        # V4 layers with a Compressor attend to extra pooled positions, so the KV
        # length varies per layer. We only check the invariants that hold for every
        # layer — batch size and number of heads; the remaining axes are left unchecked.
        import torch  # noqa: PLC0415

        self.assertIsInstance(attentions, tuple)
        self.assertEqual(len(attentions), (output_length - prompt_length))
        for iter_attentions in attentions:
            self.assertIsInstance(iter_attentions, tuple)
            for layer_attention in iter_attentions:
                self.assertIsInstance(layer_attention, torch.Tensor)
                self.assertEqual(layer_attention.shape[0], batch_size)
                self.assertEqual(layer_attention.shape[1], config.num_attention_heads)

    @unittest.skip(
        "V4's rotary uses per-layer-type inv_freq buffers (Gemma3 pattern); the common test calls forward without `layer_type` and reads `.inv_freq`, neither of which apply."
    )
    def test_model_rope_scaling_frequencies(self):
        pass

    @parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
    @unittest.skip(
        "V4's rotary uses per-layer-type rope_parameters; the common test sets a flat dict and skips for multi-layer-type rotaries."
    )
    def test_model_rope_scaling_from_config(self, scaling_type):
        pass

    def test_hidden_states_output(self):
        """Check per-layer hidden-state shapes, accepting the extra ``hc_mult`` axis."""
        # V4 layers emit a 4D ``[B, S, hc_mult, hidden]`` tensor — the hc_mult streams
        # are only collapsed at the top of the model via ``hc_head``. The common
        # ``test_hidden_states_output`` assumes ``(batch, seq, hidden)``; we re-run the
        # same check but accept either the collapsed 3D shape or the per-layer 4D shape
        # for every entry.
        import torch  # noqa: PLC0415

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
        batch_size, seq_len = inputs_dict["input_ids"].shape[:2]
        for model_class in self.all_model_classes:
            model = model_class(config).to(torch_device).eval()
            with torch.no_grad():
                outputs = model(**inputs_dict)
            hidden_states = outputs.hidden_states if hasattr(outputs, "hidden_states") else outputs[-1]
            self.assertIsNotNone(hidden_states)
            self.assertEqual(len(hidden_states), config.num_hidden_layers + 1)
            for layer_h in hidden_states:
                # Accept either the collapsed (3D) post-head shape or the per-layer 4D shape.
                if layer_h.ndim == 3:
                    self.assertEqual(layer_h.shape, (batch_size, seq_len, config.hidden_size))
                else:
                    self.assertEqual(
                        layer_h.shape,
                        (batch_size, seq_len, config.hc_mult, config.hidden_size),
                    )

    def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_length, config):
        """Check cache tensors per layer on every axis except sequence length (``seq_length`` is unused)."""
        # Every V4 layer is sliding-window, so the cache is length-bounded to
        # ``sliding_window`` instead of the full ``seq_length`` the parent tester expects.
        # We also accept the compressed-segment positions that ``DeepseekV4Attention``
        # appends on compress layers (they live beyond the window on the keys axis).
        import torch  # noqa: PLC0415

        num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
        head_dim = config.head_dim
        for layer in past_key_values.layers:
            keys, values = layer.keys, layer.values
            self.assertIsInstance(keys, torch.Tensor)
            self.assertEqual(keys.shape[0], batch_size)
            self.assertEqual(keys.shape[1], num_kv_heads)
            self.assertEqual(keys.shape[3], head_dim)
            self.assertEqual(keys.shape, values.shape)

    @unittest.skip(
        reason=(
            "V4's conversion mapping is two-pass: a structural prefix rename "
            "(``layers.X.attn.`` → ``model.layers.X.self_attn.``) runs first, then specific in-prefix "
            "renames operate on the already-prefixed HF-form keys (``model.layers.X.self_attn.compressor.norm.`` "
            "→ ``...compressor.kv_norm.``). This split is load-bearing for save / load round-tripping — "
            "any single-pass ordering loses information in either direction (the general prefix rule "
            "and a specific in-prefix rule both want to match the same upstream key, and one of the "
            "two directions ends up with the general rule stealing the match). The base "
            "``test_reverse_loading_mapping`` checks every source pattern against the *upstream-form* "
            "serialized keys, so the Pass 2 patterns (written in HF form) inherently can't satisfy "
            "that invariant. The actual round-trip is exercised by ``test_save_load``."
        )
    )
    def test_reverse_loading_mapping(self):
        pass

    @unittest.skip(
        reason=(
            "V4's compressor pools windows of ``compress_rate`` consecutive tokens *before* the "
            "attention mask is applied — left-padding shifts the window boundaries so pad tokens "
            "get folded into the pooled KV entries, and the resulting logits diverge from the "
            "unpadded run by design (same fundamental limitation as RecurrentGemma)."
        )
    )
    def test_left_padding_compatibility(self):
        pass

    def _check_hidden_states_for_generate(
        self, batch_size, hidden_states, prompt_length, output_length, config, use_cache=False
    ):
        """Check generate-time hidden states on the batch and hidden axes only."""
        # V4's per-layer hidden states carry an extra ``hc_mult`` dim (Hyper-Connection
        # parallel streams). We skip the exact seq-length assertion the base tester does,
        # because assisted-decoding feeds arbitrary draft-token batches in, and just
        # sanity-check batch / hidden dims.
        import torch  # noqa: PLC0415

        self.assertIsInstance(hidden_states, tuple)
        self.assertEqual(len(hidden_states), (output_length - prompt_length))
        for iter_hidden_states in hidden_states:
            self.assertIsInstance(iter_hidden_states, tuple)
            for layer_hidden in iter_hidden_states:
                self.assertIsInstance(layer_hidden, torch.Tensor)
                self.assertEqual(layer_hidden.shape[0], batch_size)
                self.assertEqual(layer_hidden.shape[-1], config.hidden_size)


@require_torch
@require_torch_accelerator
@slow
class DeepseekV4IntegrationTest(unittest.TestCase):
    """End-to-end check on the published DeepSeek-V4-Flash checkpoint.

    Loads the real 43-layer FP8 weights, dequantizes on the fly via
    :class:`FineGrainedFP8Config`, and greedy-generates a continuation of a fixed prompt.
    These tests cover the forward path that the typical tiny-config tests cannot reach:
    the per-layer FP8 dequant in ``update_weight_conversions``, the
    ``compress_ratios → layer_types`` config translation (sliding / CSA / HCA), the
    ``coff=2`` overlap-window pooling on CSA layers and the indexer's inner pool, the
    per-head Q rescale in :class:`DeepseekV4Attention`, the YaRN-blended
    ``compress_rope_theta`` in the compressor, the trailing-rope partial-RoPE convention,
    and the cross-layer Hyper-Connection signal propagation. Any regression in those would
    tip generation back into a single-token collapse or pure think-tag output (the failure
    modes we hit while landing the architecture).

    Marked ``@slow`` because the checkpoint is ~700 GB on disk and only loadable on a
    multi-GPU host (``device_map="auto"`` plus FP8 dequant materializes the weights in
    bf16). Run manually with::

        RUN_SLOW=1 pytest tests/models/deepseek_v4/test_modeling_deepseek_v4.py::DeepseekV4IntegrationTest -k generation -s
    """

    model_id = "deepseek-ai/DeepSeek-V4-Flash"
    prompt = "Pipeline parallelism in ai is "

    def _load_model_and_tokenizer(self):
        """Load the checkpoint with on-the-fly FP8 dequantization and return ``(model, tokenizer)``."""
        quantization_config = FineGrainedFP8Config(dequantize=True)
        config = AutoConfig.from_pretrained(self.model_id)
        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            config=config,
            dtype="auto",
            device_map="auto",
            attn_implementation="eager",
            quantization_config=quantization_config,
        )
        return model, tokenizer

    def test_v4_flash_dequantized_generation(self):
        """Greedy-decode a raw (non-chat) prompt and compare against a fixed snapshot."""
        model, tokenizer = self._load_model_and_tokenizer()
        inputs = tokenizer(self.prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)

        # Snapshot of greedy-decoded text. The exact continuation is deterministic
        # under ``do_sample=False`` for a fixed prompt — if this snapshot drifts,
        # something in the V4 forward / RoPE / Q-rescale / HC stack changed.
        expected = (
            "Pipeline parallelism in ai is a technique where a model is split across multiple devices, "
            "with each device responsible for a subset of layers. This allows for training of larger "
            "models that cannot fit on a single device. However, it introduces idle time (bubbles) due to "
            "sequential dependencies between stages. Techniques like micro-batching and gradient "
            "accumulation are used"
        )
        decoded = tokenizer.decode(output_ids[0], skip_special_tokens=False)
        self.assertEqual(decoded, expected)

    def test_v4_flash_dequantized_chat_seven_prompts(self):
        """Chat-templated greedy generation across 7 prompts of varying length.

        Covers: short factual recall (1, 2), translation (3), code generation (4),
        out-of-distribution recall (5), open-ended creative writing (6), and a
        long-context summarisation (7, 234 input tokens — exercises the HCA path since
        input >> ``compress_rates['heavily_compressed_attention']`` = 128).

        Each completion is a fixed snapshot of the current greedy output. If any snapshot
        drifts, something changed in: per-layer-type RoPE selection (sliding ``main`` vs
        CSA / HCA ``compress``), the CSA / HCA per-query ``block_bias`` causal mask, the
        Hyper-Connection Sinkhorn projection or the residual mixing direction, or the fp32
        promotion in the MoE path.
        """
        long_prompt = (
            "Please read the following extended passage carefully and then provide a concise "
            "three-sentence summary that captures the main themes and the most important details. "
            'Be precise and avoid restating the wording; paraphrase. Passage: "It is a truth '
            "universally acknowledged, that a single man in possession of a good fortune, must be "
            "in want of a wife. However little known the feelings or views of such a man may be "
            "on his first entering a neighbourhood, this truth is so well fixed in the minds of "
            "the surrounding families, that he is considered the rightful property of some one or "
            "other of their daughters. 'My dear Mr. Bennet,' said his lady to him one day, 'have "
            "you heard that Netherfield Park is let at last?' Mr. Bennet replied that he had not. "
            "'But it is,' returned she; 'for Mrs. Long has just been here, and she told me all "
            "about it.' Mr. Bennet made no answer. 'Do not you want to know who has taken it?' "
            "cried his wife impatiently. 'You want to tell me, and I have no objection to hearing "
            "it.' This was invitation enough.\""
        )
        cases: list[tuple[str, str]] = [
            (
                "The capital of France is",
                "The capital of France is Paris.",
            ),
            (
                "List the first ten prime numbers:",
                "The first ten prime numbers are:\n\n2, 3, 5, 7, 11, 13, 17, 19, 23, 29",
            ),
            (
                "Translate to French: 'The quick brown fox jumps over the lazy dog.'",
                '"Le rapide renard brun saute par-dessus le chien paresseux."',
            ),
            (
                "Write a Python function fibonacci(n) that returns the nth Fibonacci number.",
                # NOTE(review): the single-space indentation inside this snapshot looks like
                # collapsed whitespace — confirm against a real run before relying on it.
                (
                    "Here's a Python function that returns the nth Fibonacci number:\n\n"
                    "## Method 1: Iterative (Most Efficient)\n\n"
                    '```python\ndef fibonacci(n):\n """\n Returns the nth Fibonacci number.\n \n'
                    " Args:\n n: Non-negative integer (0-indexed: fib(0)=0, fib(1)="
                ),
            ),
            (
                "What are the three properties of the UE8M0 scale factor format?",
                (
                    "Based on the standard naming convention for fixed-point data types, the **UE8M0** "
                    "format has the following three properties:\n\n"
                    "1. **Unsigned (U):** The value is an unsigned integer. It cannot represent negative "
                    "numbers.\n2. **8 Integer Bits (E8):** The integer part"
                ),
            ),
            (
                'Write a short story that begins with: "Once upon a time, in a forest far away, there lived a..."',
                (
                    "Once upon a time, in a forest far away, there lived a squirrel named Pip who could "
                    "not store nuts. While every other squirrel in the Great Wood spent the golden autumn "
                    "days frantically burying acorns and hazelnuts, Pip simply… forgot. He’d find a "
                    "perfect, glossy acorn, hold"
                ),
            ),
            (
                long_prompt,
                (
                    "The opening establishes a societal assumption that wealthy single men are naturally "
                    "seeking wives, making them prime targets for local families with eligible daughters. "
                    "Mrs. Bennet eagerly informs her indifferent husband that Netherfield Park has been "
                    "leased, hoping to spark his interest in the new, presumably wealthy, tenant. Their "
                    "exchange highlights the central theme"
                ),
            ),
        ]

        model, tokenizer = self._load_model_and_tokenizer()

        for i, (prompt, expected) in enumerate(cases, start=1):
            with self.subTest(prompt_index=i):
                inputs = tokenizer(_v4_chat(prompt), return_tensors="pt", add_special_tokens=False).to(model.device)
                with torch.no_grad():
                    output_ids = model.generate(
                        **inputs,
                        max_new_tokens=64,
                        do_sample=False,
                        pad_token_id=tokenizer.eos_token_id,
                    )
                # Decode only the newly generated tokens, not the echoed prompt.
                new_tokens = output_ids[0, inputs.input_ids.size(1) :]
                completion = tokenizer.decode(new_tokens, skip_special_tokens=True)
                self.assertEqual(completion, expected)


# `{loadtime_dispatch}` is forwarded to `from_pretrained(experts_implementation=…)`
# (set once at load time). `{runtime_dispatches}` is the tuple of impls the worker
# exercises sequentially via `set_experts_implementation` on that load. The remaining
# placeholders (`model_id`, `prompt`, `expected`, `add_special_tokens`) let the same
# worker drive multiple FP8 V4 variants — the expected string is asserted as a
# substring of the decoded output so it works for both base-completion and instruct
# models. Instruct prompts include the literal ``<｜begin▁of▁sentence｜>`` and pair with
# ``add_special_tokens=False``; base-completion prompts skip it and pair with True.
_DISTRIBUTED_WORKER_SCRIPT_TEMPLATE = """\
import os
import sys

import torch
import torch.distributed as dist

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.distributed import DistributedConfig
from transformers.utils.quantization_config import FineGrainedFP8Config


LOADTIME_DISPATCH = {loadtime_dispatch!r}
RUNTIME_DISPATCHES = {runtime_dispatches!r}
MODEL_ID = {model_id!r}
PROMPT = {prompt!r}
EXPECTED = {expected!r}
ADD_SPECIAL_TOKENS = {add_special_tokens!r}


def main() -> int:
    # `from_pretrained(distributed_config=…)` auto-sets `tp_plan="auto"`, which calls
    # `initialize_tensor_parallelism`, which calls `torch.cuda.set_device(local_rank)`
    # and `init_process_group` from the RANK/LOCAL_RANK/WORLD_SIZE env vars torchrun sets.
    rank = int(os.environ["RANK"])

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype="auto",
        attn_implementation="eager",
        experts_implementation=LOADTIME_DISPATCH,
        distributed_config=DistributedConfig(enable_expert_parallel=True),
        quantization_config=FineGrainedFP8Config(dequantize=False),
    )
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    inputs = tokenizer(PROMPT, return_tensors="pt", add_special_tokens=ADD_SPECIAL_TOKENS).to(model.device)

    failed = []
    for dispatch in RUNTIME_DISPATCHES:
        model.set_experts_implementation(dispatch)
        dist.barrier()
        with torch.no_grad():
            out = model.generate(
                **inputs, max_new_tokens=64, do_sample=False, pad_token_id=tokenizer.eos_token_id
            )
        dist.barrier()
        if rank == 0:
            decoded = tokenizer.decode(out[0], skip_special_tokens=True)
            print(f"[{{dispatch}}] decoded: {{decoded!r}}", flush=True)
            # Normalize internal whitespace so trivial extra spaces (e.g. from kernels
            # emitting an odd tokenization for a comma-separated list) don't fail an
            # otherwise-correct generation.
            if " ".join(EXPECTED.split()) not in " ".join(decoded.split()):
                failed.append(f"[{{dispatch}}] {{decoded!r}} does not contain {{EXPECTED!r}}")

    dist.barrier()
    dist.destroy_process_group()

    if rank == 0 and failed:
        print("FAILED:\\n" + "\\n".join(failed), flush=True)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""


def _run_distributed_worker(
    loadtime_dispatch,
    runtime_dispatches,
    model_id: str,
    prompt: str,
    expected: str,
    add_special_tokens: bool,
) -> int:
    """Render the worker script, launch it under ``torchrun`` on every visible GPU, and return its exit code.

    Args:
        loadtime_dispatch: Value substituted for ``experts_implementation`` in the worker's
            ``from_pretrained`` call (``None`` is allowed).
        runtime_dispatches: Iterable of experts implementations the worker switches to, in order.
        model_id: Checkpoint identifier the worker loads.
        prompt: Text the worker tokenizes and feeds to ``generate``.
        expected: Substring that must appear in the decoded output (compared after
            whitespace normalization inside the worker).
        add_special_tokens: Forwarded to the tokenizer call inside the worker.

    Returns:
        The ``torchrun`` process return code; ``0`` means every dispatch produced ``expected``.
    """
    script = _DISTRIBUTED_WORKER_SCRIPT_TEMPLATE.format(
        loadtime_dispatch=loadtime_dispatch,
        runtime_dispatches=tuple(runtime_dispatches),
        model_id=model_id,
        prompt=prompt,
        expected=expected,
        add_special_tokens=add_special_tokens,
    )
    num_gpus = torch.cuda.device_count()
    command = ["torchrun", f"--nproc_per_node={num_gpus}"]
    # Redirect only stdout (`:1`) for ranks 1..N-1 to suppress duplicated generation chatter.
    # Stderr is left attached so worker tracebacks (OOM, NCCL, kernel crash) surface in the
    # subprocess stderr and the test failure message — `:3` would file-log both and turn any
    # rank>0 crash into a bare non-zero return code with no diagnostic.
    redirects = ",".join(f"{r}:1" for r in range(1, num_gpus))
    if redirects:
        # With a single device there is no rank>0 to silence; omit the flag rather than
        # passing an empty `--redirects=` value.
        command.append(f"--redirects={redirects}")
    # The rendered script embeds the prompt via `repr`, which keeps printable non-ASCII
    # characters (e.g. the chat-template special tokens) as-is. Python reads source files
    # as UTF-8 by default, so write UTF-8 explicitly instead of the locale encoding.
    with tempfile.NamedTemporaryFile("w", suffix="_distributed_worker.py", encoding="utf-8") as f:
        f.write(script)
        f.flush()
        command.append(f.name)
        result = subprocess.run(command, check=False)
    return result.returncode


@require_torch
@require_torch_n_accelerators(8)
@require_torch_large_accelerator(memory=64)
@require_cuda_capability_at_least(10, 0)
@slow
class DeepseekV4FlashIntegrationTest(unittest.TestCase):
    """Multi-device native FP4 generation on DSv4-Flash, via `torchrun` + EP=8.

    - `test_v4_flash_fp4_generation`: one model load, loops eager → deepgemm.
    - `test_v4_flash_fp4_generation_megamoe`: separate load with
      `experts_implementation="deepgemm_megamoe"` (TP plan + weight layout are committed
      at load and can't be switched at runtime).

    No ``device_map="auto"`` test — FP4 weights require DeepGEMM (Triton has no FP4 path),
    and DeepGEMM doesn't tolerate single-process multi-GPU, so the only working
    configuration for Flash is the distributed EP=8 setup above.
    """

    model_id = "deepseek-ai/DeepSeek-V4-Flash"
    prompt = _v4_chat("List the first ten prime numbers:")
    expected_primes = "2, 3, 5, 7, 11, 13, 17, 19, 23, 29"

    def test_v4_flash_fp4_generation(self):
        """Single load, then generate with the ``eager`` and ``deepgemm`` experts implementations."""
        rc = _run_distributed_worker(
            loadtime_dispatch=None,
            runtime_dispatches=("eager", "deepgemm"),
            model_id=self.model_id,
            prompt=self.prompt,
            expected=self.expected_primes,
            add_special_tokens=False,
        )
        self.assertEqual(rc, 0, "torchrun worker failed; see stdout above")

    def test_v4_flash_fp4_generation_megamoe(self):
        """Load directly with ``deepgemm_megamoe`` and generate with that implementation only."""
        rc = _run_distributed_worker(
            loadtime_dispatch="deepgemm_megamoe",
            runtime_dispatches=("deepgemm_megamoe",),
            model_id=self.model_id,
            prompt=self.prompt,
            expected=self.expected_primes,
            add_special_tokens=False,
        )
        self.assertEqual(rc, 0, "torchrun worker failed; see stdout above")


@require_torch
@require_torch_n_accelerators(8)
@require_torch_large_accelerator(memory=64)
@require_cuda_capability_at_least(9, 0)
@slow
class DeepseekV4FlashBaseIntegrationTest(unittest.TestCase):
    """Multi-device native FP8 generation on DSv4-Flash-Base.

    Mirrors :class:`DeepseekV4FlashIntegrationTest` (FP4 mixed) but for the base
    completion variant.

    - `test_v4_flash_base_fp8_generation`: EP=8 via `torchrun`, exercises all three
      experts impls (``eager``, ``grouped_mm``, ``deepgemm``) — distributed gives every
      impl a working configuration since each rank drives one device.
    - `test_v4_flash_base_fp8_generation_device_map_auto`: single-process multi-GPU via
      ``device_map="auto"``. The ``deepgemm`` experts impl is excluded because DeepGEMM
      kernels race in this regime (see the comment in that test).
    """

    model_id = "deepseek-ai/DeepSeek-V4-Flash-Base"
    prompt = "Here is the list of the first ten prime numbers, separated by commas:"
    expected_primes = "2, 3, 5, 7, 11, 13, 17, 19, 23, 29"

    def test_v4_flash_base_fp8_generation(self):
        """Distributed run covering the ``eager``, ``grouped_mm`` and ``deepgemm`` experts implementations."""
        rc = _run_distributed_worker(
            loadtime_dispatch=None,
            runtime_dispatches=("eager", "grouped_mm", "deepgemm"),
            model_id=self.model_id,
            prompt=self.prompt,
            expected=self.expected_primes,
            add_special_tokens=True,
        )
        self.assertEqual(rc, 0, "torchrun worker failed; see stdout above")

    def test_v4_flash_base_fp8_generation_device_map_auto(self):
        """Single-process ``device_map="auto"`` run covering the ``eager`` and ``grouped_mm`` implementations."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            dtype="auto",
            device_map="auto",
            attn_implementation="eager",
            quantization_config=FineGrainedFP8Config(dequantize=False),
        )
        inputs = tokenizer(self.prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs.input_ids.size(1)
        # Whitespace-normalized once; the same normalization is applied to each completion.
        expected = " ".join(self.expected_primes.split())

        # `deepgemm` experts impl is excluded — DeepGEMM kernels race in single-process
        # multi-GPU runs (which `device_map="auto"` always is for a model this size).
        for impl in ("eager", "grouped_mm"):
            # One sub-test per impl so a failure in the first does not hide the second.
            with self.subTest(impl=impl):
                model.set_experts_implementation(impl)
                with torch.no_grad():
                    out = model.generate(
                        **inputs, max_new_tokens=64, do_sample=False, pad_token_id=tokenizer.eos_token_id
                    )
                completion = tokenizer.decode(out[0, prompt_len:], skip_special_tokens=True)
                self.assertIn(
                    expected,
                    " ".join(completion.split()),
                    f"[{impl}] {completion!r}",
                )