first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/deepseek_v4/test_modeling_deepseek_v4.py
+++ b/tests/models/deepseek_v4/test_modeling_deepseek_v4.py
@@ -0,0 +1,618 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import subprocess
+import tempfile
+import unittest
+
+from parameterized import parameterized
+
+from transformers import is_torch_available
+from transformers.testing_utils import (
+    require_cuda_capability_at_least,
+    require_torch,
+    require_torch_accelerator,
+    require_torch_large_accelerator,
+    require_torch_n_accelerators,
+    slow,
+    torch_device,
+)
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        AutoConfig,
+        AutoModelForCausalLM,
+        AutoTokenizer,
+        DeepseekV4Model,
+        FineGrainedFP8Config,
+    )
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+
+
+def _v4_chat(prompt: str) -> str:
+    """V4-Flash chat-mode template (canonical form in ``encoding/encoding_dsv4.py``
+    on the model repo). ``</think>`` after ``<｜Assistant｜>`` skips the reasoning
+    block and goes straight to the answer. Pair with ``add_special_tokens=False``
+    when tokenizing — the literal BOS is already in the template."""
+    return f"<｜begin▁of▁sentence｜><｜User｜>{prompt}<｜Assistant｜></think>"
+
+
+class DeepseekV4ModelTester(CausalLMModelTester):
+    if is_torch_available():
+        base_model_class = DeepseekV4Model
+
+    def __init__(self, parent, **kwargs):
+        super().__init__(parent, **kwargs)
+        # Standard CausalLMModelTester knobs — override the parent's positional defaults.
+        self.hidden_size = 64
+        self.num_attention_heads = 4
+        self.num_key_value_heads = 1
+        self.num_hidden_layers = 2
+        self.num_experts_per_tok = 2
+        self.moe_intermediate_size = 64
+        self.max_position_embeddings = 64
+        # V4-only knobs.
+        self.head_dim = 32
+        self.partial_rotary_factor = 8 / 32  # qk_rope_head_dim=8 / head_dim=32
+        self.q_lora_rank = 32
+        self.o_groups = 2
+        self.o_lora_rank = 16
+        self.n_routed_experts = 4
+        self.n_shared_experts = 1
+        # All "moe" (no "hash_moe") so inputs_embeds-only generation tests in
+        # CausalLMModelTest exercise the model without hitting the hash router's
+        # input_ids requirement. A dedicated test covers the hash path.
+        self.mlp_layer_types = ["moe", "moe"]
+        self.layer_types = ["heavily_compressed_attention", "compressed_sparse_attention"]
+        self.sliding_window = 8
+        self.hc_mult = 2
+        self.hc_sinkhorn_iters = 3
+        self.hc_eps = 1.0e-6
+        self.index_n_heads = 2
+        self.index_head_dim = 16
+        self.index_topk = 2
+        self.num_nextn_predict_layers = 0
+        self.scoring_func = "sqrtsoftplus"
+        self.routed_scaling_factor = 1.5
+        self.swiglu_limit = 10.0
+        self.rope_theta = 10000.0
+        self.compress_rope_theta = 160000.0
+        self.attention_bias = False
+        self.attention_dropout = 0.0
+
+
+@require_torch
+class DeepseekV4ModelTest(CausalLMModelTest, unittest.TestCase):
+    model_tester_class = DeepseekV4ModelTester
+
+    # Indexer parameters only influence the argmax over compressed positions (``topk``),
+    # which is non-differentiable — their gradients flow through a separate objective in
+    # the upstream training recipe, not the main causal-LM loss.
+    test_all_params_have_gradient = False
+
+    # No SequenceClassification / TokenClassification / QA heads on V4.
+    def is_pipeline_test_to_skip(self, *args, **kwargs):
+        return True
+
+    @unittest.skip(
+        "V4's `DeepseekV4GroupedLinear` uses `torch.bmm` for the per-group matmul; "
+        "torchao's Float8Tensor only fast-paths `F.linear` (bmm needs the optional `mslk` "
+        "kernel) so the quantized-TP path fails. A custom V4 FP8 path will land later."
+    )
+    def test_tp_generation_quantized(self):
+        pass
+
+    def _check_attentions_for_generate(
+        self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values
+    ):
+        # V4 layers with a Compressor attend to extra pooled positions, so the KV
+        # length varies per layer. We only check the shape invariants: batched, same
+        # number-of-heads and query-length; the KV-length axis may differ across layers.
+        import torch  # noqa: PLC0415
+
+        self.assertIsInstance(attentions, tuple)
+        self.assertEqual(len(attentions), (output_length - prompt_length))
+        for _, iter_attentions in enumerate(attentions):
+            self.assertIsInstance(iter_attentions, tuple)
+            for layer_attention in iter_attentions:
+                self.assertIsInstance(layer_attention, torch.Tensor)
+                self.assertEqual(layer_attention.shape[0], batch_size)
+                self.assertEqual(layer_attention.shape[1], config.num_attention_heads)
+
+    @unittest.skip(
+        "V4's rotary uses per-layer-type inv_freq buffers (Gemma3 pattern); the common test calls forward without `layer_type` and reads `.inv_freq`, neither of which apply."
+    )
+    def test_model_rope_scaling_frequencies(self):
+        pass
+
+    @parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
+    @unittest.skip(
+        "V4's rotary uses per-layer-type rope_parameters; the common test sets a flat dict and skips for multi-layer-type rotaries."
+    )
+    def test_model_rope_scaling_from_config(self, scaling_type):
+        pass
+
+    def test_hidden_states_output(self):
+        # V4 layers emit a 4D ``[B, S, hc_mult, hidden]`` tensor — the hc_mult streams
+        # are only collapsed at the top of the model via ``hc_head``. The common
+        # ``test_hidden_states_output`` assumes ``(batch, seq, hidden)``; we re-run the
+        # same check but accept the extra HC axis, and we additionally assert the final
+        # (post-hc_head) ``last_hidden_state`` has the standard 3D shape.
+        import torch  # noqa: PLC0415
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device).eval()
+            with torch.no_grad():
+                outputs = model(**inputs_dict)
+            hidden_states = outputs.hidden_states if hasattr(outputs, "hidden_states") else outputs[-1]
+            self.assertIsNotNone(hidden_states)
+            self.assertEqual(len(hidden_states), config.num_hidden_layers + 1)
+            seq_len = inputs_dict["input_ids"].shape[1]
+            for layer_h in hidden_states:
+                # Accept either the collapsed (3D) post-head shape or the per-layer 4D shape.
+                if layer_h.ndim == 3:
+                    self.assertEqual(layer_h.shape, (inputs_dict["input_ids"].shape[0], seq_len, config.hidden_size))
+                else:
+                    self.assertEqual(
+                        layer_h.shape,
+                        (inputs_dict["input_ids"].shape[0], seq_len, config.hc_mult, config.hidden_size),
+                    )
+
+    def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_length, config):
+        # Every V4 layer is sliding-window, so the cache is length-bounded to
+        # ``sliding_window`` instead of the full ``seq_length`` the parent tester expects.
+        # We also accept the compressed-segment positions that ``DeepseekV4Attention``
+        # appends on compress layers (they live beyond the window on the keys axis).
+        import torch  # noqa: PLC0415
+
+        num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
+        head_dim = config.head_dim
+        for layer in past_key_values.layers:
+            keys, values = layer.keys, layer.values
+            self.assertIsInstance(keys, torch.Tensor)
+            self.assertEqual(keys.shape[0], batch_size)
+            self.assertEqual(keys.shape[1], num_kv_heads)
+            self.assertEqual(keys.shape[3], head_dim)
+            self.assertEqual(keys.shape, values.shape)
+
+    @unittest.skip(
+        reason=(
+            "V4's conversion mapping is two-pass: a structural prefix rename "
+            "(``layers.X.attn.`` → ``model.layers.X.self_attn.``) runs first, then specific in-prefix "
+            "renames operate on the already-prefixed HF-form keys (``model.layers.X.self_attn.compressor.norm.`` "
+            "→ ``...compressor.kv_norm.``). This split is load-bearing for save / load round-tripping — "
+            "any single-pass ordering loses information in either direction (the general prefix rule "
+            "and a specific in-prefix rule both want to match the same upstream key, and one of the "
+            "two directions ends up with the general rule stealing the match). The base "
+            "``test_reverse_loading_mapping`` checks every source pattern against the *upstream-form* "
+            "serialized keys, so the Pass 2 patterns (written in HF form) inherently can't satisfy "
+            "that invariant. The actual round-trip is exercised by ``test_save_load``."
+        )
+    )
+    def test_reverse_loading_mapping(self):
+        pass
+
+    @unittest.skip(
+        reason=(
+            "V4's compressor pools windows of ``compress_rate`` consecutive tokens *before* the "
+            "attention mask is applied — left-padding shifts the window boundaries so pad tokens "
+            "get folded into the pooled KV entries, and the resulting logits diverge from the "
+            "unpadded run by design (same fundamental limitation as RecurrentGemma)."
+        )
+    )
+    def test_left_padding_compatibility(self):
+        pass
+
+    def _check_hidden_states_for_generate(
+        self, batch_size, hidden_states, prompt_length, output_length, config, use_cache=False
+    ):
+        # V4's per-layer hidden states carry an extra ``hc_mult`` dim (Hyper-Connection
+        # parallel streams). We skip the exact seq-length assertion the base tester does,
+        # because assisted-decoding feeds arbitrary draft-token batches in, and just
+        # sanity-check batch / hidden dims.
+        import torch  # noqa: PLC0415
+
+        self.assertIsInstance(hidden_states, tuple)
+        self.assertEqual(len(hidden_states), (output_length - prompt_length))
+        for iter_hidden_states in hidden_states:
+            self.assertIsInstance(iter_hidden_states, tuple)
+            for layer_hidden in iter_hidden_states:
+                self.assertIsInstance(layer_hidden, torch.Tensor)
+                self.assertEqual(layer_hidden.shape[0], batch_size)
+                self.assertEqual(layer_hidden.shape[-1], config.hidden_size)
+
+
+@require_torch
+@require_torch_accelerator
+@slow
+class DeepseekV4IntegrationTest(unittest.TestCase):
+    """End-to-end check on the published DeepSeek-V4-Flash checkpoint.
+
+    Loads the real 43-layer FP8 weights, dequantizes on the fly via
+    :class:`FineGrainedFP8Config`, and greedy-generates a continuation of a fixed
+    prompt. The forward path that this test covers is everything past the typical
+    tiny-config tests can reach: the per-layer FP8 dequant in
+    ``update_weight_conversions``, the ``compress_ratios → layer_types`` config
+    translation (sliding / CSA / HCA), the ``coff=2`` overlap-window pooling on CSA
+    layers and the indexer's inner pool, the per-head Q rescale in
+    :class:`DeepseekV4Attention`, the YaRN-blended ``compress_rope_theta`` in the
+    compressor, the trailing-rope partial-RoPE convention, and the cross-layer
+    Hyper-Connection signal propagation. Any regression in those would tip
+    generation back into a single-token collapse or pure ``<EOS>`` output (the
+    failure modes we hit while landing the architecture).
+
+    Marked ``@slow`` because the checkpoint is ~700 GB on disk and only loadable
+    on a multi-GPU host (``device_map="auto"`` plus FP8 dequant materializes the
+    weights in bf16). Run manually with::
+
+        RUN_SLOW=1 pytest tests/models/deepseek_v4/test_modeling_deepseek_v4.py::DeepseekV4IntegrationTest -k generation -s
+    """
+
+    model_id = "deepseek-ai/DeepSeek-V4-Flash"
+    prompt = "Pipeline parallelism in ai is "
+
+    def test_v4_flash_dequantized_generation(self):
+        quantization_config = FineGrainedFP8Config(dequantize=True)
+        config = AutoConfig.from_pretrained(self.model_id)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            config=config,
+            dtype="auto",
+            device_map="auto",
+            attn_implementation="eager",
+            quantization_config=quantization_config,
+        )
+
+        inputs = tokenizer(self.prompt, return_tensors="pt").to(model.device)
+        with torch.no_grad():
+            output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
+
+        # Snapshot of greedy-decoded text. The exact continuation is deterministic
+        # under ``do_sample=False`` for a fixed prompt — if this snapshot drifts,
+        # something in the V4 forward / RoPE / Q-rescale / HC stack changed.
+        expected = (
+            "Pipeline parallelism in ai is  a technique where a model is split across multiple devices, "
+            "with each device responsible for a subset of layers. This allows for training of larger "
+            "models that cannot fit on a single device. However, it introduces idle time (bubbles) due to "
+            "sequential dependencies between stages. Techniques like micro-batching and gradient "
+            "accumulation are used"
+        )
+        decoded = tokenizer.decode(output_ids[0], skip_special_tokens=False)
+        self.assertEqual(decoded, expected)
+
+    def test_v4_flash_dequantized_chat_seven_prompts(self):
+        """Chat-templated greedy generation across 7 prompts of varying length.
+
+        Covers: short factual recall (1, 2), translation (3), code generation (4),
+        out-of-distribution recall (5), open-ended creative writing (6), and a
+        long-context summarisation (7, 234 input tokens — exercises the HCA path
+        since input >> ``compress_rates['heavily_compressed_attention']`` = 128).
+        Each completion is a fixed snapshot of the current greedy output. If any
+        snapshot drifts, something changed in: per-layer-type RoPE selection
+        (sliding ``main`` vs CSA / HCA ``compress``), the CSA / HCA per-query
+        ``block_bias`` causal mask, the Hyper-Connection Sinkhorn projection or
+        the residual mixing direction, or the fp32 promotion in the MoE path.
+        """
+
+        long_prompt = (
+            "Please read the following extended passage carefully and then provide a concise "
+            "three-sentence summary that captures the main themes and the most important details. "
+            'Be precise and avoid restating the wording; paraphrase. Passage: "It is a truth '
+            "universally acknowledged, that a single man in possession of a good fortune, must be "
+            "in want of a wife. However little known the feelings or views of such a man may be "
+            "on his first entering a neighbourhood, this truth is so well fixed in the minds of "
+            "the surrounding families, that he is considered the rightful property of some one or "
+            "other of their daughters. 'My dear Mr. Bennet,' said his lady to him one day, 'have "
+            "you heard that Netherfield Park is let at last?' Mr. Bennet replied that he had not. "
+            "'But it is,' returned she; 'for Mrs. Long has just been here, and she told me all "
+            "about it.' Mr. Bennet made no answer. 'Do not you want to know who has taken it?' "
+            "cried his wife impatiently. 'You want to tell me, and I have no objection to hearing "
+            "it.' This was invitation enough.\""
+        )
+
+        cases: list[tuple[str, str]] = [
+            (
+                "The capital of France is",
+                "The capital of France is Paris.",
+            ),
+            (
+                "List the first ten prime numbers:",
+                "The first ten prime numbers are:\n\n2, 3, 5, 7, 11, 13, 17, 19, 23, 29",
+            ),
+            (
+                "Translate to French: 'The quick brown fox jumps over the lazy dog.'",
+                '"Le rapide renard brun saute par-dessus le chien paresseux."',
+            ),
+            (
+                "Write a Python function fibonacci(n) that returns the nth Fibonacci number.",
+                (
+                    "Here's a Python function that returns the nth Fibonacci number:\n\n"
+                    "## Method 1: Iterative (Most Efficient)\n\n"
+                    '```python\ndef fibonacci(n):\n    """\n    Returns the nth Fibonacci number.\n    \n'
+                    "    Args:\n        n: Non-negative integer (0-indexed: fib(0)=0, fib(1)="
+                ),
+            ),
+            (
+                "What are the three properties of the UE8M0 scale factor format?",
+                (
+                    "Based on the standard naming convention for fixed-point data types, the **UE8M0** "
+                    "format has the following three properties:\n\n"
+                    "1.  **Unsigned (U):** The value is an unsigned integer. It cannot represent negative "
+                    "numbers.\n2.  **8 Integer Bits (E8):** The integer part"
+                ),
+            ),
+            (
+                'Write a short story that begins with: "Once upon a time, in a forest far away, there lived a..."',
+                (
+                    "Once upon a time, in a forest far away, there lived a squirrel named Pip who could "
+                    "not store nuts. While every other squirrel in the Great Wood spent the golden autumn "
+                    "days frantically burying acorns and hazelnuts, Pip simply… forgot. He’d find a "
+                    "perfect, glossy acorn, hold"
+                ),
+            ),
+            (
+                long_prompt,
+                (
+                    "The opening establishes a societal assumption that wealthy single men are naturally "
+                    "seeking wives, making them prime targets for local families with eligible daughters. "
+                    "Mrs. Bennet eagerly informs her indifferent husband that Netherfield Park has been "
+                    "leased, hoping to spark his interest in the new, presumably wealthy, tenant. Their "
+                    "exchange highlights the central theme"
+                ),
+            ),
+        ]
+
+        quantization_config = FineGrainedFP8Config(dequantize=True)
+        config = AutoConfig.from_pretrained(self.model_id)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            config=config,
+            dtype="auto",
+            device_map="auto",
+            attn_implementation="eager",
+            quantization_config=quantization_config,
+        )
+
+        for i, (prompt, expected) in enumerate(cases, start=1):
+            with self.subTest(prompt_index=i):
+                inputs = tokenizer(_v4_chat(prompt), return_tensors="pt", add_special_tokens=False).to(model.device)
+                with torch.no_grad():
+                    output_ids = model.generate(
+                        **inputs,
+                        max_new_tokens=64,
+                        do_sample=False,
+                        pad_token_id=tokenizer.eos_token_id,
+                    )
+                new_tokens = output_ids[0, inputs.input_ids.size(1) :]
+                completion = tokenizer.decode(new_tokens, skip_special_tokens=True)
+                self.assertEqual(completion, expected)
+
+
+# `{loadtime_dispatch}` is forwarded to `from_pretrained(experts_implementation=…)`
+# (set once at load time). `{runtime_dispatches}` is the tuple of impls the worker
+# exercises sequentially via `set_experts_implementation` on that load. The remaining
+# placeholders (`model_id`, `prompt`, `expected`, `add_special_tokens`) let the same
+# worker drive multiple FP8 V4 variants — the expected string is asserted as a
+# substring of the decoded output so it works for both base-completion and instruct
+# models. Instruct prompts include the literal ``<｜begin▁of▁sentence｜>`` and pair with
+# ``add_special_tokens=False``; base-completion prompts skip it and pair with True.
+_DISTRIBUTED_WORKER_SCRIPT_TEMPLATE = """\
+import os
+import sys
+
+import torch
+import torch.distributed as dist
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.distributed import DistributedConfig
+from transformers.utils.quantization_config import FineGrainedFP8Config
+
+
+LOADTIME_DISPATCH = {loadtime_dispatch!r}
+RUNTIME_DISPATCHES = {runtime_dispatches!r}
+MODEL_ID = {model_id!r}
+PROMPT = {prompt!r}
+EXPECTED = {expected!r}
+ADD_SPECIAL_TOKENS = {add_special_tokens!r}
+
+
+def main() -> int:
+    # `from_pretrained(distributed_config=…)` auto-sets `tp_plan="auto"`, which calls
+    # `initialize_tensor_parallelism`, which calls `torch.cuda.set_device(local_rank)`
+    # and `init_process_group` from the RANK/LOCAL_RANK/WORLD_SIZE env vars torchrun sets.
+    rank = int(os.environ["RANK"])
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        dtype="auto",
+        attn_implementation="eager",
+        experts_implementation=LOADTIME_DISPATCH,
+        distributed_config=DistributedConfig(enable_expert_parallel=True),
+        quantization_config=FineGrainedFP8Config(dequantize=False),
+    )
+    model.eval()
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    inputs = tokenizer(PROMPT, return_tensors="pt", add_special_tokens=ADD_SPECIAL_TOKENS).to(model.device)
+
+    failed = []
+    for dispatch in RUNTIME_DISPATCHES:
+        model.set_experts_implementation(dispatch)
+        dist.barrier()
+        with torch.no_grad():
+            out = model.generate(
+                **inputs, max_new_tokens=64, do_sample=False, pad_token_id=tokenizer.eos_token_id
+            )
+        dist.barrier()
+        if rank == 0:
+            decoded = tokenizer.decode(out[0], skip_special_tokens=True)
+            print(f"[{{dispatch}}] decoded: {{decoded!r}}", flush=True)
+            # Normalize internal whitespace so trivial extra spaces (e.g. from kernels
+            # emitting an odd tokenization for a comma-separated list) don't fail an
+            # otherwise-correct generation.
+            if " ".join(EXPECTED.split()) not in " ".join(decoded.split()):
+                failed.append(f"[{{dispatch}}] {{decoded!r}} does not contain {{EXPECTED!r}}")
+
+    dist.barrier()
+    dist.destroy_process_group()
+    if rank == 0 and failed:
+        print("FAILED:\\n" + "\\n".join(failed), flush=True)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+"""
+
+
+def _run_distributed_worker(
+    loadtime_dispatch,
+    runtime_dispatches,
+    model_id: str,
+    prompt: str,
+    expected: str,
+    add_special_tokens: bool,
+) -> int:
+    script = _DISTRIBUTED_WORKER_SCRIPT_TEMPLATE.format(
+        loadtime_dispatch=loadtime_dispatch,
+        runtime_dispatches=tuple(runtime_dispatches),
+        model_id=model_id,
+        prompt=prompt,
+        expected=expected,
+        add_special_tokens=add_special_tokens,
+    )
+    num_gpus = torch.cuda.device_count()
+    # Redirect only stdout (`:1`) for ranks 1..N-1 to suppress duplicated generation chatter.
+    # Stderr is left attached so worker tracebacks (OOM, NCCL, kernel crash) surface in the
+    # subprocess stderr and the test failure message — `:3` would file-log both and turn any
+    # rank>0 crash into a bare non-zero return code with no diagnostic.
+    redirects = ",".join(f"{r}:1" for r in range(1, num_gpus))
+    with tempfile.NamedTemporaryFile("w", suffix="_distributed_worker.py") as f:
+        f.write(script)
+        f.flush()
+        result = subprocess.run(
+            ["torchrun", f"--nproc_per_node={num_gpus}", f"--redirects={redirects}", f.name],
+            check=False,
+        )
+    return result.returncode
+
+
+@require_torch
+@require_torch_n_accelerators(8)
+@require_torch_large_accelerator(memory=64)
+@require_cuda_capability_at_least(10, 0)
+@slow
+class DeepseekV4FlashIntegrationTest(unittest.TestCase):
+    """Multi-device native FP4 generation on DSv4-Flash, via `torchrun` + EP=8.
+
+    - `test_v4_flash_fp4_generation`: one model load, loops eager → deepgemm.
+    - `test_v4_flash_fp4_generation_megamoe`: separate load with
+      `experts_implementation="deepgemm_megamoe"` (TP plan + weight layout are
+      committed at load and can't be switched at runtime).
+
+    No ``device_map="auto"`` test — FP4 weights require DeepGEMM (Triton has no FP4
+    path), and DeepGEMM doesn't tolerate single-process multi-GPU, so the only
+    working configuration for Flash is the distributed EP=8 setup above.
+    """
+
+    model_id = "deepseek-ai/DeepSeek-V4-Flash"
+    prompt = _v4_chat("List the first ten prime numbers:")
+    expected_primes = "2, 3, 5, 7, 11, 13, 17, 19, 23, 29"
+
+    def test_v4_flash_fp4_generation(self):
+        rc = _run_distributed_worker(
+            loadtime_dispatch=None,
+            runtime_dispatches=("eager", "deepgemm"),
+            model_id=self.model_id,
+            prompt=self.prompt,
+            expected=self.expected_primes,
+            add_special_tokens=False,
+        )
+        self.assertEqual(rc, 0, "torchrun worker failed; see stdout above")
+
+    def test_v4_flash_fp4_generation_megamoe(self):
+        rc = _run_distributed_worker(
+            loadtime_dispatch="deepgemm_megamoe",
+            runtime_dispatches=("deepgemm_megamoe",),
+            model_id=self.model_id,
+            prompt=self.prompt,
+            expected=self.expected_primes,
+            add_special_tokens=False,
+        )
+        self.assertEqual(rc, 0, "torchrun worker failed; see stdout above")
+
+
+@require_torch
+@require_torch_n_accelerators(8)
+@require_torch_large_accelerator(memory=64)
+@require_cuda_capability_at_least(9, 0)
+@slow
+class DeepseekV4FlashBaseIntegrationTest(unittest.TestCase):
+    """Multi-device native FP8 generation on DSv4-Flash-Base.
+
+    Mirrors :class:`DeepseekV4FlashIntegrationTest` (FP4 mixed) but for the base
+    completion variant.
+
+      - `test_v4_flash_base_fp8_generation`: EP=8 via `torchrun`, exercises all
+        three experts impls (``eager``, ``grouped_mm``, ``deepgemm``) — distributed
+        gives every impl a working configuration since each rank drives one device.
+      - `test_v4_flash_base_fp8_generation_device_map_auto`: single-process multi-GPU via
+        ``device_map="auto"``. The ``deepgemm`` experts impl is excluded because
+        DeepGEMM kernels race in this regime (see :func:`_assert_single_device`).
+    """
+
+    model_id = "deepseek-ai/DeepSeek-V4-Flash-Base"
+    prompt = "Here is the list of the first ten prime numbers, separated by commas:"
+    expected_primes = "2, 3, 5, 7, 11, 13, 17, 19, 23, 29"
+
+    def test_v4_flash_base_fp8_generation(self):
+        rc = _run_distributed_worker(
+            loadtime_dispatch=None,
+            runtime_dispatches=("eager", "grouped_mm", "deepgemm"),
+            model_id=self.model_id,
+            prompt=self.prompt,
+            expected=self.expected_primes,
+            add_special_tokens=True,
+        )
+        self.assertEqual(rc, 0, "torchrun worker failed; see stdout above")
+
+    def test_v4_flash_base_fp8_generation_device_map_auto(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            dtype="auto",
+            device_map="auto",
+            attn_implementation="eager",
+            quantization_config=FineGrainedFP8Config(dequantize=False),
+        )
+        inputs = tokenizer(self.prompt, return_tensors="pt").to(model.device)
+        prompt_len = inputs.input_ids.size(1)
+        # `deepgemm` experts impl is excluded — DeepGEMM kernels race in single-process
+        # multi-GPU runs (which `device_map="auto"` always is for a model this size).
+        for impl in ("eager", "grouped_mm"):
+            model.set_experts_implementation(impl)
+            with torch.no_grad():
+                out = model.generate(**inputs, max_new_tokens=64, do_sample=False, pad_token_id=tokenizer.eos_token_id)
+            completion = tokenizer.decode(out[0, prompt_len:], skip_special_tokens=True)
+            self.assertIn(
+                " ".join(self.expected_primes.split()),
+                " ".join(completion.split()),
+                f"[{impl}] {completion!r}",
+            )