first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/gpt_oss/test_modeling_gpt_oss.py
+++ b/tests/models/gpt_oss/test_modeling_gpt_oss.py
@@ -0,0 +1,633 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch GptOss model."""
+
+import difflib
+import inspect
+import json
+import os
+import subprocess
+import tempfile
+import unittest
+from pathlib import Path
+
+import pytest
+from parameterized import parameterized
+
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    is_torch_available,
+)
+from transformers.testing_utils import (
+    cleanup,
+    require_deterministic_for_xpu,
+    require_kernels,
+    require_torch,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+
+
+if is_torch_available():  # torch, GptOssModel and NUM_GPUS exist only when torch is installed
+    import torch
+
+    from transformers import (
+        GptOssModel,
+    )
+
+    if torch.cuda.is_available():  # NUM_GPUS = number of visible accelerators, CUDA checked before XPU
+        NUM_GPUS = torch.cuda.device_count()
+    elif hasattr(torch, "xpu") and torch.xpu.is_available():  # older torch builds may lack `torch.xpu`
+        NUM_GPUS = torch.xpu.device_count()
+    else:
+        NUM_GPUS = 0  # neither CUDA nor XPU is available
+
+
+class GptOssModelTester(CausalLMModelTester):  # only sets `base_model_class`; the rest is inherited
+    if is_torch_available():  # GptOssModel is imported only when torch is available
+        base_model_class = GptOssModel
+
+
+@require_torch
+class GptOssModelTest(CausalLMModelTest, unittest.TestCase):
+    """Shared causal-LM test suite applied to GptOss, plus GptOss-specific tests, skips and overrides."""
+
+    _is_stateful = True
+    model_split_percents = [0.5, 0.6]
+    model_tester_class = GptOssModelTester
+
+    @require_kernels
+    def test_kernelize_does_not_crash(self):
+        """Regression test #45799: enabling `use_kernels` on a freshly built model must not raise."""
+        tiny_config, _unused_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        tiny_model = GptOssModel(tiny_config).to(device=torch_device)
+        # The assignment is the test: it used to raise TypeError (apply_rotary_pos_emb was not an nn.Module)
+        tiny_model.use_kernels = True
+
+    @require_kernels
+    @pytest.mark.flash_attn_test
+    @require_torch_gpu
+    def test_default_flash_implementation_auto_correction(self):
+        """
+        Requesting "flash_attention_2" must end up as the flash-attn3 hub kernel at init time, at load
+        time and through `set_attn_implementation`; the corrected model must still run a forward pass.
+        """
+        from kernels import get_kernel
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        hub_kernel = "kernels-community/vllm-flash-attn3"
+        if get_kernel(hub_kernel) is None:
+            self.skipTest(f"{hub_kernel} is not available, skipping auto-correction test.")
+
+        # 1) The request is carried by the config when the model is constructed
+        config._attn_implementation = "flash_attention_2"
+        init_model = GptOssModel(config).to(device=torch_device, dtype=torch.bfloat16)
+        self.assertEqual(init_model.config._attn_implementation, hub_kernel)
+
+        # 2) The request is passed to `from_pretrained`
+        with tempfile.TemporaryDirectory() as checkpoint_dir:
+            init_model.save_pretrained(checkpoint_dir)
+            reloaded = GptOssModel.from_pretrained(checkpoint_dir, attn_implementation="flash_attention_2")
+            reloaded = reloaded.to(device=torch_device)
+            self.assertEqual(reloaded.config._attn_implementation, hub_kernel)
+
+        # 3) The request is made on a live model: "eager" is kept as is, flash attention is corrected
+        for requested, resolved in (("eager", "eager"), ("flash_attention_2", hub_kernel)):
+            reloaded.set_attn_implementation(requested)
+            self.assertEqual(reloaded.config._attn_implementation, resolved)
+
+        # The corrected model must still be usable
+        with torch.no_grad():
+            self.assertIsNotNone(reloaded(**inputs_dict))
+
+    @unittest.skip("GptOss's forcefully disables sdpa due to Sink")
+    def test_sdpa_can_dispatch_non_composite_models(self):
+        pass
+
+    @unittest.skip("GptOss's eager attn/sdpa attn outputs are expected to be different")
+    def test_eager_matches_sdpa_generate(self):
+        pass
+
+    @unittest.skip("GptOss eager/FA2 attention outputs are expected to be different")
+    def test_flash_attn_2_equivalence(self):
+        pass
+
+    @unittest.skip("Most probably because of the MOE, the moe and router does not ignore padding tokens")
+    def test_eager_padding_matches_padding_free_with_position_ids(self):
+        pass
+
+    @unittest.skip("GptOss does not support flex officially")
+    def test_flex_attention_with_grads(self):
+        pass
+
+    # NOTE(review): the skip reason talks about flex attention but the condition is "running on CPU" - confirm
+    @unittest.skipIf(torch_device == "cpu", "GptOss does not support flex officially")
+    def test_generate_compile_model_forward_fullgraph(self):
+        return super().test_generate_compile_model_forward_fullgraph()
+
+    def test_reverse_loading_mapping(self, check_keys_were_modified=False):
+        # The parent check always runs with `check_keys_were_modified=False`, whatever the caller passes
+        super().test_reverse_loading_mapping(check_keys_were_modified=False)
+
+
+RESULTS_PATH = Path(__file__).parent.parent.parent / "fixtures/gpt_oss/integration_tests.json"  # expected outputs
+
+
+# ------------------------
+# Worker function for distributed torchrun
+# ------------------------
+def distributed_worker(quantized, model_size, kernels, attn_impl, mode):
+    """Generate with a tensor-parallel GptOss checkpoint and compare rank 0's output with the fixtures.
+
+    The source of this function is pasted into a standalone script run by torchrun, so every module it
+    uses is imported here; only the `RESULTS_PATH` global has to be provided by that script.
+    `quantized` and `kernels` are "true"/"false" strings; `quantized` and `mode` only select the fixture key.
+    Raises `AssertionError` when the generations differ from the stored expected outputs.
+    """
+    import difflib
+    import json
+    import os
+
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from transformers.testing_utils import torch_device
+
+    input_text = [
+        "Roses are red, violets",
+        "How are you? Tell me the name of the president of",
+    ]
+
+    quantized = quantized.lower() == "true"
+    kernels = kernels.lower() == "true"
+
+    model_id = f"openai/gpt-oss-{model_size}"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        dtype="auto",
+        tp_plan="auto",  # distributed inference
+        use_kernels=kernels,
+    ).to(torch_device)
+    model.set_attn_implementation(attn_impl)
+    tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
+
+    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(torch_device)
+    output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+    output_texts = tokenizer.batch_decode(output, skip_special_tokens=False)
+
+    # Only rank 0 validates against expected outputs
+    if int(os.environ.get("RANK", "0")) != 0:
+        return
+
+    key = (
+        f"device={torch_device}|quantized={str(quantized).lower()}|model={model_size}"
+        f"|kernels={str(kernels).lower()}|attn_impl={attn_impl}|mode={mode}"
+    )
+    if not os.path.exists(RESULTS_PATH):
+        print(f"Warning: Results file {RESULTS_PATH} not found")
+        return
+    with open(RESULTS_PATH, "r", encoding="utf-8") as f:
+        expected_results = json.load(f)
+    if key not in expected_results:
+        print(f"Warning: No expected results found for configuration: {key}")
+        return
+
+    expected_outputs = expected_results[key]
+    if len(output_texts) != len(expected_outputs):
+        raise AssertionError(f"Output length mismatch for {key}")
+    for i, (actual, expected) in enumerate(zip(output_texts, expected_outputs)):
+        actual_stripped = actual.strip()
+        expected_stripped = expected.strip()
+        # Compare only the common prefix to be resilient to generation length differences
+        min_length = min(len(actual_stripped), len(expected_stripped))
+        actual_truncated = actual_stripped[:min_length]
+        expected_truncated = expected_stripped[:min_length]
+        if actual_truncated == expected_truncated:
+            continue
+
+        diff = "\n".join(
+            difflib.unified_diff(
+                expected_truncated.splitlines(),
+                actual_truncated.splitlines(),
+                fromfile=f"expected[{i}]",
+                tofile=f"actual[{i}]",
+                lineterm="",
+            )
+        )
+        raise AssertionError(
+            f"Output mismatch at index {i} for {key}:\n"
+            f"Expected: '{expected_stripped}'\n"
+            f"Actual:   '{actual_stripped}'\n"
+            f"Diff (truncated to min length {min_length}):\n{diff}"
+        )
+
+    print(f"✓ Outputs match expected results for {key}")
+
+
+@slow
+class GptOssIntegrationTest(unittest.TestCase):
+    input_text = [
+        "Roses are red, violets",
+        "How are you? Tell me the name of the president of",
+    ]
+
+    @staticmethod
+    def generate_config_key(quantized, model, kernels, attn_impl, mode):
+        """Return the expected-results lookup key: `|`-joined `name=value` pairs, device first, booleans lower-cased."""
+        return f"device={torch_device}|quantized={str(quantized).lower()}|model={model}|kernels={str(kernels).lower()}|attn_impl={attn_impl}|mode={mode}"
+
+    def setUp(self):  # run the shared `cleanup` helper (with gc) before every test
+        cleanup(torch_device, gc_collect=True)
+
+    def tearDown(self):  # same cleanup after every test; presumably releases device memory - TODO confirm
+        cleanup(torch_device, gc_collect=True)
+
+    # ------------------------
+    # Non-distributed inference
+    # ------------------------
+    def load_and_forward(self, model_id, attn_implementation, input_text, mode="eval", **pretrained_kwargs):
+        """Load `model_id` in bfloat16, set train/eval mode and generate up to 20 new tokens per prompt.
+
+        `mode` is "train" for training mode, anything else means eval. `pretrained_kwargs` are forwarded to
+        `from_pretrained`. Returns the batch-decoded generations with special tokens removed.
+        """
+        if torch_device == "cpu":
+            if attn_implementation == "kernels-community/vllm-flash-attn3":
+                self.skipTest("vllm-flash-attn3 is not supported on CPU.")
+            # Callers pass this flag as `use_kernels`; looking up "kernels" could never match.
+            if pretrained_kwargs.get("use_kernels", False) and mode == "train":
+                self.skipTest("CPU kernels only support inference.")
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            dtype=torch.bfloat16,
+            device_map="auto",
+            attn_implementation=attn_implementation,
+            **pretrained_kwargs,
+        )
+        # `train(False)` is what `eval()` does, so a single call covers both modes
+        model.train(mode == "train")
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
+        inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(model.device)
+        output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        return tokenizer.batch_decode(output, skip_special_tokens=True)
+
+    # ------------------------
+    # Distributed inference using inspect
+    # ------------------------
+    @staticmethod
+    def run_distributed_test(quantized, model, kernels, attn_impl, mode):
+        """Run `distributed_worker` under torchrun, one process per detected accelerator (`NUM_GPUS`).
+
+        The worker source is copied with `inspect.getsource` into a temporary script that also defines
+        `RESULTS_PATH` and calls the worker with every argument converted to a string.
+
+        Raises:
+            subprocess.CalledProcessError: if torchrun exits with a non-zero status.
+        """
+        worker_src = inspect.getsource(distributed_worker)
+        # repr() yields valid string literals even for values containing backslashes or quotes
+        call_args = ", ".join(repr(str(arg)) for arg in (quantized, model, kernels, attn_impl, mode))
+
+        script_code = f"""
+import sys
+import json
+
+RESULTS_PATH = {str(RESULTS_PATH)!r}
+
+{worker_src}
+
+if __name__ == "__main__":
+    distributed_worker({call_args})
+"""
+
+        # The worker source contains non-ASCII characters, so do not rely on the locale encoding.
+        # delete=False: the script has to outlive this `with` block so that torchrun can read it.
+        with tempfile.NamedTemporaryFile("w", suffix="_worker.py", delete=False, encoding="utf-8") as tmp:
+            tmp.write(script_code)
+            tmp_path = tmp.name
+
+        cmd = ["torchrun", f"--nproc_per_node={NUM_GPUS}", tmp_path]
+        try:
+            subprocess.run(cmd, check=True)
+        finally:
+            # Remove the script even when torchrun fails; previously a failing run leaked the temp file
+            os.remove(tmp_path)
+
+    # ------------------------
+    # Shared parameterization: (quantized, model size, use_kernels, attn implementation, mode)
+    # ------------------------
+    PARAMETERS = [
+        (False, "20b", False, "eager", "eval"),
+        (False, "20b", False, "eager", "train"),
+        (False, "20b", False, "kernels-community/vllm-flash-attn3", "eval"),
+        (False, "20b", False, "kernels-community/vllm-flash-attn3", "train"),
+        (False, "20b", True, "eager", "eval"),
+        (False, "20b", True, "eager", "train"),
+        (False, "20b", True, "kernels-community/vllm-flash-attn3", "eval"),
+        (False, "20b", True, "kernels-community/vllm-flash-attn3", "train"),
+        (True, "20b", False, "eager", "eval"),
+        (True, "20b", False, "eager", "train"),
+        (True, "20b", False, "kernels-community/vllm-flash-attn3", "eval"),
+        (True, "20b", False, "kernels-community/vllm-flash-attn3", "train"),
+        (True, "20b", True, "eager", "eval"),
+        (True, "20b", True, "eager", "train"),
+        (True, "20b", True, "kernels-community/vllm-flash-attn3", "eval"),
+        (True, "20b", True, "kernels-community/vllm-flash-attn3", "train"),
+        (False, "120b", False, "eager", "eval"),
+        (False, "120b", False, "eager", "train"),
+        (False, "120b", False, "kernels-community/vllm-flash-attn3", "eval"),
+        (False, "120b", False, "kernels-community/vllm-flash-attn3", "train"),
+        (False, "120b", True, "eager", "eval"),
+        (False, "120b", True, "eager", "train"),
+        (False, "120b", True, "kernels-community/vllm-flash-attn3", "eval"),
+        (False, "120b", True, "kernels-community/vllm-flash-attn3", "train"),
+        (True, "120b", False, "eager", "eval"),
+        (True, "120b", False, "eager", "train"),
+        (True, "120b", False, "kernels-community/vllm-flash-attn3", "eval"),
+        (True, "120b", False, "kernels-community/vllm-flash-attn3", "train"),
+        (True, "120b", True, "eager", "eval"),
+        (True, "120b", True, "eager", "train"),
+        (True, "120b", True, "kernels-community/vllm-flash-attn3", "eval"),
+        (True, "120b", True, "kernels-community/vllm-flash-attn3", "train"),
+    ]
+
+    # ------------------------
+    # Non-distributed test
+    # ------------------------
+    @parameterized.expand(PARAMETERS)
+    @require_deterministic_for_xpu
+    def test_model_outputs(self, quantized, model, kernels, attn_impl, mode):
+        """Generate on a single process and compare the texts with the stored expected outputs.
+
+        Unsupported device/configuration pairs are skipped. The comparison only happens when the results
+        file exists and has an entry for this configuration; otherwise only the final type checks apply.
+        """
+        # Device-specific skips
+        uses_fa3 = attn_impl == "kernels-community/vllm-flash-attn3"
+        if torch_device == "cpu" and uses_fa3:
+            self.skipTest("vllm-flash-attn3 is not supported on CPU.")
+        if torch_device == "cpu" and kernels and mode == "train":
+            self.skipTest("CPU kernels only support inference.")
+        if torch_device == "xpu" and uses_fa3:
+            self.skipTest("flash attention 3 is not supported on XPU yet.")
+
+        model_id = f"openai/gpt-oss-{model}"
+        output_texts = self.load_and_forward(
+            model_id,
+            attn_impl,
+            self.input_text,
+            mode=mode,
+            use_kernels=kernels,
+        )
+
+        # Key under which the expected generations of this configuration are stored
+        key = self.generate_config_key(quantized, model, kernels, attn_impl, mode)
+
+        if os.path.exists(RESULTS_PATH):
+            with open(RESULTS_PATH, "r") as f:
+                expected_results = json.load(f)
+
+            if key not in expected_results:
+                # New configuration without stored expectations: warn instead of failing
+                print(f"Warning: No expected results found for configuration: {key}")
+            else:
+                expected_outputs = expected_results[key]
+                self.assertEqual(len(output_texts), len(expected_outputs), f"Output length mismatch for {key}")
+
+                for idx, (got, want) in enumerate(zip(output_texts, expected_outputs)):
+                    got = got.strip()
+                    want = want.strip()
+                    # Only the common prefix is compared, to be resilient to generation differences
+                    common = min(len(got), len(want))
+                    got_prefix = got[:common]
+                    want_prefix = want[:common]
+                    if got_prefix == want_prefix:
+                        continue
+
+                    diff_lines = difflib.unified_diff(
+                        want_prefix.splitlines(keepends=True),
+                        got_prefix.splitlines(keepends=True),
+                        fromfile=f"expected[{idx}]",
+                        tofile=f"actual[{idx}]",
+                        lineterm="",
+                    )
+                    diff = "\n".join(diff_lines)
+                    self.fail(
+                        f"Output mismatch at index {idx} for {key}:\n"
+                        f"Expected: '{want}'\n"
+                        f"Actual:   '{got}'\n"
+                        f"Diff (truncated to min length {common}):\n{diff}"
+                    )
+
+        # Sanity checks that apply with or without stored expectations
+        self.assertIsInstance(output_texts, list)
+        self.assertTrue(all(isinstance(text, str) for text in output_texts))
+
+    # ------------------------
+    # Distributed test
+    # ------------------------
+    @parameterized.expand(PARAMETERS)
+    def test_model_outputs_distributed(self, quantized, model, kernels, attn_impl, mode):
+        """Run the configuration through `run_distributed_test`, unless the device cannot handle it."""
+        if torch_device == "cpu":
+            self.skipTest("Skip TP on CPU until verified.")
+        if torch_device == "xpu" and attn_impl == "kernels-community/vllm-flash-attn3":
+            self.skipTest("flash attention 3 is not supported on XPU yet.")
+        # The arguments are handed over unchanged; the launcher turns them into strings for the worker
+        self.run_distributed_test(quantized, model, kernels, attn_impl, mode)
+
+    # ------------------------
+    # Training test
+    # ------------------------
+    @parameterized.expand(PARAMETERS)
+    def test_training_step(self, quantized, model, kernels, attn_impl, mode):
+        """Run one forward/backward pass and require a non-zero gradient on every trainable parameter.
+
+        Only non-quantized "train" configurations run; every other parameter tuple is skipped.
+        """
+        on_cpu = torch_device == "cpu"
+        if on_cpu and attn_impl == "kernels-community/vllm-flash-attn3":
+            self.skipTest("vllm-flash-attn3 is not supported on CPU.")
+        if on_cpu and kernels and mode == "train":
+            self.skipTest("CPU kernels only support inference.")
+        if mode != "train":
+            self.skipTest("This test is only for training mode.")
+        if quantized:
+            self.skipTest("Training test for quantized models is not supported.")
+
+        model_id = f"openai/gpt-oss-{model}"
+        load_kwargs = {
+            "dtype": torch.bfloat16,
+            "device_map": "auto",
+            "attn_implementation": attn_impl,
+            "use_kernels": kernels,
+        }
+        model_obj = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
+        model_obj.train()
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
+        # Batching the prompts requires padding; fall back to EOS when no pad token is defined
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        # The labels are a copy of the input ids
+        batch = tokenizer(self.input_text, return_tensors="pt", padding=True).to(model_obj.device)
+        batch["labels"] = batch["input_ids"].clone()
+
+        loss = model_obj(**batch).loss
+        self.assertIsNotNone(loss)
+        loss.backward()
+
+        for name, param in model_obj.named_parameters():
+            if not param.requires_grad:
+                continue
+            self.assertIsNotNone(param.grad, f"Parameter '{name}' did not receive a gradient.")
+            # A gradient that exists but is entirely zero is also a failure
+            grad_magnitude = torch.sum(torch.abs(param.grad)).item()
+            self.assertTrue(grad_magnitude > 0, f"Gradient for parameter '{name}' is all zeros.")
+
+    def test_model_matches_original_20b(self):
+        """Decode 12 argmax tokens with eager attention and compare each step with the reference logprobs."""
+        input_text = "Roses are red, violets"
+
+        original_output = "Roses are red, violets are blue, I love you, and I love you too."
+        original_logprobs = torch.tensor(
+            [
+                -0.037353515625,
+                -0.08154296875,
+                -1.21875,
+                -1.953125,
+                -2.234375,
+                -0.96875,
+                -1.546875,
+                -1.640625,
+                -0.93359375,
+                -1.609375,
+                -1.625,
+                -0.85546875,
+                -1.7265625,
+                -0.7421875,
+                -2.078125,
+                -0.006561279296875,
+                -0.10498046875,
+                -0.1767578125,
+                -0.1240234375,
+                -0.099609375,
+            ]
+        )
+
+        model_id = "openai/gpt-oss-20b"
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            dtype=torch.bfloat16,
+            device_map="auto",
+            attn_implementation="eager",
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokens = tokenizer(input_text)["input_ids"]
+
+        # Only the first 12 of the 20 reference logprobs are checked
+        with torch.no_grad():
+            for i in range(12):
+                tensors = torch.as_tensor(tokens, dtype=torch.int32, device=model.device).unsqueeze(0)
+                logits = model(tensors).logits[0]
+
+                predicted_token = torch.argmax(logits[-1, :], dim=-1).item()
+                logprobs = torch.log_softmax(logits[-1, :], dim=-1)
+                selected_logprobs = logprobs[predicted_token]
+
+                tokens.append(predicted_token)
+                decoded_token = tokenizer.decode([predicted_token])
+                logprob_differences = selected_logprobs - original_logprobs[i]
+
+                print(
+                    f"Generated token: {repr(decoded_token)}, logprob: {selected_logprobs}, logprob differences: {logprob_differences}"
+                )
+                torch.testing.assert_close(
+                    selected_logprobs.cpu().to(original_logprobs.dtype), original_logprobs[i], atol=1e-1, rtol=1e-1
+                )
+
+        decoded_string = tokenizer.decode(tokens)
+        self.assertTrue(original_output.startswith(decoded_string), f"Unexpected continuation: {decoded_string!r}")
+
+    def test_model_matches_original_120b(self):
+        """Decode 12 argmax tokens with eager attention and compare each step with the reference logprobs."""
+        input_text = "Roses are red, violets"
+
+        original_output = """Roses are red, violets are blue,
+I am a language model, not a human being"""
+        original_logprobs = torch.tensor(
+            [
+                -0.90234375,
+                -0.66015625,
+                -1.546875,
+                -2.703125,
+                -2.078125,
+                -1.21875,
+                -2.484375,
+                -0.031982421875,
+                -0.84765625,
+                -1.890625,
+                -0.1923828125,
+                -2.046875,
+                -1.65625,
+                -1.3515625,
+                -1.1640625,
+                -0.3671875,
+                -1.9921875,
+                -1.5390625,
+                -1.46875,
+                -0.85546875,
+            ]
+        )
+
+        model_id = "openai/gpt-oss-120b"
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            dtype=torch.bfloat16,
+            device_map="auto",
+            attn_implementation="eager",
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokens = tokenizer(input_text)["input_ids"]
+
+        # Only the first 12 of the 20 reference logprobs are checked
+        with torch.no_grad():
+            for i in range(12):
+                tensors = torch.as_tensor(tokens, dtype=torch.int32, device=model.device).unsqueeze(0)
+                logits = model(tensors).logits[0]
+
+                predicted_token = torch.argmax(logits[-1, :], dim=-1).item()
+                logprobs = torch.log_softmax(logits[-1, :], dim=-1)
+                selected_logprobs = logprobs[predicted_token]
+
+                tokens.append(predicted_token)
+                decoded_token = tokenizer.decode([predicted_token])
+                logprob_differences = selected_logprobs - original_logprobs[i]
+
+                print(
+                    f"Generated token: {repr(decoded_token)}, logprob: {selected_logprobs}, logprob differences: {logprob_differences}"
+                )
+                torch.testing.assert_close(
+                    selected_logprobs.cpu().to(original_logprobs.dtype), original_logprobs[i], atol=1e-1, rtol=1e-1
+                )
+
+        decoded_string = tokenizer.decode(tokens)
+        self.assertTrue(original_output.startswith(decoded_string), f"Unexpected continuation: {decoded_string!r}")