first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/glm4_moe_lite/__init__.py
+++ b/tests/models/glm4_moe_lite/__init__.py
--- a/tests/models/glm4_moe_lite/test_modeling_glm4_moe_lite.py
+++ b/tests/models/glm4_moe_lite/test_modeling_glm4_moe_lite.py
@@ -0,0 +1,131 @@
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch GLM-4-MoE-Lite model."""
+
+import unittest
+
+import pytest
+import torch
+
+from transformers import Cache, is_torch_available
+from transformers.testing_utils import (
+    cleanup,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+
+
+if is_torch_available():
+    from transformers import AutoTokenizer, Glm4MoeLiteForCausalLM, Glm4MoeLiteModel
+
+
+class Glm4MoeLiteModelTester(CausalLMModelTester):
+    if is_torch_available():
+        base_model_class = Glm4MoeLiteModel
+
+    def __init__(
+        self,
+        parent,
+        n_routed_experts=8,
+        kv_lora_rank=32,
+        q_lora_rank=16,
+        qk_nope_head_dim=64,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+    ):
+        super().__init__(parent=parent)
+        self.n_routed_experts = n_routed_experts
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+
+
+@require_torch
+class Glm4MoeModelTest(CausalLMModelTest, unittest.TestCase):
+    model_tester_class = Glm4MoeLiteModelTester
+    test_all_params_have_gradient = False
+    model_split_percents = [0.5, 0.7, 0.8]
+
+    @unittest.skip("MoE topk routing is too sensitive to Float8 quantization numerical noise")
+    def test_tp_generation_quantized(self):
+        pass
+
+    def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_length, config):
+        """Needs to be overridden as GLM-4.7-Flash has special MLA cache format (though we don't really use the MLA)"""
+        self.assertIsInstance(past_key_values, Cache)
+
+        # (batch, head, seq_length, head_features)
+        expected_common_shape = (
+            batch_size,
+            getattr(config, "num_key_value_heads", config.num_attention_heads),
+            seq_length,
+        )
+        expected_key_shape = expected_common_shape + (config.qk_nope_head_dim + config.qk_rope_head_dim,)
+        expected_value_shape = expected_common_shape + (config.v_head_dim,)
+
+        for layer in past_key_values.layers:
+            self.assertEqual(layer.keys.shape, expected_key_shape)
+            self.assertEqual(layer.values.shape, expected_value_shape)
+
+
+@require_torch_accelerator
+@slow
+class Glm4MoeIntegrationTest(unittest.TestCase):
+    def tearDown(self):
+        # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed.
+        cleanup(torch_device, gc_collect=False)
+
+    @slow
+    @require_torch_accelerator
+    @pytest.mark.torch_compile_test
+    def test_compile_static_cache(self):
+        NUM_TOKENS_TO_GENERATE = 40
+        EXPECTED_TEXT_COMPLETION = [
+            'hello, world!\'\'\')\nprint(\'hello, world!\')\nprint("hello, world!")\nprint("hello, world!")\nprint("hello, world!")\nprint("hello, world!")\nprint("hello, world!")\n',
+            "tell me the story of the first Thanksgiving. commonly known as the Pilgrims, arrived in the autumn of 1620. They were seeking religious freedom and a new life in the Plymouth Colony. Their first",
+        ]
+
+        prompts = ["[gMASK]<sop>hello", "[gMASK]<sop>tell me"]
+        tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.7-Flash")
+        model = Glm4MoeLiteForCausalLM.from_pretrained(
+            "zai-org/GLM-4.7-Flash", device_map=torch_device, dtype=torch.bfloat16
+        )
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
+
+        # Dynamic Cache
+        generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False)
+        dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, dynamic_text)
+
+        # Static Cache
+        generated_ids = model.generate(
+            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
+        )
+        static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text)
+
+        # Static Cache + compile
+        model._cache = None  # clear cache object, initialized when we pass `cache_implementation="static"`
+        model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+        generated_ids = model.generate(
+            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static"
+        )
+        static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text)