first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/blt/init.py
+++ b/tests/models/blt/init.py
--- a/tests/models/blt/test_modeling_blt.py
+++ b/tests/models/blt/test_modeling_blt.py
@@ -0,0 +1,423 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Blt model."""
+
+import unittest
+
+import pytest
+from parameterized import parameterized
+
+from transformers import AutoTokenizer, is_torch_available
+from transformers.testing_utils import (
+    Expectations,
+    cleanup,
+    require_torch,
+    require_torch_accelerator,
+    require_torch_bf16,
+    slow,
+    torch_device,
+)
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+from ...test_modeling_common import (
+    TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
+    _test_eager_matches_sdpa_inference,
+)
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import BltConfig, BltForCausalLM, BltModel
+
+
+class BltModelTester(CausalLMModelTester):
+    if is_torch_available():
+        base_model_class = BltModel
+
+    def __init__(
+        self,
+        parent,
+        ignore_index=-100,
+        seq_length=7,
+        is_training=True,
+    ):
+        super().__init__(parent)
+        self.parent = parent
+        self.ignore_index = ignore_index
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.batch_size = 3
+
+        # Common parameters for all configs
+        self.hidden_size = 16
+        self.num_hidden_layers = 1
+        self.num_attention_heads = 2
+        self.num_key_value_heads = 2
+        self.intermediate_size = 32
+        self.hidden_act = "silu"
+        self.max_position_embeddings = 32
+        self.vocab_size = 32
+        self.rope_theta = 500000.0
+        self.rope_parameters = {"rope_type": "default"}
+        self.rms_norm_eps = 1e-5
+        self.dropout = 0.0
+        self.encoder_hash_byte_group_size = [2, 3]
+        self.encoder_hash_byte_group_vocab = 64
+        self.encoder_hash_byte_group_nb_functions = 1
+        # Common parameters for all configs
+        self.patcher_config = {
+            "hidden_size": self.hidden_size,
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "num_key_value_heads": self.num_key_value_heads,
+            "intermediate_size": self.intermediate_size,
+            "max_position_embeddings": self.max_position_embeddings,
+            "rope_theta": self.rope_theta,
+            "rope_parameters": self.rope_parameters,
+            "hidden_act": self.hidden_act,
+            "rms_norm_eps": self.rms_norm_eps,
+            "dropout": self.dropout,
+        }
+
+        self.encoder_config = {
+            "hidden_size": self.hidden_size,
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "num_key_value_heads": self.num_key_value_heads,
+            "intermediate_size": self.intermediate_size,
+            "max_position_embeddings": self.max_position_embeddings,
+            "rope_theta": self.rope_theta,
+            "rope_parameters": self.rope_parameters,
+            "hidden_act": self.hidden_act,
+            "rms_norm_eps": self.rms_norm_eps,
+            "dropout": self.dropout,
+        }
+
+        self.decoder_config = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "hidden_size_global": self.hidden_size * 2,  # Must match global transformer output size
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "num_key_value_heads": self.num_key_value_heads,
+            "intermediate_size": self.intermediate_size,
+            "max_position_embeddings": self.max_position_embeddings,
+            "rope_theta": self.rope_theta,
+            "rope_parameters": self.rope_parameters,
+            "hidden_act": self.hidden_act,
+            "rms_norm_eps": self.rms_norm_eps,
+            "dropout": self.dropout,
+        }
+
+        self.global_config = {
+            "hidden_size": self.hidden_size * 2,  # Double the hidden size for global transformer
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "num_key_value_heads": self.num_key_value_heads,
+            "intermediate_size": self.intermediate_size,
+            "max_position_embeddings": self.max_position_embeddings,
+            "rope_theta": self.rope_theta,
+            "rope_parameters": self.rope_parameters,
+            "hidden_act": self.hidden_act,
+            "rms_norm_eps": self.rms_norm_eps,
+            "dropout": self.dropout,
+        }
+
+        self.num_hidden_layers = self.encoder_config["num_hidden_layers"]
+
+    def get_config(self):
+        config = BltConfig(
+            vocab_size=self.vocab_size,
+            max_position_embeddings=self.max_position_embeddings,
+            patch_in_forward=False,  # Disable patching for tests
+            patch_size=4,
+            patching_mode="entropy",
+            patching_threshold=1.335442066192627,
+            patching_batch_size=1,
+            max_patch_length=None,
+            cross_attn_k=2,
+            encoder_hash_byte_group_size=self.encoder_hash_byte_group_size,
+            encoder_hash_byte_group_vocab=self.encoder_hash_byte_group_vocab,
+            encoder_hash_byte_group_nb_functions=self.encoder_hash_byte_group_nb_functions,
+            patcher_config=self.patcher_config,
+            encoder_config=self.encoder_config,
+            decoder_config=self.decoder_config,
+            global_config=self.global_config,
+            rope_parameters=self.rope_parameters,
+            tie_word_embeddings=False,
+        )
+
+        config.num_attention_heads = config.decoder_config.num_attention_heads
+        config.num_hidden_layers = config.encoder_config.num_hidden_layers
+        config.hidden_size = config.decoder_config.hidden_size
+
+        return config
+
+
+@require_torch
+class BltModelTest(CausalLMModelTest, unittest.TestCase):
+    model_tester_class = BltModelTester
+
+    # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
+    # This is because we are hitting edge cases with the causal_mask buffer
+    model_split_percents = [0.5, 0.7, 0.8]
+
+    # used in `test_torch_compile_for_training`
+    _torch_compile_train_cls = BltForCausalLM if is_torch_available() else None
+
+    @pytest.mark.generate
+    @parameterized.expand([("greedy", 1), ("beam search", 2)])
+    @unittest.skip(
+        "Blt requires real token IDs for its hash-based embedding computation, making inputs_embeds generation incompatible with identical outputs"
+    )
+    def test_generate_from_inputs_embeds(self, _, num_beams):
+        pass
+
+    @pytest.mark.generate
+    def test_generate_with_quant_cache(self):
+        self.skipTest("BLT uses EncoderDecoderCache internally and does not support quantized cache")
+
+    @pytest.mark.generate
+    @unittest.skip(
+        "Blt requires real token IDs for its hash-based embedding computation, making inputs_embeds generation incompatible with identical outputs"
+    )
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
+    def test_eager_matches_sdpa_inference(
+        self,
+        name,
+        torch_dtype,
+        padding_side,
+        use_attention_mask,
+        output_attentions,
+        enable_kernels,
+    ):
+        "We need to relax a bit the `atols` for fp32 here due to the altup projections"
+        atols = {
+            ("cpu", False, torch.float32): 2e-2,  # this was relaxed
+            ("cpu", False, torch.float16): 5e-3,
+            ("cpu", False, torch.bfloat16): 1e-2,
+            ("cpu", True, torch.float32): 2e-2,  # this was relaxed
+            ("cpu", True, torch.float16): 5e-3,
+            ("cpu", True, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float32): 2e-2,  # this was relaxed
+            ("cuda", False, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float16): 5e-3,
+            ("cuda", True, torch.float32): 2e-2,  # this was relaxed
+            ("cuda", True, torch.bfloat16): 1e-2,
+            ("cuda", True, torch.float16): 5e-3,
+        }
+        _test_eager_matches_sdpa_inference(
+            self, name, torch_dtype, padding_side, use_attention_mask, output_attentions, enable_kernels, atols=atols
+        )
+
+    @require_torch_accelerator
+    @slow
+    def test_sdpa_can_dispatch_on_flash(self):
+        self.skipTest("BLT always has an attention_mask input")
+
+
+@require_torch_accelerator
+class BltIntegrationTest(unittest.TestCase):
+    def setup(self):
+        cleanup(torch_device, gc_collect=True)
+
+    def tearDown(self):
+        # TODO (joao): automatic compilation, i.e. compilation when `cache_implementation="static"` is used, leaves
+        # some memory allocated in the cache, which means some object is not being released properly. This causes some
+        # unoptimal memory usage, e.g. after certain tests a 7B model in FP16 no longer fits in a 24GB GPU.
+        # Investigate the root cause.
+        cleanup(torch_device, gc_collect=True)
+
+    @slow
+    def test_model(self):
+        NUM_TOKENS_TO_GENERATE = 200
+        EXPECTED_TEXT = "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s"
+
+        prompt = "my name is"
+
+        model = BltForCausalLM.from_pretrained("itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa")
+
+        tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
+
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        generated_ids = model.generate(
+            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
+        )
+
+        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        self.assertEqual(output_text, EXPECTED_TEXT)
+
+    @slow
+    def test_model_logits(self):
+        # fmt: off
+        EXPECTED_OUTPUT = Expectations(
+            {
+                (None, None): torch.tensor(
+                    [
+                        [-10.5000, -10.6875, -6.2500, -10.5625, -10.3125, -9.1875, -8.5000, -8.5625, -9.1875, -9.6250, -9.3750, -8.5000, -9.1250, -3.3906, 2.9688, -10.3125, -6.4688, -6.0312, -9.7500, -9.1875, -8.8125, -9.8750, -9.8125, -9.5000, -9.8125, -9.5000, -9.0625, -9.8125, -9.5000, -9.3750],
+                        [-13.2500, -13.1250, -5.6875, -13.1875, -13.3750, -8.6875, -6.9688, -6.9375, -10.0625, -10.3125, -9.8125, -7.7188, -8.8125, -5.2188, -3.5000, -12.4375, -9.0625, -6.6250, -10.3125, -9.1875, -10.6250, -11.4375, -11.1250, -10.8750, -10.5000, -10.8750, -11.0000, -11.3125, -10.5000, -9.8750],
+                    ]
+                ),
+                ("xpu", None): torch.tensor(
+                    [
+                        [-10.4375, -10.6875, -6.1875, -10.5000, -10.3125, -9.1250, -8.4375, -8.6250, -9.1875, -9.5625, -9.3125, -8.4375, -9.0625, -3.4375, 2.9531, -10.2500, -6.4062, -6.0000, -9.6875, -9.1875, -8.8125, -9.8125, -9.7500, -9.4375, -9.7500, -9.4375, -9.0000, -9.8125, -9.4375, -9.3125],
+                        [-13.3125, -13.2500, -5.5938, -13.3125, -13.5000, -8.7500, -7.0625, -7.0312, -10.1875, -10.3750, -9.9375, -7.8438, -8.8750, -5.3438, -3.5938, -12.5625, -9.2500, -6.8125, -10.3750, -9.3125, -10.6875, -11.5625, -11.3125, -11.0000, -10.6250, -10.9375, -11.0625, -11.3750, -10.5625, -10.0000],
+                    ]
+                ),
+            }
+        ).get_expectation()
+        EXPECTED_OUTPUT = EXPECTED_OUTPUT.to(torch_device)
+        # fmt: on
+
+        input_ids = [1, 42, 21, 12, 43, 23, 1, 4]
+
+        model = BltForCausalLM.from_pretrained("itazap/blt-1b-hf", attn_implementation="sdpa", device_map="auto")
+
+        with torch.no_grad():
+            output = model(torch.tensor([input_ids]).to(torch_device))[0]
+
+        torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30].to(torch_device), rtol=1e-3, atol=1e-3)
+
+    @slow
+    @require_torch_bf16
+    def test_model_bf16(self):
+        """Test Blt model with bfloat16 precision."""
+        NUM_TOKENS_TO_GENERATE = 200
+        # fmt: off
+        EXPECTED_TEXT = Expectations(
+            {
+                (None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
+                ("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
+            }
+        )
+        # fmt: on
+
+        prompt = "my name is"
+
+        model = BltForCausalLM.from_pretrained(
+            "itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.bfloat16
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
+
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        generated_ids = model.generate(
+            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
+        )
+
+        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())
+
+    @slow
+    @require_torch_bf16
+    def test_model_logits_bf16(self):
+        """Test Blt model logits with bfloat16 precision."""
+
+        # fmt: off
+        EXPECTED_OUTPUT = Expectations(
+            {
+                (None, None): torch.tensor(
+                    [
+                        [-10.5000, -10.6875, -6.2500, -10.5625, -10.3125, -9.1875, -8.5000, -8.5625, -9.1875, -9.6250, -9.3750, -8.5000, -9.1250, -3.3906, 2.9688, -10.3125, -6.4688, -6.0312, -9.7500, -9.1875, -8.8125, -9.8750, -9.8125, -9.5000, -9.8125, -9.5000, -9.0625, -9.8125, -9.5000, -9.3750],
+                        [-13.2500, -13.1250, -5.6875, -13.1875, -13.3750, -8.6875, -6.9688, -6.9375, -10.0625, -10.3125, -9.8125, -7.7188, -8.8125, -5.2188, -3.5000, -12.4375, -9.0625, -6.6250, -10.3125, -9.1875, -10.6250, -11.4375, -11.1250, -10.8750, -10.5000, -10.8750, -11.0000, -11.3125, -10.5000, -9.8750],
+                    ]
+                ),
+                ("xpu", None): torch.tensor(
+                    [
+                        [-10.4375, -10.6875, -6.1875, -10.5000, -10.3125, -9.1250, -8.4375, -8.6250, -9.1875, -9.5625, -9.3125, -8.4375, -9.0625, -3.4375, 2.9531, -10.2500, -6.4062, -6.0000, -9.6875, -9.1875, -8.8125, -9.8125, -9.7500, -9.4375, -9.7500, -9.4375, -9.0000, -9.8125, -9.4375, -9.3125],
+                        [-13.3125, -13.2500, -5.5938, -13.3125, -13.5000, -8.7500, -7.0625, -7.0312, -10.1875, -10.3750, -9.9375, -7.8438, -8.8750, -5.3438, -3.5938, -12.5625, -9.2500, -6.8125, -10.3750, -9.3125, -10.6875, -11.5625, -11.3125, -11.0000, -10.6250, -10.9375, -11.0625, -11.3750, -10.5625, -10.0000],
+                    ]
+                ),
+            }
+        ).get_expectation()
+        EXPECTED_OUTPUT = EXPECTED_OUTPUT.to(torch_device)
+        # fmt: on
+
+        input_ids = [1, 42, 21, 12, 43, 23, 1, 4]
+
+        model = BltForCausalLM.from_pretrained(
+            "itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.bfloat16
+        )
+
+        with torch.no_grad():
+            output = model(torch.tensor([input_ids]).to(torch_device))[0]
+
+        torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30].to(torch_device), rtol=1e-3, atol=1e-3)
+
+    @slow
+    def test_model_eager(self):
+        """Test Blt model with bfloat16 precision using eager attention implementation."""
+        NUM_TOKENS_TO_GENERATE = 200
+        # fmt: off
+        EXPECTED_TEXT = Expectations(
+            {
+                (None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
+                ("xpu", None): "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m",
+            }
+        )
+        # fmt: on
+
+        prompt = "my name is"
+
+        model = BltForCausalLM.from_pretrained("itazap/blt-1b-hf", device_map="auto", attn_implementation="eager")
+
+        tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
+
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        generated_ids = model.generate(
+            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
+        )
+
+        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())
+
+    @slow
+    @require_torch_bf16
+    def test_model_bf16_static_cache(self):
+        """Test Blt model with bfloat16 precision and static cache."""
+        NUM_TOKENS_TO_GENERATE = 200
+        # fmt: off
+        EXPECTED_TEXT = Expectations(
+            {
+                (None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
+                ("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
+            }
+        )
+        # fmt: on
+
+        prompt = "my name is"
+
+        model = BltForCausalLM.from_pretrained(
+            "itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.bfloat16
+        )
+
+        model.generation_config.cache_implementation = "static"
+
+        tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
+
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        generated_ids = model.generate(
+            **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
+        )
+
+        output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())