first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/glm_image/init.py
+++ b/tests/models/glm_image/init.py
--- a/tests/models/glm_image/test_modeling_glm_image.py
+++ b/tests/models/glm_image/test_modeling_glm_image.py
@@ -0,0 +1,637 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch GLM-Image model."""
+
+import unittest
+
+import pytest
+from parameterized import parameterized
+
+from transformers import (
+    GlmImageConfig,
+    GlmImageForConditionalGeneration,
+    GlmImageModel,
+    GlmImageProcessor,
+    is_torch_available,
+    set_seed,
+)
+from transformers.models.auto import get_values
+from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES
+from transformers.testing_utils import (
+    Expectations,
+    cleanup,
+    require_deterministic_for_xpu,
+    require_flash_attn,
+    require_torch,
+    require_torch_accelerator,
+    run_first,
+    slow,
+    torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
+    ModelTesterMixin,
+    floats_tensor,
+    ids_tensor,
+)
+
+
+if is_torch_available():
+    import torch
+
+
+class GlmImageVisionText2TextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        seq_length=7,
+        num_channels=3,
+        ignore_index=-100,
+        image_size=128,
+        image_start_token_id=50,
+        image_end_token_id=51,
+        image_token_id=52,
+        is_training=True,
+        text_config={
+            "vocab_size": 99,
+            "vision_vocab_size": 99,
+            "hidden_size": 16,
+            "intermediate_size": 22,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 2,
+            "num_key_value_heads": 1,
+            "output_channels": 64,
+            "hidden_act": "silu",
+            "max_position_embeddings": 512,
+            "rope_parameters": {"type": "default", "mrope_section": [2, 1, 1]},
+            "rope_theta": 10000,
+            "tie_word_embeddings": True,
+            "bos_token_id": 0,
+            "eos_token_id": 0,
+            "pad_token_id": 0,
+            "n_routed_experts": 8,
+            "n_shared_experts": 1,
+            "n_group": 1,
+            "topk_group": 1,
+            "num_experts_per_tok": 8,
+        },
+        vision_config={
+            "depth": 2,
+            "hidden_act": "gelu",
+            "hidden_size": 32,
+            "intermediate_size": 22,
+            "patch_size": 16,
+            "spatial_merge_size": 1,
+            "temporal_patch_size": 1,
+        },
+        vq_config={
+            "embed_dim": 48,
+            "in_channels": 3,
+            "initializer_range": 0.02,
+            "latent_channels": 32,
+            "num_embeddings": 32,
+        },
+    ):
+        self.parent = parent
+        self.ignore_index = ignore_index
+        self.bos_token_id = text_config["bos_token_id"]
+        self.eos_token_id = text_config["eos_token_id"]
+        self.pad_token_id = text_config["pad_token_id"]
+        self.image_start_token_id = image_start_token_id
+        self.image_end_token_id = image_end_token_id
+        self.image_token_id = image_token_id
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.vq_config = vq_config
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.is_training = is_training
+        self.hidden_size = text_config["hidden_size"]
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.vision_vocab_size = text_config["vision_vocab_size"]
+        self.vocab_size = text_config["vocab_size"]
+        self.num_image_tokens = 64
+        self.seq_length = seq_length + self.num_image_tokens
+        self.n_routed_experts = text_config["n_routed_experts"]
+        self.n_shared_experts = text_config["n_shared_experts"]
+        self.num_experts_per_tok = text_config["num_experts_per_tok"]
+        self.n_group = text_config["n_group"]
+        self.topk_group = text_config["topk_group"]
+
+    def get_config(self):
+        return GlmImageConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+            vq_config=self.vq_config,
+            image_token_id=self.image_token_id,
+            image_start_token_id=self.image_start_token_id,
+            image_end_token_id=self.image_end_token_id,
+        )
+
+    def prepare_config_and_inputs(self):
+        config = self.get_config()
+        patch_size = config.vision_config.patch_size
+        temporal_patch_size = config.vision_config.temporal_patch_size
+        pixel_values = floats_tensor(
+            [
+                self.batch_size * (self.image_size**2) // (patch_size**2),
+                self.num_channels * (patch_size**2) * temporal_patch_size,
+            ]
+        )
+
+        return config, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+
+        input_ids[input_ids == self.image_token_id] = self.pad_token_id
+        input_ids[input_ids == self.image_start_token_id] = self.pad_token_id
+        input_ids[input_ids == self.image_end_token_id] = self.pad_token_id
+
+        input_ids[:, 0] = self.image_start_token_id
+        input_ids[:, 1 : 1 + self.num_image_tokens] = self.image_token_id
+        input_ids[:, 1 + self.num_image_tokens] = self.image_end_token_id
+        patch_size = config.vision_config.patch_size
+        patches_per_side = self.image_size // patch_size
+
+        # For i2i mode: each sample has 1 source image + 1 target grid
+        # image_grid_thw layout: [sample0_source, sample0_target, sample1_source, sample1_target, ...]
+        # Since batches are homogeneous, all samples have same number of source images
+        num_grids_per_sample = 2  # 1 source + 1 target
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "image_grid_thw": torch.tensor(
+                [[1, patches_per_side, patches_per_side]] * (self.batch_size * num_grids_per_sample),
+                device=torch_device,
+            ),
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "images_per_sample": torch.tensor([num_grids_per_sample] * self.batch_size, device=torch_device),
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class GlmImageModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (GlmImageModel, GlmImageForConditionalGeneration) if is_torch_available() else ()
+
+    model_split_percents = [0.7, 0.9]  # model too big to split at 0.5
+    _is_composite = True
+
+    def setUp(self):
+        self.model_tester = GlmImageVisionText2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=GlmImageConfig, has_text_modality=False)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    # GlmImage has images shaped as (bs*patch_len, dim) so we can't slice to batches in generate
+    def prepare_config_and_inputs_for_generate(self, batch_size=2):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # We don't want a few model inputs in our model input dictionary for generation tests
+        input_keys_to_ignore = [
+            # we don't want to mask attention heads
+            # we don't want encoder-decoder models to start from filled decoder ids
+            "decoder_input_ids",
+            "decoder_attention_mask",
+            # we'll set cache use in each test differently
+            "use_cache",
+            # Ignore labels if it is in the input dict
+            "labels",
+            # model-specific exceptions should overload/overwrite this function
+        ]
+
+        # The diff from the general `prepare_config_and_inputs_for_generate` lies here
+        patch_size = config.vision_config.patch_size
+        num_patches_per_image = (self.model_tester.image_size**2) // (patch_size**2)
+        num_grids_per_sample = 2  # 1 source + 1 target
+
+        filtered_inputs_dict = {
+            k: v[:batch_size, ...]
+            if isinstance(v, torch.Tensor) and k not in ["pixel_values", "image_grid_thw", "images_per_sample"]
+            else v
+            for k, v in inputs_dict.items()
+            if k not in input_keys_to_ignore
+        }
+        # pixel_values: each sample has 1 source image
+        filtered_inputs_dict["pixel_values"] = inputs_dict["pixel_values"][: batch_size * num_patches_per_image]
+        # image_grid_thw: each sample has 2 grids (1 source + 1 target)
+        filtered_inputs_dict["image_grid_thw"] = inputs_dict["image_grid_thw"][: batch_size * num_grids_per_sample]
+        # images_per_sample: each sample has 2 images
+        filtered_inputs_dict["images_per_sample"] = torch.tensor(
+            [num_grids_per_sample] * batch_size, device=torch_device
+        )
+
+        # It is important set `eos_token_id` to `None` to avoid early stopping (would break for length-based checks)
+        text_gen_config = config.get_text_config(decoder=True)
+        if text_gen_config.eos_token_id is not None and text_gen_config.pad_token_id is None:
+            text_gen_config.pad_token_id = (
+                text_gen_config.eos_token_id
+                if isinstance(text_gen_config.eos_token_id, int)
+                else text_gen_config.eos_token_id[0]
+            )
+        text_gen_config.eos_token_id = None
+        text_gen_config.forced_eos_token_id = None
+
+        return config, filtered_inputs_dict
+
+    def test_training(self):
+        # Model isn't in any auto-mapping so we need to build labels manually
+        if not self.model_tester.is_training:
+            self.skipTest(reason="ModelTester is not configured to run training tests")
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            config.return_dict = True
+
+            if model_class.__name__ in [
+                *get_values(MODEL_MAPPING_NAMES),
+            ]:
+                continue
+
+            model = model_class(config)
+            model.to(torch_device)
+            model.train()
+            inputs_dict["labels"] = torch.zeros(
+                (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
+            )
+            loss = model(**inputs_dict).loss
+            loss.backward()
+
+    @unittest.skip(reason="Reequires input ids AND image grid to generate")
+    def test_generate_without_input_ids(self):
+        pass
+
+    @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
+    @unittest.skip("Needs special input preparation. Not important test for model, skip for now")
+    def test_eager_matches_sdpa_inference(
+        self, name, dtype, padding_side, use_attention_mask, output_attentions, enable_kernels
+    ):
+        pass
+
+    @unittest.skip(reason="No available kernels - not supported")
+    def test_sdpa_can_dispatch_on_flash(self):
+        pass
+
+    @unittest.skip(reason="Size mismatch")
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    @pytest.mark.xfail(
+        reason="GlmImage has a VQ module that uses `weight.data` directly in forward which prevent offloading on that module"
+    )
+    def test_disk_offload_safetensors(self):
+        pass
+
+    @pytest.mark.xfail(
+        reason="GlmImage has a VQ module that uses `weight.data` directly in forward which prevent offloading on that module"
+    )
+    def test_disk_offload_bin(self):
+        pass
+
+    @pytest.mark.xfail(
+        reason="GlmImage has a VQ module that uses `weight.data` directly in forward which prevent offloading on that module"
+    )
+    def test_cpu_offload(self):
+        pass
+
+    @pytest.mark.xfail(
+        reason="GlmImage has a VQ module that uses `weight.data` directly in forward which prevent offloading on that module"
+    )
+    def test_model_parallelism(self):
+        pass
+
+    @unittest.skip("Error with compilation")
+    def test_generate_from_inputs_embeds_with_static_cache(self):
+        pass
+
+    @parameterized.expand([("greedy", 1), ("beam search", 2)])
+    @unittest.skip(reason="GLM-Image does not use inputs_embeds")
+    def test_generate_from_inputs_embeds(self, _, num_beams):
+        pass
+
+    @unittest.skip(reason="GLM-Image input embed is compare with inputs_ids and image_ids")
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @unittest.skip(reason="GLM-Image does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="GLM-Image can't do text-only inference")
+    def test_generate_from_random_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="GLM-Image can't do and does not need assisted generation. Not worth fixing!")
+    def test_assisted_decoding_sample(self):
+        pass
+
+    @unittest.skip(reason="GLM-Image can't do and does not need assisted generation. Not worth fixing!")
+    def test_prompt_lookup_decoding_matches_greedy_search(self):
+        pass
+
+    @parameterized.expand([("random",), ("same",)])
+    @unittest.skip(reason="GLM-Image can't do and does not need assisted generation. Not worth fixing!")
+    def test_assisted_decoding_matches_greedy_search(self, assistant_type):
+        pass
+
+    @unittest.skip(reason="GlmImageVisionModel does not support training")
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(reason="GlmImageVision does not support output_hidden_states test")
+    def test_model_outputs_equivalence(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        pass
+
+    @unittest.skip(reason="GlmImageVisionModel does not support training")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @unittest.skip(reason="GlmImage needs special input preparation to pass this test")
+    def test_generate_compile_model_forward_fullgraph(self):
+        pass
+
+    @unittest.skip(
+        reason="GlmImage is a multimodal model that requires pixel_values and image_grid_thw. "
+        "This test drops all inputs except input_ids which causes NoneType iteration error."
+    )
+    def test_flash_attention_2_continue_generate_with_position_ids(self):
+        pass
+
+    @unittest.skip(
+        reason="GlmImage is a multimodal model that requires pixel_values and image_grid_thw. "
+        "This test only uses input_ids and attention_mask which causes NoneType iteration error."
+    )
+    def test_flash_attn_2_fp32_ln(self):
+        pass
+
+    @unittest.skip(
+        reason="GlmImage is a multimodal model that requires pixel_values and image_grid_thw. "
+        "This test only uses input_ids and attention_mask which causes NoneType iteration error."
+    )
+    def test_flash_attn_2_from_config(self):
+        pass
+
+    def _image_features_prepare_config_and_inputs(self):
+        """
+        Helper method to extract only image-related inputs from the full set of inputs, for testing `get_image_features`.
+
+        GlmImage internally preprocesses the image_grid_thw input by selecting source grids,
+        so we need to prepare inputs accordingly for testing get_image_features. We also discard text-related inputs.
+        """
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        # Select only source grids (every other grid starting from index 0)
+        # Grid layout: [s0_source, s0_target, s1_source, s1_target, ...]
+        num_grids_per_sample = 2  # 1 source + 1 target
+        batch_size = self.model_tester.batch_size
+        source_indices = [i * num_grids_per_sample for i in range(batch_size)]
+        inputs_dict["image_grid_thw"] = inputs_dict["image_grid_thw"][source_indices]
+        del inputs_dict["input_ids"]
+        del inputs_dict["attention_mask"]
+        return config, inputs_dict
+
+
+@require_torch
+@slow
+class GlmImageIntegrationTest(unittest.TestCase):
+    model_id = "zai-org/GLM-Image"
+    model_subfolder = "vision_language_encoder"
+    processor_subfolder = "processor"
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = None
+
+    @classmethod
+    def get_model(cls):
+        if cls.model is None:
+            cls.model = GlmImageForConditionalGeneration.from_pretrained(
+                cls.model_id, subfolder=cls.model_subfolder, torch_dtype=torch.bfloat16, device_map="auto"
+            )
+        return cls.model
+
+    @classmethod
+    def tearDownClass(cls):
+        if hasattr(cls, "model"):
+            del cls.model
+        cleanup(torch_device, gc_collect=True)
+
+    def setUp(self):
+        cleanup(torch_device, gc_collect=True)
+        self.processor = GlmImageProcessor.from_pretrained(self.model_id, subfolder=self.processor_subfolder)
+        # Text-to-image generation message
+        self.t2i_message = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "A cute cat sitting on a wooden table"},
+                ],
+            }
+        ]
+        # Image-to-image generation message
+        self.i2i_message = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
+                    },
+                    {"type": "text", "text": "Add a red hat to this cat"},
+                ],
+            }
+        ]
+
+    def tearDown(self):
+        cleanup(torch_device, gc_collect=True)
+
+    def test_processor_text_to_image(self):
+        """Test processor correctly prepares text-to-image inputs."""
+        inputs = self.processor.apply_chat_template(
+            self.t2i_message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+        )
+        # For T2I with apply_chat_template, we get basic text inputs
+        # Target grids are added during actual generation when using processor directly with target shape
+        self.assertIn("input_ids", inputs)
+        self.assertIn("attention_mask", inputs)
+
+    def test_processor_image_to_image(self):
+        """Test processor correctly prepares image-to-image inputs."""
+        from io import BytesIO
+
+        import requests
+        from PIL import Image
+
+        # Load the image
+        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        response = requests.get(url)
+        image = Image.open(BytesIO(response.content))
+
+        # Create prompt with target shape and image token
+        text = "<|dit_token_16384|><|image|><|dit_token_16385|>Add a red hat to this cat<sop>28 40<eop>"
+
+        # Process with actual images (nested list for batched processing)
+        inputs = self.processor(text=[text], images=[[image]], return_tensors="pt")
+
+        # For I2I, there should be pixel_values from the source image
+        self.assertIn("input_ids", inputs)
+        self.assertIn("attention_mask", inputs)
+        self.assertIn("pixel_values", inputs)
+        self.assertIn("image_grid_thw", inputs)
+        # I2I should have 1 source grid + 1 target grid = 2 grids
+        self.assertEqual(inputs["image_grid_thw"].shape[0], 2)
+        # images_per_sample should be 2 (1 source + 1 target)
+        self.assertEqual(inputs["images_per_sample"].item(), 2)
+
+    def test_text_to_image_generation(self):
+        """Test text-to-image generation produces valid image tokens."""
+        model = self.get_model()
+        inputs = self.processor.apply_chat_template(
+            self.t2i_message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+        ).to(torch_device)
+
+        # Generate image tokens with fixed seed for reproducibility
+        set_seed(42)
+        output = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+
+        # Output should be longer than input (generated tokens)
+        self.assertGreater(output.shape[1], inputs["input_ids"].shape[1])
+        # Generated tokens should be within vision vocabulary range
+        generated_tokens = output[0, inputs["input_ids"].shape[1] :]
+        # Vision tokens are in range [0, vision_vocab_size)
+        self.assertTrue(all(t.item() < model.config.text_config.vision_vocab_size for t in generated_tokens))
+
+        # Check actual token values (first 30 tokens) to catch implementation errors
+        expected_tokens = torch.tensor(
+            [
+                671,
+                14581,
+                1275,
+                1275,
+                4508,
+                4508,
+                4508,
+                4508,
+                1471,
+                1471,
+                1153,
+                1153,
+                11241,
+                3596,
+                11241,
+                11942,
+                9695,
+                13748,
+                4508,
+                4508,
+                4508,
+                3136,
+                3136,
+                11241,
+                11241,
+                11241,
+                11241,
+                1755,
+                3136,
+                13748,
+            ],
+            device=torch_device,
+        )
+        self.assertTrue(
+            torch.equal(generated_tokens[:30], expected_tokens),
+            f"Expected first 30 tokens:\n{expected_tokens.tolist()}\nGot:\n{generated_tokens[:30].tolist()}",
+        )
+
+    @require_deterministic_for_xpu
+    def test_image_to_image_generation(self):
+        """Test image-to-image generation produces valid image tokens."""
+        model = self.get_model()
+        inputs = self.processor.apply_chat_template(
+            self.i2i_message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+        ).to(torch_device)
+
+        # Generate image tokens with fixed seed for reproducibility
+        set_seed(42)
+        output = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+
+        # Output should be longer than input (generated tokens)
+        self.assertGreater(output.shape[1], inputs["input_ids"].shape[1])
+        # Generated tokens should be within vision vocabulary range
+        generated_tokens = output[0, inputs["input_ids"].shape[1] :]
+        self.assertTrue(all(t.item() < model.config.text_config.vision_vocab_size for t in generated_tokens))
+
+        # Check actual token values (first 30 tokens) to catch implementation errors
+        # fmt: off
+        expected_tokens = Expectations(
+            {
+                ("cuda", None): [9223, 11045, 5705, 14581, 4759, 11667, 1275, 10094, 572, 10543, 9223, 1275, 9223, 10543, 12265, 10543, 2007, 8200, 10543, 1153, 1153, 1153, 10094, 16304, 9223, 11045, 3114, 14581, 4759, 10094],
+                ("xpu", 3): [9223, 11045, 11045, 14581, 4759, 11667, 10543, 10094, 572, 10543, 9223, 1275, 9223, 9223, 4759, 10543, 2007, 4759, 10543, 1153, 1153, 1153, 8932, 9223, 10094, 11045, 5705, 14581, 4759, 10094],
+            }
+        )
+        # fmt: on
+        expected = torch.tensor(expected_tokens.get_expectation(), device=torch_device)
+        self.assertTrue(
+            torch.equal(generated_tokens[:30], expected),
+            f"Expected first 30 tokens:\n{expected.tolist()}\nGot:\n{generated_tokens[:30].tolist()}",
+        )
+
+    @run_first
+    @require_flash_attn
+    @require_torch_accelerator
+    def test_flash_attention_generation(self):
+        """Test generation with Flash Attention 2."""
+        model = GlmImageForConditionalGeneration.from_pretrained(
+            self.model_id,
+            subfolder=self.model_subfolder,
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            device_map="auto",
+        )
+        inputs = self.processor.apply_chat_template(
+            self.t2i_message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+        ).to(torch_device)
+
+        # Generate image tokens
+        output = model.generate(**inputs, max_new_tokens=5)
+
+        # Output should be longer than input
+        self.assertGreater(output.shape[1], inputs["input_ids"].shape[1])
--- a/tests/models/glm_image/test_processor_glm_image.py
+++ b/tests/models/glm_image/test_processor_glm_image.py
@@ -0,0 +1,195 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from PIL import Image
+
+from transformers.testing_utils import require_av, require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from transformers import AutoImageProcessor, AutoTokenizer, GlmImageProcessor
+
+if is_torch_available():
+    import torch
+
+
+@require_vision
+@require_torch
+class GlmImageProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = GlmImageProcessor
+    model_id = "zai-org/GLM-Image"
+
+    @classmethod
+    def _setup_test_attributes(cls, processor):
+        cls.image_token = processor.image_token
+
+    @classmethod
+    def _setup_from_pretrained(cls, model_id, **kwargs):
+        return super()._setup_from_pretrained(
+            model_id,
+            subfolder="processor",
+            **kwargs,
+        )
+
+    @classmethod
+    def _setup_image_processor(cls):
+        # Provide a tiny image-processor config so placeholder expansion stays small
+        return AutoImageProcessor.from_pretrained(
+            cls.model_id,
+            subfolder="processor",
+            do_resize=True,
+            patch_size=4,
+            min_pixels=12 * 12,
+            max_pixels=18 * 18,
+        )
+
+    @classmethod
+    def _setup_tokenizer(cls):
+        # Ensure tokenizer is loaded from the correct subfolder when using custom components
+        return AutoTokenizer.from_pretrained(cls.model_id, subfolder="processor")
+
+    def prepare_image_inputs(self, batch_size: int | None = None, nested: bool = False):
+        """Override to create images with valid aspect ratio (< 4) for GLM-Image."""
+        # GLM-Image requires aspect ratio < 4, so use near-square images
+        image_inputs = [Image.fromarray(np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8))]
+        if batch_size is None:
+            return image_inputs
+        if nested:
+            return [image_inputs] * batch_size
+        return image_inputs * batch_size
+
+    @require_torch
+    @require_av
+    def _test_apply_chat_template(
+        self,
+        modality: str,
+        batch_size: int,
+        return_tensors: str,
+        input_name: str,
+        processor_name: str,
+        input_data: list[str],
+    ):
+        # Skip image modality tests for GLM-Image because the processor expands image tokens
+        # based on image size, making the tokenized output differ from direct tokenizer call
+        if modality == "image":
+            self.skipTest(
+                "GLM-Image processor expands image tokens based on image size, "
+                "making tokenized output differ from direct tokenizer call"
+            )
+
+        processor = self.get_processor()
+        if processor.chat_template is None:
+            self.skipTest("Processor has no chat template")
+
+        if processor_name not in self.processor_class.get_attributes():
+            self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")
+
+        batch_messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "Describe this."}],
+                },
+            ]
+        ] * batch_size
+
+        # Test that jinja can be applied
+        formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
+        self.assertEqual(len(formatted_prompt), batch_size)
+
+        # Test that tokenizing with template and directly with `self.tokenizer` gives same output
+        formatted_prompt_tokenized = processor.apply_chat_template(
+            batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
+        )
+        add_special_tokens = True
+        if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
+            add_special_tokens = False
+        tok_output = processor.tokenizer(
+            formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
+        )
+        expected_output = tok_output.input_ids
+        self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
+
+        # Test that kwargs passed to processor's `__call__` are actually used
+        tokenized_prompt_100 = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            padding="max_length",
+            truncation=True,
+            return_tensors=return_tensors,
+            max_length=100,
+        )
+        self.assertEqual(len(tokenized_prompt_100[0]), 100)
+
+        # Test that `return_dict=True` returns text related inputs in the dict
+        out_dict_text = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors=return_tensors,
+        )
+        self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
+        self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
+
+        # Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
+        for idx, url in enumerate(input_data[:batch_size]):
+            batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]
+
+        out_dict = processor.apply_chat_template(
+            batch_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors=return_tensors,
+            fps=2
+            if isinstance(input_data[0], str)
+            else None,  # by default no more than 2 frames per second, otherwise too slow
+        )
+        input_name = getattr(self, input_name)
+        self.assertTrue(input_name in out_dict)
+        self.assertEqual(len(out_dict["input_ids"]), batch_size)
+        self.assertEqual(len(out_dict["attention_mask"]), batch_size)
+
+        mm_len = batch_size * 4
+        self.assertEqual(len(out_dict[input_name]), mm_len)
+
+        return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
+        for k in out_dict:
+            self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors])
+
+    def test_model_input_names(self):
+        processor = self.get_processor()
+
+        text = self.prepare_text_inputs(modalities=["image"])
+        image_input = self.prepare_image_inputs()
+        inputs_dict = {"text": text, "images": image_input}
+        inputs = processor(**inputs_dict, return_tensors="pt")
+
+        self.assertSetEqual(set(inputs.keys()), set(processor.model_input_names))
+
+    @unittest.skip(
+        "GlmImageProcessor injects additional special/control tokens around plain text inputs, so "
+        "`processor(text=X)` is not equivalent to `tokenizer(X)` for this model."
+    )
+    def test_tokenizer_defaults(self):
+        pass