# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch GLM-OCR model."""

import copy
import unittest

import pytest

from transformers import (
    AutoProcessor,
    GlmOcrConfig,
    GlmOcrForConditionalGeneration,
    GlmOcrModel,
    is_torch_available,
    logging,
)
from transformers.testing_utils import (
    CaptureLogger,
    Expectations,
    cleanup,
    require_deterministic_for_xpu,
    require_flash_attn,
    require_torch,
    require_torch_accelerator,
    require_torch_greater_or_equal,
    set_config_for_less_flaky_test,
    set_model_for_less_flaky_test,
    slow,
    torch_device,
)

from ...generation.test_utils import GenerationTesterMixin, assert_similar_generate_outputs
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
    ModelTesterMixin,
    floats_tensor,
    ids_tensor,
)


if is_torch_available():
    import torch

    from transformers.cache_utils import DynamicCache
    from transformers.generation import CompileConfig


class GlmOcrVisionText2TextModelTester:
    """Builds a tiny `GlmOcrConfig` and matching dummy image+text inputs for the common model tests."""

    def __init__(
        self,
        parent,
        batch_size=3,
        seq_length=7,
        num_channels=3,
        ignore_index=-100,
        image_size=112,
        video_start_token_id=3,
        video_end_token_id=4,
        image_start_token_id=5,
        image_end_token_id=6,
        image_token_id=7,
        video_token_id=8,
        is_training=True,
        text_config=None,
        vision_config=None,
    ):
        """
        Store the test hyper-parameters.

        `text_config` / `vision_config` default to `None` and are replaced by freshly built dicts, so that
        every tester instance owns its own (nested) dictionaries instead of sharing one mutable default
        object across all tests.
        """
        if text_config is None:
            text_config = {
                "vocab_size": 99,
                "hidden_size": 16,
                "intermediate_size": 22,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 1,
                "output_channels": 64,
                "head_dim": 8,
                "hidden_act": "silu",
                "max_position_embeddings": 512,
                "rope_parameters": {"type": "default", "rope_theta": 10000, "mrope_section": [2, 1, 1]},
                "tie_word_embeddings": True,
                "bos_token_id": 0,
                "eos_token_id": 0,
                "pad_token_id": 0,
            }
        if vision_config is None:
            vision_config = {
                "depth": 2,
                "hidden_act": "silu",
                "hidden_size": 48,
                "num_heads": 12,
                "out_hidden_size": 16,
                "intermediate_size": 22,
                "patch_size": 14,
                "spatial_merge_size": 1,
                "temporal_patch_size": 2,
            }

        self.parent = parent
        self.ignore_index = ignore_index
        self.bos_token_id = text_config["bos_token_id"]
        self.eos_token_id = text_config["eos_token_id"]
        self.pad_token_id = text_config["pad_token_id"]
        self.video_start_token_id = video_start_token_id
        self.video_end_token_id = video_end_token_id
        self.image_start_token_id = image_start_token_id
        self.image_end_token_id = image_end_token_id
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.text_config = text_config
        self.vision_config = vision_config
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.is_training = is_training
        self.hidden_size = text_config["hidden_size"]
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.vocab_size = text_config["vocab_size"]
        # 64 == (112 // 14) ** 2, i.e. the patch count of one image with the default `image_size` and
        # `patch_size`. NOTE(review): hard-coded, so non-default sizes presumably need this updated -- confirm.
        self.num_image_tokens = 64
        # The text sequence is extended so that it can hold all image placeholder tokens.
        self.seq_length = seq_length + self.num_image_tokens

    def get_config(self):
        """Return a `GlmOcrConfig` built from the stored sub-configs and special token ids."""
        return GlmOcrConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            image_token_id=self.image_token_id,
            video_token_id=self.video_token_id,
            video_start_token_id=self.video_start_token_id,
            video_end_token_id=self.video_end_token_id,
            image_start_token_id=self.image_start_token_id,
            image_end_token_id=self.image_end_token_id,
        )

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values)`, with `pixel_values` flattened over the batch and patch dimensions."""
        config = self.get_config()
        patch_size = config.vision_config.patch_size
        temporal_patch_size = config.vision_config.temporal_patch_size
        pixel_values = floats_tensor(
            [
                self.batch_size * (self.image_size**2) // (patch_size**2),
                self.num_channels * (patch_size**2) * temporal_patch_size,
            ]
        )

        return config, pixel_values

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` where every sequence contains exactly one image placeholder span."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

        # Remove any randomly sampled special multimodal token so that only the ones set below remain
        input_ids[input_ids == self.video_token_id] = self.pad_token_id
        input_ids[input_ids == self.image_token_id] = self.pad_token_id
        input_ids[input_ids == self.video_start_token_id] = self.pad_token_id
        input_ids[input_ids == self.image_start_token_id] = self.pad_token_id
        input_ids[input_ids == self.video_end_token_id] = self.pad_token_id
        input_ids[input_ids == self.image_end_token_id] = self.pad_token_id

        # Layout of each row: <image_start> <image_token> * num_image_tokens <image_end> <text ...>
        input_ids[:, 0] = self.image_start_token_id
        input_ids[:, 1 : 1 + self.num_image_tokens] = self.image_token_id
        input_ids[:, 1 + self.num_image_tokens] = self.image_end_token_id
        patch_size = config.vision_config.patch_size
        patches_per_side = self.image_size // patch_size
        # Flag the image placeholder positions with 1, everything else stays 0
        mm_token_type_ids = torch.zeros_like(input_ids)
        mm_token_type_ids[:, 1 : 1 + self.num_image_tokens] = 1

        inputs_dict = {
            "pixel_values": pixel_values,
            "image_grid_thw": torch.tensor(
                [[1, patches_per_side, patches_per_side]] * self.batch_size, device=torch_device
            ),
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "mm_token_type_ids": mm_token_type_ids,
        }
        return config, inputs_dict


@require_torch
class GlmOcrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """Common modeling and generation tests for the GLM-OCR model classes, run on a tiny random model."""

    all_model_classes = (GlmOcrModel, GlmOcrForConditionalGeneration) if is_torch_available() else ()

    model_split_percents = [0.7, 0.9]  # model too big to split at 0.5
    _is_composite = True

    def setUp(self):
        self.model_tester = GlmOcrVisionText2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GlmOcrConfig, has_text_modality=False)

    def test_config(self):
        self.config_tester.run_common_tests()

    # GLM-OCR pixel values are shaped as (bs*patch_len, dim) so we can't slice them to batches in generate
    def prepare_config_and_inputs_for_generate(self, batch_size=2):
        """
        Return `(config, inputs_dict)` for generation tests, truncated to `batch_size` samples.

        `pixel_values` are flattened over batch and patches, so they are sliced by number of patches instead
        of by batch index.
        """
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        # We don't want a few model inputs in our model input dictionary for generation tests
        input_keys_to_ignore = [
            # we don't want to mask attention heads
            # we don't want encoder-decoder models to start from filled decoder ids
            "decoder_input_ids",
            "decoder_attention_mask",
            # we'll set cache use in each test differently
            "use_cache",
            # Ignore labels if it is in the input dict
            "labels",
            # model-specific exceptions should overload/overwrite this function
        ]

        # The diff from the general `prepare_config_and_inputs_for_generate` lies here
        patch_size = config.vision_config.patch_size
        filtered_image_length = batch_size * (self.model_tester.image_size**2) // (patch_size**2)
        filtered_inputs_dict = {
            k: v[:batch_size, ...] if isinstance(v, torch.Tensor) else v
            for k, v in inputs_dict.items()
            if k not in input_keys_to_ignore
        }
        filtered_inputs_dict["pixel_values"] = inputs_dict["pixel_values"][:filtered_image_length]

        # It is important to set `eos_token_id` to `None` to avoid early stopping (would break for length-based checks)
        text_gen_config = config.get_text_config(decoder=True)
        if text_gen_config.eos_token_id is not None and text_gen_config.pad_token_id is None:
            text_gen_config.pad_token_id = (
                text_gen_config.eos_token_id
                if isinstance(text_gen_config.eos_token_id, int)
                else text_gen_config.eos_token_id[0]
            )
        text_gen_config.eos_token_id = None
        text_gen_config.forced_eos_token_id = None

        return config, filtered_inputs_dict

    def test_inputs_embeds(self):
        """Checks that a forward pass runs when `inputs_embeds` is given instead of `input_ids` and images."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))

            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["image_grid_thw"]

            wte = model.get_input_embeddings()
            inputs["inputs_embeds"] = wte(input_ids)

            with torch.no_grad():
                model(**inputs)[0]

    def test_inputs_embeds_matches_input_ids(self):
        """Checks that `input_ids` and the equivalent `inputs_embeds` produce the same first output."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]
            del inputs["image_grid_thw"]

            inputs_embeds = model.get_input_embeddings()(input_ids)

            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            torch.testing.assert_close(out_embeds, out_ids)

    @pytest.mark.generate
    @pytest.mark.torch_compile_test
    @require_torch_greater_or_equal("2.6")  # Uses torch.compiler.set_stance
    def test_generate_compile_model_forward_fullgraph(self):
        """
        Tests that `.generate` is compatible with torch.compile, keeping the same results. Also confirms that
        `.forward` called from `.generate` sees no graph breaks or recompilations when compiled.

        ⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! ⚠️
        """
        # GLM-OCR inputs cannot be split simply by batch size, therefore overridden
        for model_class in self.all_generative_model_classes:
            # 1. Test exclusion criteria
            if not model_class._can_compile_fullgraph:
                self.skipTest("This model doesn't support compilation without graph breaks")

            # 2. Prepares two sets of inputs
            config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=4)
            set_config_for_less_flaky_test(config)
            model = model_class(config).to(torch_device)
            set_model_for_less_flaky_test(model)
            model.eval()  # otherwise `self.training` is `True` -- this flag is used at attn mask creation time

            # Some composite models have a custom generate and will call an inner model's generate -> that inner model
            # is the one that gets compiled.
            # (Note for the future: if BLIP starts causing problems, let's stop testing it)
            if "blip" in model.__class__.__name__.lower():
                model_to_be_compiled = model.language_model
            else:
                model_to_be_compiled = model

            # creates two sets of *different* inputs with the same shape
            main_input = inputs_dict[model.main_input_name].to(torch_device)
            half_batch_size = main_input.shape[0] // 2
            patch_size = config.vision_config.patch_size
            half_image_length = half_batch_size * (self.model_tester.image_size**2) // (patch_size**2)
            input_1 = {}
            input_2 = {}
            for key, value in inputs_dict.items():
                if isinstance(value, torch.Tensor):
                    input_1[key] = value[:half_batch_size, :].to(torch_device)
                    input_2[key] = value[half_batch_size : half_batch_size * 2, :].to(torch_device)
                else:
                    input_1[key] = value
                    input_2[key] = value
            # `pixel_values` are flattened over batch and patches: overwrite the batch-index slices made above
            input_1["pixel_values"] = inputs_dict["pixel_values"][:half_image_length]
            input_2["pixel_values"] = inputs_dict["pixel_values"][half_image_length : half_image_length * 2]

            model_input_sets = [input_1, input_2]
            self.assertEqual(
                model_input_sets[0][model.main_input_name].shape, model_input_sets[1][model.main_input_name].shape
            )

            # 3. compilation-specific setup and generation parameterization
            torch.compiler.reset()  # prevent cached compilation from being used in the test
            has_defined_cache_implementation = model.generation_config.cache_implementation is not None
            compile_config = CompileConfig(fullgraph=True, dynamic=False)  # Error out on dynamic shapes
            compile_config._compile_all_devices = True  # force compilation (e.g. fast CI, CPU)

            generation_kwargs = {
                "use_cache": True,
                "do_sample": False,
                "max_new_tokens": 5,
                "return_dict_in_generate": True,
                "output_scores": True,
                "compile_config": compile_config,
            }

            # 4. get eager + dynamic cache results for future comparison
            dynamic_outputs = []
            # Ignores all `torch.compile` usage, useful to test models that have non-default compilable caches
            # (who would have used compilation in this section)
            with torch.compiler.set_stance("force_eager"):
                for model_inputs in model_input_sets:
                    gen_out = model.generate(**model_inputs, **generation_kwargs)
                    dynamic_outputs.append(gen_out)
                    # sanity checks for the default cache implementation
                    if not has_defined_cache_implementation:
                        decoder_cache = (
                            gen_out.past_key_values.self_attention_cache
                            if config.is_encoder_decoder
                            else gen_out.past_key_values
                        )
                        self.assertIsInstance(decoder_cache, DynamicCache)
                        self.assertFalse(decoder_cache.is_compileable)
                        # our auto compile should NOT have been called
                        self.assertFalse(hasattr(model_to_be_compiled, "_compiled_call"))

            # 5. get compiled results -- relies on the automatic compilation triggered by specific compilable caches
            if not has_defined_cache_implementation:
                generation_kwargs["cache_implementation"] = "static"

            compiled_outputs = []
            # Uses a context manager to catch recompilation logs. If there is any recompilation, this test fails.
            # Try/Finally is used to ensure that the log options are reset even if an error is raised.
            try:
                torch._logging.set_logs(recompiles_verbose=True)
                logger = logging.get_logger("torch._dynamo.guards")
                with CaptureLogger(logger) as cl:
                    for model_inputs in model_input_sets:
                        # with torch.compiler.set_stance("fail_on_recompile"):
                        gen_out = model.generate(**model_inputs, **generation_kwargs)
                        compiled_outputs.append(gen_out)
                        # sanity checks
                        decoder_cache = (
                            gen_out.past_key_values.self_attention_cache
                            if config.is_encoder_decoder
                            else gen_out.past_key_values
                        )
                        self.assertNotIsInstance(decoder_cache, DynamicCache)
                        self.assertTrue(decoder_cache.is_compileable)
                        # our auto compile should have been called
                        self.assertTrue(hasattr(model_to_be_compiled, "_compiled_call"))
            finally:
                torch._logging.set_logs()

            # Compilation of sliding layers necessarily has recompiles with `dynamic=False` - however this test
            # still checks that `fullgraph=True` is supported in this case, as compilation with `dynamic=None`
            # is the default and does not actually lead to too many recompiles
            has_sliding_layers = any(decoder_cache.is_sliding)
            has_recompilation = "Recompiling" in cl.out or ("guard" in cl.out and "failure" in cl.out)
            if not has_sliding_layers and has_recompilation:
                raise RuntimeError(
                    f"`torch.compile` recompiled part of the forward pass in {model.__class__.__name__}. "
                    "See the test logs for more details."
                )

            for dynamic_result, compiled_result in zip(dynamic_outputs, compiled_outputs):
                assert_similar_generate_outputs(dynamic_result, compiled_result)


@require_torch
class GlmOcrIntegrationTest(unittest.TestCase):
    """Slow integration tests that run generation with the `zai-org/GLM-OCR` checkpoint."""

    def setUp(self):
        cleanup(torch_device, gc_collect=True)

        self.processor = AutoProcessor.from_pretrained("zai-org/GLM-OCR")
        self.message = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
                    },
                    {"type": "text", "text": "What kind of dog is this?"},
                ],
            }
        ]
        self.message2 = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png",
                    },
                    {"type": "text", "text": "What kind of dog is this?"},
                ],
            }
        ]

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)

    @slow
    def test_small_model_integration_test(self):
        """Single image prompt: checks the processor outputs and the decoded generation."""
        model = GlmOcrForConditionalGeneration.from_pretrained("zai-org/GLM-OCR", dtype="auto", device_map="auto")

        inputs = self.processor.apply_chat_template(
            self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
        )
        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343]  # fmt: skip
        self.assertEqual(expected_input_ids, inputs.input_ids[0].tolist()[:17])

        expected_pixel_slice = torch.tensor(
            [
                [-0.0988, -0.0842, -0.0842],
                [-0.5660, -0.5514, -0.4200],
                [-0.0259, -0.0259, -0.0259],
                [-0.1280, -0.0988, -0.2010],
                [-0.4638, -0.5806, -0.6974],
                [-1.2083, -1.2229, -1.2083],
            ],
            dtype=torch.float32,
            device="cpu",
        )
        self.assertTrue(torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3))

        # verify generation
        inputs = inputs.to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        output = model.generate(**inputs, max_new_tokens=30)
        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically"
        self.assertEqual(
            self.processor.decode(output[0], skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_batch(self):
        """Batch made of the same image prompt repeated twice."""
        model = GlmOcrForConditionalGeneration.from_pretrained("zai-org/GLM-OCR", dtype="auto", device_map="auto")
        batch_messages = [self.message] * 2
        inputs = self.processor.apply_chat_template(
            batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
        ).to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture has a stocky body, thick fur, and a face that's"
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_with_video(self):
        """Single video prompt, using a processor loaded with a custom `max_image_size`."""
        processor = AutoProcessor.from_pretrained("zai-org/GLM-OCR", max_image_size={"longest_edge": 50176})
        model = GlmOcrForConditionalGeneration.from_pretrained(
            "zai-org/GLM-OCR", dtype=torch.float16, device_map="auto"
        )
        questions = ["Describe this video."]
        video_urls = ["https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"]
        messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "video",
                            "video": video_url,
                        },
                        {"type": "text", "text": question},
                    ],
                }
            ]
            for question, video_url in zip(questions, video_urls)
        ]
        inputs = processor.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
        ).to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        output = model.generate(**inputs, max_new_tokens=30)
        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\nGot it, let's analyze the video. First, the scene is an indoor tennis court. There are two players: one in a white shirt"]  # fmt: skip

        self.assertEqual(
            processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    @require_deterministic_for_xpu
    def test_small_model_integration_test_expand(self):
        """Single image prompt expanded to two returned sequences with beam search."""
        model = GlmOcrForConditionalGeneration.from_pretrained("zai-org/GLM-OCR", dtype="auto", device_map="auto")
        inputs = self.processor.apply_chat_template(
            self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
        ).to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)

        # fmt: off
        EXPECTED_DECODED_TEXTS = Expectations(
            {
                (None, None): ["\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
                               "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically"
                              ],
                ("xpu", None): ["\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
                                "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture is not a dog; it's a cat, specifically a Pallas"
                               ],
            }
        )
        # fmt: on
        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()

        decoded_text = self.processor.batch_decode(output, skip_special_tokens=True)
        self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)

    @slow
    def test_small_model_integration_test_batch_wo_image(self):
        """Padded batch mixing an image prompt with a text-only prompt."""
        model = GlmOcrForConditionalGeneration.from_pretrained("zai-org/GLM-OCR", dtype="auto", device_map="auto")
        message_wo_image = [
            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
        ]
        batched_messages = [self.message, message_wo_image]
        inputs = self.processor.apply_chat_template(
            batched_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        ).to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    def test_small_model_integration_test_batch_different_resolutions(self):
        """Padded batch of two prompts that reference two different images."""
        model = GlmOcrForConditionalGeneration.from_pretrained("zai-org/GLM-OCR", dtype="auto", device_map="auto")
        batched_messages = [self.message, self.message2]
        inputs = self.processor.apply_chat_template(
            batched_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        ).to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but",
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    @require_flash_attn
    @require_torch_accelerator
    def test_small_model_integration_test_batch_flashatt2(self):
        """Same two-image batch as above, run in bfloat16 with `flash_attention_2`."""
        model = GlmOcrForConditionalGeneration.from_pretrained(
            "zai-org/GLM-OCR",
            dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto",
        )
        batched_messages = [self.message, self.message2]
        inputs = self.processor.apply_chat_template(
            batched_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        ).to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        # it should not matter whether two images are the same size or not
        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog. Wait, it's a cat,",
            "\nWhat kind of dog is this?\nGot it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
        ]  # fmt: skip
        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )

    @slow
    @require_flash_attn
    @require_torch_accelerator
    def test_small_model_integration_test_batch_wo_image_flashatt2(self):
        """Image + text-only padded batch, run in bfloat16 with `flash_attention_2`."""
        model = GlmOcrForConditionalGeneration.from_pretrained(
            "zai-org/GLM-OCR",
            dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            device_map="auto",
        )
        message_wo_image = [
            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
        ]
        batched_messages = [self.message, message_wo_image]
        inputs = self.processor.apply_chat_template(
            batched_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            padding=True,
        ).to(torch_device)

        # This model on the hub has `do_sample=True`.
        torch.manual_seed(42)

        output = model.generate(**inputs, max_new_tokens=30)

        EXPECTED_DECODED_TEXT = [
            "\nWhat kind of dog is this?\nGot it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
            "\nWho are you?\nGot it, let's look at the user's question: \"Who are you?\" This is a common question when someone is just starting a conversation"
        ]  # fmt: skip

        self.assertEqual(
            self.processor.batch_decode(output, skip_special_tokens=True),
            EXPECTED_DECODED_TEXT,
        )