# Copyright 2026 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Testing suite for the PyTorch LightOnOcr model.""" import copy import unittest from difflib import SequenceMatcher from transformers import ( LightOnOcrConfig, LightOnOcrForConditionalGeneration, LightOnOcrModel, LightOnOcrProcessor, is_torch_available, is_vision_available, ) from transformers.testing_utils import ( cleanup, require_torch, slow, torch_device, ) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor if is_torch_available(): import torch if is_vision_available(): from transformers.image_utils import load_image class LightOnOcrVisionText2TextModelTester: def __init__( self, parent, image_token_index=10, spatial_merge_size=2, seq_length=7, text_config={ "model_type": "qwen3", "seq_length": 7, "is_training": True, "use_input_mask": True, "use_token_type_ids": False, "use_labels": True, "vocab_size": 99, "hidden_size": 32, "num_hidden_layers": 2, "num_attention_heads": 4, "num_key_value_heads": 2, "intermediate_size": 37, "hidden_act": "silu", "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 16, "type_sequence_label_size": 2, "initializer_range": 0.02, "num_labels": 3, "num_choices": 4, "pad_token_id": 1, "bos_token_id": 0, "eos_token_id": 2, "rms_norm_eps": 1e-6, "rope_theta": 10000.0, "attention_bias": False, "attention_dropout": 0.0, "head_dim": 8, }, is_training=True, vision_config={ "image_size": 112, "patch_size": 14, "num_channels": 3, "is_training": True, "hidden_size": 32, "num_hidden_layers": 2, "num_attention_heads": 4, "intermediate_size": 37, "attention_dropout": 0.0, "hidden_act": "silu", "initializer_range": 0.02, "rope_theta": 10000.0, }, ): self.parent = parent self.image_token_index = image_token_index self.spatial_merge_size = spatial_merge_size self.text_config = text_config self.vision_config = vision_config self.pad_token_id = text_config["pad_token_id"] self.num_hidden_layers = text_config["num_hidden_layers"] self.vocab_size = text_config["vocab_size"] self.hidden_size = text_config["hidden_size"] self.num_attention_heads = text_config["num_attention_heads"] self.is_training = is_training self.batch_size = 3 self.num_channels = 3 # Image size must be divisible by patch_size self.image_size = vision_config["image_size"] self.patch_size = vision_config["patch_size"] # Number of patches after patch conv num_patches = (self.image_size // self.patch_size) ** 2 # After spatial merging, number of tokens is reduced by spatial_merge_size**2 self.num_image_tokens = num_patches // (self.spatial_merge_size**2) self.seq_length = seq_length + self.num_image_tokens self.encoder_seq_length = self.seq_length def get_config(self): return LightOnOcrConfig( text_config=self.text_config, vision_config=self.vision_config, image_token_id=self.image_token_index, spatial_merge_size=self.spatial_merge_size, ) def prepare_config_and_inputs(self): pixel_values = floats_tensor( [ self.batch_size, self.vision_config["num_channels"], self.vision_config["image_size"], self.vision_config["image_size"], ] ) config = self.get_config() return config, pixel_values def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 # Avoid placing image tokens on positions that would be the pad token input_ids[input_ids == config.image_token_id] = self.pad_token_id # Place image tokens at the beginning input_ids[:, : self.num_image_tokens] = config.image_token_id attention_mask = input_ids.ne(self.pad_token_id) # Create image_sizes as tensor - must match batch size image_sizes = torch.tensor([[self.image_size, self.image_size]] * self.batch_size, dtype=torch.long) inputs_dict = { "pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask, "image_sizes": image_sizes, } return config, inputs_dict def prepare_config_and_inputs_for_generate(self, batch_size=None): """Prepare config and inputs for generation tests.""" if batch_size is None: batch_size = self.batch_size # Get base config config = self.get_config() # Create pixel_values with the specified batch size pixel_values = floats_tensor( [ batch_size, self.vision_config["num_channels"], self.vision_config["image_size"], self.vision_config["image_size"], ] ) # Create input_ids input_ids = ids_tensor([batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 # Avoid placing image tokens on positions that would be the pad token input_ids[input_ids == config.image_token_id] = self.pad_token_id # Place image tokens at the beginning input_ids[:, : self.num_image_tokens] = config.image_token_id attention_mask = input_ids.ne(self.pad_token_id) # Create image_sizes as tensor - must match batch size image_sizes = torch.tensor([[self.image_size, self.image_size]] * batch_size, dtype=torch.long) inputs_dict = { "pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask, "image_sizes": image_sizes, } return config, inputs_dict @require_torch class LightOnOcrForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): """ Model tester for `LightOnOcrForConditionalGeneration`. """ all_model_classes = ( ( LightOnOcrModel, LightOnOcrForConditionalGeneration, ) if is_torch_available() else () ) pipeline_model_mapping = {"image-text-to-text": LightOnOcrForConditionalGeneration} if is_torch_available() else {} # LightOnOcr uses a PixtralVisionModel, which merges batch_size and num_patches in index 1, with index 0 hardcoded to 1 skip_test_image_features_output_shape = True _is_composite = True test_torch_exportable = False def setUp(self): self.model_tester = LightOnOcrVisionText2TextModelTester(self) common_properties = ["image_token_id", "spatial_merge_size"] self.config_tester = ConfigTester( self, config_class=LightOnOcrConfig, has_text_modality=False, common_properties=common_properties ) def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): """ Prepare inputs for the model class, ensuring image_sizes matches the batch size. """ inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) # Ensure image_sizes matches the batch size of pixel_values or input_ids if "pixel_values" in inputs_dict and "image_sizes" in inputs_dict: batch_size = inputs_dict["pixel_values"].shape[0] # If image_sizes doesn't match batch size, adjust it if len(inputs_dict["image_sizes"]) != batch_size: inputs_dict["image_sizes"] = inputs_dict["image_sizes"][:batch_size] return inputs_dict def prepare_config_and_inputs_for_generate(self, batch_size=1): """Override to use the model_tester's custom method.""" return self.model_tester.prepare_config_and_inputs_for_generate(batch_size=batch_size) def test_config(self): self.config_tester.run_common_tests() def test_mismatching_num_image_tokens(self): """ Tests that VLMs throw an error with explicit message saying what is wrong when number of images doesn't match number of image tokens in the text. Also we need to test multi-image cases when one prompt has multiple image tokens. """ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config).to(torch_device) model.eval() curr_input_dict = copy.deepcopy(input_dict) # in-place modifications further _ = model(**curr_input_dict) # successful forward with no modifications # remove one image but leave the image token in text curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...] curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:] with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"): _ = model(**curr_input_dict) # simulate multi-image case by concatenating inputs where each has exactly one image/image-token input_ids = curr_input_dict["input_ids"][:1] pixel_values = curr_input_dict["pixel_values"][:1] image_sizes = curr_input_dict["image_sizes"][:1] input_ids = torch.cat([input_ids, input_ids], dim=0) # one image and two image tokens raise an error with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"): _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) # two images and two image tokens don't raise an error pixel_values = torch.cat([pixel_values, pixel_values], dim=0) image_sizes = torch.cat([image_sizes, image_sizes], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) def test_spatial_merge_size(self): """ Test that models can be created and initialized with different spatial_merge_size values. """ config, _ = self.model_tester.prepare_config_and_inputs_for_common() # Test that model can be created with different spatial_merge_size values for spatial_merge_size in [1, 2, 4]: curr_config = copy.deepcopy(config) curr_config.spatial_merge_size = spatial_merge_size for model_class in self.all_model_classes: # Build model with the new config - should not raise any errors model = model_class(curr_config).to(torch_device) model.eval() # Verify the spatial_merge_size is set correctly self.assertEqual(model.config.spatial_merge_size, spatial_merge_size) # Verify the model has the expected components if hasattr(model, "model"): self.assertTrue(hasattr(model.model, "vision_projection")) self.assertEqual(model.model.vision_projection.config.spatial_merge_size, spatial_merge_size) elif hasattr(model, "vision_projection"): self.assertEqual(model.vision_projection.config.spatial_merge_size, spatial_merge_size) def test_forward_pass_with_image_sizes(self): """ Test that the model correctly handles variable image sizes. """ config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config).to(torch_device) model.eval() # Test with different image sizes in the same batch batch_size = 2 pixel_values = floats_tensor( [batch_size, 3, self.model_tester.image_size, self.model_tester.image_size] ).to(torch_device) # Different image sizes (but still need to be divisible by patch_size) image_sizes = torch.tensor( [[self.model_tester.image_size, self.model_tester.image_size]] * batch_size, dtype=torch.long, device=torch_device, ) num_patches = (self.model_tester.image_size // self.model_tester.patch_size) ** 2 num_image_tokens = num_patches // (config.spatial_merge_size**2) input_ids = ids_tensor([batch_size, 10 + num_image_tokens], config.text_config.vocab_size - 1) + 1 # Ensure no tokens accidentally equal image_token_id input_ids[input_ids == config.image_token_id] = config.image_token_id + 1 # Now place image tokens at the beginning input_ids[:, :num_image_tokens] = config.image_token_id input_ids = input_ids.to(torch_device) outputs = model( pixel_values=pixel_values, input_ids=input_ids, image_sizes=image_sizes, ) self.assertIsNotNone(outputs) def test_model_outputs_equivalence(self): """ Test that model outputs are consistent across different input configurations. """ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config).to(torch_device) model.eval() with torch.no_grad(): outputs1 = model(**input_dict) outputs2 = model(**input_dict) # Check that outputs are deterministic if hasattr(outputs1, "last_hidden_state") and hasattr(outputs2, "last_hidden_state"): self.assertTrue(torch.allclose(outputs1.last_hidden_state, outputs2.last_hidden_state, atol=1e-5)) def test_vision_projection(self): """ Test that the vision projection correctly transforms vision embeddings to text space. """ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() model = LightOnOcrModel(config).to(torch_device) model.eval() # Convert image_sizes to list for vision_encoder if isinstance(input_dict["image_sizes"], torch.Tensor): image_sizes_list = [(int(h), int(w)) for h, w in input_dict["image_sizes"]] else: image_sizes_list = input_dict["image_sizes"] with torch.no_grad(): # Get vision features vision_outputs = model.vision_encoder( pixel_values=input_dict["pixel_values"].to(torch_device), image_sizes=image_sizes_list, ) # Project vision features projected = model.vision_projection( vision_outputs.last_hidden_state.squeeze(0), image_sizes_list, ) # Check output dimensions - should match text hidden size self.assertEqual(projected.shape[-1], config.text_config.hidden_size) def test_get_image_features(self): """ Test the get_image_features method. """ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() model = LightOnOcrModel(config).to(torch_device) model.eval() with torch.no_grad(): image_features_list = model.get_image_features( pixel_values=input_dict["pixel_values"].to(torch_device), image_sizes=input_dict["image_sizes"], ).pooler_output # Check that features are returned as a list self.assertIsNotNone(image_features_list) self.assertIsInstance(image_features_list, (list, tuple)) # Concatenate features and check shape image_features = torch.cat(image_features_list, dim=0) self.assertEqual(image_features.shape[-1], config.text_config.hidden_size) @slow @require_torch class LightOnOcrForConditionalGenerationIntegrationTest(unittest.TestCase): def tearDown(self): cleanup(torch_device, gc_collect=True) @slow def test_lightonocr_ocr_integration(self): """ Integration test for LightOnOcr OCR capabilities. Tests that the model can perform OCR on a real image and produce expected output. """ model_id = "lightonai/LightOnOCR-1B-1025" # Load processor and model from Hub processor = LightOnOcrProcessor.from_pretrained(model_id) model = LightOnOcrForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) model.eval() # Load a test OCR image # This is a standard OCR test image from HuggingFace fixtures image = load_image( "https://huggingface.co/datasets/hf-internal-testing/fixtures_ocr/resolve/main/SROIE-receipt.jpeg" ) # Process image and prepare inputs # Using chat template as shown in the model's usage pattern chat = [ { "role": "user", "content": [ {"type": "image", "url": image}, ], } ] inputs = processor.apply_chat_template( chat, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt" ).to(torch_device, dtype=torch.bfloat16) # Generate OCR output with torch.no_grad(): generated_ids = model.generate( **inputs, max_new_tokens=50, do_sample=False, num_beams=1, ) # Decode output, excluding the input prompt decoded_output = processor.decode(generated_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True) expected_output = "Document No : TD01167104\n\nDate : 25/12/2018 8:13:39 PM\n\nCashier : MANIS\n\nMember :\n\nCASH BILL\n\n| CODE" similarity = SequenceMatcher(None, decoded_output, expected_output).ratio() # Require at least 95% similarity to catch regressions while allowing minor variations self.assertGreater( similarity, 0.95, f"Model output differs too much from expected output (similarity: {similarity:.2%}).\n" f"Expected:\n{expected_output}\n\nGot:\n{decoded_output}", ) def test_model_can_generate_without_images(self): """ Test that the model can generate text without image inputs. """ # Create a small config for fast testing text_config = { "vocab_size": 100, "hidden_size": 64, "num_hidden_layers": 2, "num_attention_heads": 4, "num_key_value_heads": 2, "intermediate_size": 128, "max_position_embeddings": 512, "rms_norm_eps": 1e-6, "head_dim": 16, } vision_config = { "hidden_size": 64, "num_hidden_layers": 2, "num_attention_heads": 4, "intermediate_size": 128, "image_size": 112, "patch_size": 14, } config = LightOnOcrConfig(text_config=text_config, vision_config=vision_config, image_token_id=10) model = LightOnOcrForConditionalGeneration(config).to(torch_device) model.eval() # Create text-only input input_ids = torch.randint(0, config.text_config.vocab_size - 1, (1, 10), device=torch_device) + 1 with torch.no_grad(): outputs = model.generate(input_ids=input_ids, max_new_tokens=5) self.assertIsNotNone(outputs) self.assertEqual(outputs.shape[0], 1) self.assertGreater(outputs.shape[1], input_ids.shape[1]) def test_model_forward_with_images(self): """ Test forward pass with image inputs. """ text_config = { "vocab_size": 100, "hidden_size": 64, "num_hidden_layers": 2, "num_attention_heads": 4, "num_key_value_heads": 2, "intermediate_size": 128, "max_position_embeddings": 512, "rms_norm_eps": 1e-6, "head_dim": 16, } vision_config = { "hidden_size": 64, "num_hidden_layers": 2, "num_attention_heads": 4, "intermediate_size": 128, "image_size": 112, "patch_size": 14, } config = LightOnOcrConfig(text_config=text_config, vision_config=vision_config, image_token_id=10) model = LightOnOcrForConditionalGeneration(config).to(torch_device) model.eval() # Create inputs batch_size = 2 image_size = 112 pixel_values = torch.randn(batch_size, 3, image_size, image_size, device=torch_device) image_sizes = torch.tensor([[image_size, image_size]] * batch_size, dtype=torch.long, device=torch_device) # Calculate number of image tokens num_patches = (image_size // 14) ** 2 # patch_size = 14 num_image_tokens = num_patches // (config.spatial_merge_size**2) seq_len = num_image_tokens + 10 input_ids = torch.randint(0, config.text_config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1 # Ensure no tokens accidentally equal image_token_id input_ids[input_ids == config.image_token_id] = config.image_token_id + 1 # Now place image tokens at the beginning input_ids[:, :num_image_tokens] = config.image_token_id with torch.no_grad(): outputs = model( pixel_values=pixel_values, input_ids=input_ids, image_sizes=image_sizes, ) self.assertIsNotNone(outputs) self.assertIsNotNone(outputs.logits) self.assertEqual(outputs.logits.shape[0], batch_size) self.assertEqual(outputs.logits.shape[1], seq_len) self.assertEqual(outputs.logits.shape[2], config.text_config.vocab_size)