first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/lighton_ocr/test_modeling_lighton_ocr.py
+++ b/tests/models/lighton_ocr/test_modeling_lighton_ocr.py
@@ -0,0 +1,598 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch LightOnOcr model."""
+
+import copy
+import unittest
+from difflib import SequenceMatcher
+
+from transformers import (
+    LightOnOcrConfig,
+    LightOnOcrForConditionalGeneration,
+    LightOnOcrModel,
+    LightOnOcrProcessor,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import (
+    cleanup,
+    require_torch,
+    slow,
+    torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+
+if is_vision_available():
+    from transformers.image_utils import load_image
+
+
+class LightOnOcrVisionText2TextModelTester:
+    def __init__(
+        self,
+        parent,
+        image_token_index=10,
+        spatial_merge_size=2,
+        seq_length=7,
+        text_config={
+            "model_type": "qwen3",
+            "seq_length": 7,
+            "is_training": True,
+            "use_input_mask": True,
+            "use_token_type_ids": False,
+            "use_labels": True,
+            "vocab_size": 99,
+            "hidden_size": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 2,
+            "intermediate_size": 37,
+            "hidden_act": "silu",
+            "hidden_dropout_prob": 0.1,
+            "attention_probs_dropout_prob": 0.1,
+            "max_position_embeddings": 512,
+            "type_vocab_size": 16,
+            "type_sequence_label_size": 2,
+            "initializer_range": 0.02,
+            "num_labels": 3,
+            "num_choices": 4,
+            "pad_token_id": 1,
+            "bos_token_id": 0,
+            "eos_token_id": 2,
+            "rms_norm_eps": 1e-6,
+            "rope_theta": 10000.0,
+            "attention_bias": False,
+            "attention_dropout": 0.0,
+            "head_dim": 8,
+        },
+        is_training=True,
+        vision_config={
+            "image_size": 112,
+            "patch_size": 14,
+            "num_channels": 3,
+            "is_training": True,
+            "hidden_size": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 37,
+            "attention_dropout": 0.0,
+            "hidden_act": "silu",
+            "initializer_range": 0.02,
+            "rope_theta": 10000.0,
+        },
+    ):
+        self.parent = parent
+        self.image_token_index = image_token_index
+        self.spatial_merge_size = spatial_merge_size
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.pad_token_id = text_config["pad_token_id"]
+
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.is_training = is_training
+
+        self.batch_size = 3
+        self.num_channels = 3
+        # Image size must be divisible by patch_size
+        self.image_size = vision_config["image_size"]
+        self.patch_size = vision_config["patch_size"]
+        # Number of patches after patch conv
+        num_patches = (self.image_size // self.patch_size) ** 2
+        # After spatial merging, number of tokens is reduced by spatial_merge_size**2
+        self.num_image_tokens = num_patches // (self.spatial_merge_size**2)
+        self.seq_length = seq_length + self.num_image_tokens
+        self.encoder_seq_length = self.seq_length
+
+    def get_config(self):
+        return LightOnOcrConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+            image_token_id=self.image_token_index,
+            spatial_merge_size=self.spatial_merge_size,
+        )
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.vision_config["num_channels"],
+                self.vision_config["image_size"],
+                self.vision_config["image_size"],
+            ]
+        )
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
+
+        # Avoid placing image tokens on positions that would be the pad token
+        input_ids[input_ids == config.image_token_id] = self.pad_token_id
+
+        # Place image tokens at the beginning
+        input_ids[:, : self.num_image_tokens] = config.image_token_id
+
+        attention_mask = input_ids.ne(self.pad_token_id)
+
+        # Create image_sizes as tensor - must match batch size
+        image_sizes = torch.tensor([[self.image_size, self.image_size]] * self.batch_size, dtype=torch.long)
+
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "image_sizes": image_sizes,
+        }
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_generate(self, batch_size=None):
+        """Prepare config and inputs for generation tests."""
+        if batch_size is None:
+            batch_size = self.batch_size
+
+        # Get base config
+        config = self.get_config()
+
+        # Create pixel_values with the specified batch size
+        pixel_values = floats_tensor(
+            [
+                batch_size,
+                self.vision_config["num_channels"],
+                self.vision_config["image_size"],
+                self.vision_config["image_size"],
+            ]
+        )
+
+        # Create input_ids
+        input_ids = ids_tensor([batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
+
+        # Avoid placing image tokens on positions that would be the pad token
+        input_ids[input_ids == config.image_token_id] = self.pad_token_id
+
+        # Place image tokens at the beginning
+        input_ids[:, : self.num_image_tokens] = config.image_token_id
+
+        attention_mask = input_ids.ne(self.pad_token_id)
+
+        # Create image_sizes as tensor - must match batch size
+        image_sizes = torch.tensor([[self.image_size, self.image_size]] * batch_size, dtype=torch.long)
+
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "image_sizes": image_sizes,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class LightOnOcrForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    """
+    Model tester for `LightOnOcrForConditionalGeneration`.
+    """
+
+    all_model_classes = (
+        (
+            LightOnOcrModel,
+            LightOnOcrForConditionalGeneration,
+        )
+        if is_torch_available()
+        else ()
+    )
+    pipeline_model_mapping = {"image-text-to-text": LightOnOcrForConditionalGeneration} if is_torch_available() else {}
+    # LightOnOcr uses a PixtralVisionModel, which merges batch_size and num_patches in index 1, with index 0 hardcoded to 1
+    skip_test_image_features_output_shape = True
+
+    _is_composite = True
+    test_torch_exportable = False
+
+    def setUp(self):
+        self.model_tester = LightOnOcrVisionText2TextModelTester(self)
+        common_properties = ["image_token_id", "spatial_merge_size"]
+        self.config_tester = ConfigTester(
+            self, config_class=LightOnOcrConfig, has_text_modality=False, common_properties=common_properties
+        )
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        """
+        Prepare inputs for the model class, ensuring image_sizes matches the batch size.
+        """
+        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+
+        # Ensure image_sizes matches the batch size of pixel_values or input_ids
+        if "pixel_values" in inputs_dict and "image_sizes" in inputs_dict:
+            batch_size = inputs_dict["pixel_values"].shape[0]
+            # If image_sizes doesn't match batch size, adjust it
+            if len(inputs_dict["image_sizes"]) != batch_size:
+                inputs_dict["image_sizes"] = inputs_dict["image_sizes"][:batch_size]
+
+        return inputs_dict
+
+    def prepare_config_and_inputs_for_generate(self, batch_size=1):
+        """Override to use the model_tester's custom method."""
+        return self.model_tester.prepare_config_and_inputs_for_generate(batch_size=batch_size)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_mismatching_num_image_tokens(self):
+        """
+        Tests that VLMs throw an error with explicit message saying what is wrong
+        when number of images doesn't match number of image tokens in the text.
+        Also we need to test multi-image cases when one prompt has multiple image tokens.
+        """
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            model.eval()
+            curr_input_dict = copy.deepcopy(input_dict)  # in-place modifications further
+            _ = model(**curr_input_dict)  # successful forward with no modifications
+
+            # remove one image but leave the image token in text
+            curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
+            curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:]
+            with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"):
+                _ = model(**curr_input_dict)
+
+            # simulate multi-image case by concatenating inputs where each has exactly one image/image-token
+            input_ids = curr_input_dict["input_ids"][:1]
+            pixel_values = curr_input_dict["pixel_values"][:1]
+            image_sizes = curr_input_dict["image_sizes"][:1]
+            input_ids = torch.cat([input_ids, input_ids], dim=0)
+
+            # one image and two image tokens raise an error
+            with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"):
+                _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
+
+            # two images and two image tokens don't raise an error
+            pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
+            image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
+            _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
+
+    def test_spatial_merge_size(self):
+        """
+        Test that models can be created and initialized with different spatial_merge_size values.
+        """
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Test that model can be created with different spatial_merge_size values
+        for spatial_merge_size in [1, 2, 4]:
+            curr_config = copy.deepcopy(config)
+            curr_config.spatial_merge_size = spatial_merge_size
+
+            for model_class in self.all_model_classes:
+                # Build model with the new config - should not raise any errors
+                model = model_class(curr_config).to(torch_device)
+                model.eval()
+
+                # Verify the spatial_merge_size is set correctly
+                self.assertEqual(model.config.spatial_merge_size, spatial_merge_size)
+
+                # Verify the model has the expected components
+                if hasattr(model, "model"):
+                    self.assertTrue(hasattr(model.model, "vision_projection"))
+                    self.assertEqual(model.model.vision_projection.config.spatial_merge_size, spatial_merge_size)
+                elif hasattr(model, "vision_projection"):
+                    self.assertEqual(model.vision_projection.config.spatial_merge_size, spatial_merge_size)
+
+    def test_forward_pass_with_image_sizes(self):
+        """
+        Test that the model correctly handles variable image sizes.
+        """
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            model.eval()
+
+            # Test with different image sizes in the same batch
+            batch_size = 2
+            pixel_values = floats_tensor(
+                [batch_size, 3, self.model_tester.image_size, self.model_tester.image_size]
+            ).to(torch_device)
+
+            # Different image sizes (but still need to be divisible by patch_size)
+            image_sizes = torch.tensor(
+                [[self.model_tester.image_size, self.model_tester.image_size]] * batch_size,
+                dtype=torch.long,
+                device=torch_device,
+            )
+
+            num_patches = (self.model_tester.image_size // self.model_tester.patch_size) ** 2
+            num_image_tokens = num_patches // (config.spatial_merge_size**2)
+
+            input_ids = ids_tensor([batch_size, 10 + num_image_tokens], config.text_config.vocab_size - 1) + 1
+            # Ensure no tokens accidentally equal image_token_id
+            input_ids[input_ids == config.image_token_id] = config.image_token_id + 1
+            # Now place image tokens at the beginning
+            input_ids[:, :num_image_tokens] = config.image_token_id
+            input_ids = input_ids.to(torch_device)
+
+            outputs = model(
+                pixel_values=pixel_values,
+                input_ids=input_ids,
+                image_sizes=image_sizes,
+            )
+
+            self.assertIsNotNone(outputs)
+
+    def test_model_outputs_equivalence(self):
+        """
+        Test that model outputs are consistent across different input configurations.
+        """
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs1 = model(**input_dict)
+                outputs2 = model(**input_dict)
+
+            # Check that outputs are deterministic
+            if hasattr(outputs1, "last_hidden_state") and hasattr(outputs2, "last_hidden_state"):
+                self.assertTrue(torch.allclose(outputs1.last_hidden_state, outputs2.last_hidden_state, atol=1e-5))
+
+    def test_vision_projection(self):
+        """
+        Test that the vision projection correctly transforms vision embeddings to text space.
+        """
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        model = LightOnOcrModel(config).to(torch_device)
+        model.eval()
+
+        # Convert image_sizes to list for vision_encoder
+        if isinstance(input_dict["image_sizes"], torch.Tensor):
+            image_sizes_list = [(int(h), int(w)) for h, w in input_dict["image_sizes"]]
+        else:
+            image_sizes_list = input_dict["image_sizes"]
+
+        with torch.no_grad():
+            # Get vision features
+            vision_outputs = model.vision_encoder(
+                pixel_values=input_dict["pixel_values"].to(torch_device),
+                image_sizes=image_sizes_list,
+            )
+
+            # Project vision features
+            projected = model.vision_projection(
+                vision_outputs.last_hidden_state.squeeze(0),
+                image_sizes_list,
+            )
+
+            # Check output dimensions - should match text hidden size
+            self.assertEqual(projected.shape[-1], config.text_config.hidden_size)
+
+    def test_get_image_features(self):
+        """
+        Test the get_image_features method.
+        """
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        model = LightOnOcrModel(config).to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            image_features_list = model.get_image_features(
+                pixel_values=input_dict["pixel_values"].to(torch_device),
+                image_sizes=input_dict["image_sizes"],
+            ).pooler_output
+
+            # Check that features are returned as a list
+            self.assertIsNotNone(image_features_list)
+            self.assertIsInstance(image_features_list, (list, tuple))
+
+            # Concatenate features and check shape
+            image_features = torch.cat(image_features_list, dim=0)
+            self.assertEqual(image_features.shape[-1], config.text_config.hidden_size)
+
+
+@slow
+@require_torch
+class LightOnOcrForConditionalGenerationIntegrationTest(unittest.TestCase):
+    def tearDown(self):
+        cleanup(torch_device, gc_collect=True)
+
+    @slow
+    def test_lightonocr_ocr_integration(self):
+        """
+        Integration test for LightOnOcr OCR capabilities.
+        Tests that the model can perform OCR on a real image and produce expected output.
+
+        """
+
+        model_id = "lightonai/LightOnOCR-1B-1025"
+
+        # Load processor and model from Hub
+        processor = LightOnOcrProcessor.from_pretrained(model_id)
+        model = LightOnOcrForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+        model.eval()
+
+        # Load a test OCR image
+        # This is a standard OCR test image from HuggingFace fixtures
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures_ocr/resolve/main/SROIE-receipt.jpeg"
+        )
+
+        # Process image and prepare inputs
+        # Using chat template as shown in the model's usage pattern
+        chat = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "url": image},
+                ],
+            }
+        ]
+
+        inputs = processor.apply_chat_template(
+            chat, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+        ).to(torch_device, dtype=torch.bfloat16)
+
+        # Generate OCR output
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=50,
+                do_sample=False,
+                num_beams=1,
+            )
+
+        # Decode output, excluding the input prompt
+        decoded_output = processor.decode(generated_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
+
+        expected_output = "Document No : TD01167104\n\nDate : 25/12/2018 8:13:39 PM\n\nCashier : MANIS\n\nMember :\n\nCASH BILL\n\n| CODE"
+
+        similarity = SequenceMatcher(None, decoded_output, expected_output).ratio()
+
+        # Require at least 95% similarity to catch regressions while allowing minor variations
+        self.assertGreater(
+            similarity,
+            0.95,
+            f"Model output differs too much from expected output (similarity: {similarity:.2%}).\n"
+            f"Expected:\n{expected_output}\n\nGot:\n{decoded_output}",
+        )
+
+    def test_model_can_generate_without_images(self):
+        """
+        Test that the model can generate text without image inputs.
+        """
+        # Create a small config for fast testing
+        text_config = {
+            "vocab_size": 100,
+            "hidden_size": 64,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 2,
+            "intermediate_size": 128,
+            "max_position_embeddings": 512,
+            "rms_norm_eps": 1e-6,
+            "head_dim": 16,
+        }
+        vision_config = {
+            "hidden_size": 64,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 128,
+            "image_size": 112,
+            "patch_size": 14,
+        }
+
+        config = LightOnOcrConfig(text_config=text_config, vision_config=vision_config, image_token_id=10)
+        model = LightOnOcrForConditionalGeneration(config).to(torch_device)
+        model.eval()
+
+        # Create text-only input
+        input_ids = torch.randint(0, config.text_config.vocab_size - 1, (1, 10), device=torch_device) + 1
+
+        with torch.no_grad():
+            outputs = model.generate(input_ids=input_ids, max_new_tokens=5)
+
+        self.assertIsNotNone(outputs)
+        self.assertEqual(outputs.shape[0], 1)
+        self.assertGreater(outputs.shape[1], input_ids.shape[1])
+
+    def test_model_forward_with_images(self):
+        """
+        Test forward pass with image inputs.
+        """
+        text_config = {
+            "vocab_size": 100,
+            "hidden_size": 64,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 2,
+            "intermediate_size": 128,
+            "max_position_embeddings": 512,
+            "rms_norm_eps": 1e-6,
+            "head_dim": 16,
+        }
+        vision_config = {
+            "hidden_size": 64,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 128,
+            "image_size": 112,
+            "patch_size": 14,
+        }
+
+        config = LightOnOcrConfig(text_config=text_config, vision_config=vision_config, image_token_id=10)
+        model = LightOnOcrForConditionalGeneration(config).to(torch_device)
+        model.eval()
+
+        # Create inputs
+        batch_size = 2
+        image_size = 112
+        pixel_values = torch.randn(batch_size, 3, image_size, image_size, device=torch_device)
+        image_sizes = torch.tensor([[image_size, image_size]] * batch_size, dtype=torch.long, device=torch_device)
+
+        # Calculate number of image tokens
+        num_patches = (image_size // 14) ** 2  # patch_size = 14
+        num_image_tokens = num_patches // (config.spatial_merge_size**2)
+
+        seq_len = num_image_tokens + 10
+        input_ids = torch.randint(0, config.text_config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1
+        # Ensure no tokens accidentally equal image_token_id
+        input_ids[input_ids == config.image_token_id] = config.image_token_id + 1
+        # Now place image tokens at the beginning
+        input_ids[:, :num_image_tokens] = config.image_token_id
+
+        with torch.no_grad():
+            outputs = model(
+                pixel_values=pixel_values,
+                input_ids=input_ids,
+                image_sizes=image_sizes,
+            )
+
+        self.assertIsNotNone(outputs)
+        self.assertIsNotNone(outputs.logits)
+        self.assertEqual(outputs.logits.shape[0], batch_size)
+        self.assertEqual(outputs.logits.shape[1], seq_len)
+        self.assertEqual(outputs.logits.shape[2], config.text_config.vocab_size)