first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/deepseek_ocr2/init.py
+++ b/tests/models/deepseek_ocr2/init.py
--- a/tests/models/deepseek_ocr2/test_image_processing_deepseek_ocr2.py
+++ b/tests/models/deepseek_ocr2/test_image_processing_deepseek_ocr2.py
@@ -0,0 +1,216 @@
+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+
+
+if is_torch_available():
+    import torch
+
+
+class DeepseekOcr2ImageProcessingTester:
+    """Bundle of settings used to build DeepseekOcr2 image processors and dummy inputs in tests.
+
+    Every constructor argument is stored as a same-named attribute. The subset returned by
+    `prepare_image_processor_dict` is what gets passed to the image processor constructor.
+    """
+
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        image_size=18,
+        min_resolution=500,
+        max_resolution=800,
+        do_resize=True,
+        size=None,
+        tile_size=384,
+        do_normalize=True,
+        image_mean=None,
+        image_std=None,
+        do_convert_rgb=True,
+    ):
+        """Store the test configuration.
+
+        `size`, `image_mean` and `image_std` default to `None` and are resolved to fresh objects
+        here, so that no two tester instances share (and can accidentally mutate) the same
+        default dict or list.
+        """
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size if size is not None else {"height": 512, "width": 512}
+        self.tile_size = tile_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else [0.5, 0.5, 0.5]
+        self.image_std = image_std if image_std is not None else [0.5, 0.5, 0.5]
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_image_processor_dict(self):
+        """Return the keyword arguments used to instantiate the image processor under test."""
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "tile_size": self.tile_size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_image_shape(self, images):
+        """Return `(num_channels, height, width)` taken from the configured `size`.
+
+        `images` is accepted for interface compatibility but is not inspected.
+        """
+        return self.num_channels, self.size["height"], self.size["width"]
+
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        """Forward this tester's batch/channel/resolution settings to the shared `prepare_image_inputs` helper."""
+        return prepare_image_inputs(
+            batch_size=self.batch_size,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+
+
+@require_torch
+@require_vision
+class DeepseekOcr2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    """Image-processor tests for DeepseekOcr2, run against every entry of `self.image_processing_classes`."""
+
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = DeepseekOcr2ImageProcessingTester(self)
+
+    @property
+    def image_processor_dict(self):
+        """Keyword arguments used to instantiate each image processor under test."""
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def test_image_processor_properties(self):
+        """Each configured option must be exposed as an attribute of the processor."""
+        expected_attributes = (
+            "do_resize",
+            "size",
+            "tile_size",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "do_convert_rgb",
+        )
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            for attribute in expected_attributes:
+                # Name the missing attribute so a failure is actionable.
+                self.assertTrue(hasattr(image_processor, attribute), msg=f"missing attribute: {attribute}")
+
+    @unittest.skip(reason="Not supported")
+    def test_call_numpy_4_channels(self):
+        pass
+
+    def test_crop_to_patches(self):
+        """`crop_image_to_patches` must yield at least one `tile_size` x `tile_size` patch.
+
+        The "pil" backend is fed a numpy image and its result is treated as a sequence of
+        patches; every other backend is fed a batched float tensor and its result is unpacked
+        as `(stacked_patches, n_patches)`.
+        """
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            tile_size = self.image_processor_tester.tile_size
+            if backend_name == "pil":
+                image = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)[0]
+                processed_images = image_processor.crop_image_to_patches(
+                    image, min_patches=1, max_patches=6, tile_size=tile_size
+                )
+                self.assertGreater(len(processed_images), 0)
+                self.assertEqual(processed_images[0].shape[:2], (tile_size, tile_size))
+            else:
+                image = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)[0]
+                stacked_patches, n_patches = image_processor.crop_image_to_patches(
+                    image.unsqueeze(0).float(), min_patches=1, max_patches=6, tile_size=tile_size
+                )
+                self.assertGreater(n_patches, 0)
+                self.assertEqual(stacked_patches.shape[-2:], (tile_size, tile_size))
+
+    def test_preprocess_global_only(self):
+        """Test preprocessing without crop_to_patches (global view only)."""
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict, crop_to_patches=False)
+            images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=False)
+            result = image_processor(images, return_tensors="pt")
+            self.assertIn("pixel_values", result)
+            self.assertEqual(len(result["num_local_patches"]), len(images))
+            for n in result["num_local_patches"]:
+                self.assertEqual(n, 0)
+
+    def test_preprocess_with_crop_to_patches(self):
+        """Test preprocessing with crop_to_patches enabled."""
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict, crop_to_patches=True)
+            images = prepare_image_inputs(
+                batch_size=2, num_channels=3, min_resolution=500, max_resolution=700, equal_resolution=True
+            )
+            result = image_processor(images, return_tensors="pt")
+            self.assertIn("pixel_values", result)
+            # At least one image must have produced local patches ...
+            self.assertTrue(any(n > 0 for n in result["num_local_patches"]))
+            # ... in which case the local pixel values have to be part of the output.
+            self.assertIn("pixel_values_local", result)
+
+    def test_backends_equivalence(self):
+        """Override to also compare pixel_values_local and num_local_patches."""
+        if len(self.image_processing_classes) < 2:
+            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
+
+        dummy_image = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)[0]
+
+        encodings = {}
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            encodings[backend_name] = image_processor(dummy_image, return_tensors="pt")
+
+        # The first backend serves as the reference for all others.
+        reference_backend, *other_backends = encodings
+        ref_encoding = encodings[reference_backend]
+        ref_local = ref_encoding.get("pixel_values_local")
+        for backend_name in other_backends:
+            other_encoding = encodings[backend_name]
+            self._assert_tensors_equivalence(ref_encoding.pixel_values, other_encoding.pixel_values)
+            torch.testing.assert_close(ref_encoding.num_local_patches, other_encoding.num_local_patches)
+            other_local = other_encoding.get("pixel_values_local")
+            # Both backends must agree on whether local patches exist at all; otherwise a
+            # backend that drops (or invents) them would go unnoticed.
+            self.assertEqual(
+                ref_local is None,
+                other_local is None,
+                msg=f"`pixel_values_local` presence differs between {reference_backend} and {backend_name}",
+            )
+            if ref_local is not None:
+                self._assert_tensors_equivalence(ref_local, other_local)
+
+    def test_backends_equivalence_batched(self):
+        """Override to also compare pixel_values_local and num_local_patches (variable shape)."""
+        if len(self.image_processing_classes) < 2:
+            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
+
+        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+
+        encodings = {}
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            encodings[backend_name] = image_processor(dummy_images, return_tensors=None)
+
+        # The reference outputs are wrapped with `torch.from_numpy` below, so the reference is
+        # pinned to the "pil" backend (presumably the numpy-returning one) rather than taken
+        # from dict order.
+        reference_backend = "pil"
+        ref_encoding = encodings[reference_backend]
+
+        for backend_name in [b for b in encodings if b != reference_backend]:
+            other_encoding = encodings[backend_name]
+            # Global views
+            self.assertEqual(len(ref_encoding.pixel_values), len(other_encoding.pixel_values))
+            for ref_pixels, other_pixels in zip(ref_encoding.pixel_values, other_encoding.pixel_values):
+                self._assert_tensors_equivalence(torch.from_numpy(ref_pixels), other_pixels)
+            # num_local_patches
+            self.assertEqual(
+                list(ref_encoding["num_local_patches"]),
+                list(other_encoding["num_local_patches"]),
+            )
+            # Local patches: presence must match before the contents are compared, so that a
+            # backend missing them entirely does not pass silently.
+            ref_local = ref_encoding.get("pixel_values_local")
+            other_local = other_encoding.get("pixel_values_local")
+            self.assertEqual(
+                ref_local is None,
+                other_local is None,
+                msg=f"`pixel_values_local` presence differs between {reference_backend} and {backend_name}",
+            )
+            if ref_local is not None:
+                self.assertEqual(len(ref_local), len(other_local))
+                for ref_patch, other_patch in zip(ref_local, other_local):
+                    self._assert_tensors_equivalence(torch.from_numpy(ref_patch), other_patch)
--- a/tests/models/deepseek_ocr2/test_modeling_deepseek_ocr2.py
+++ b/tests/models/deepseek_ocr2/test_modeling_deepseek_ocr2.py
@@ -0,0 +1,241 @@
+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch DeepseekOcr2 model."""
+
+import unittest
+
+from transformers import (
+    AutoProcessor,
+    DeepseekOcr2Config,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import Expectations, cleanup, require_torch, slow, torch_device
+
+from ...test_processing_common import url_to_local_path
+from ...vlm_tester import VLMModelTest, VLMModelTester
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        DeepseekOcr2ForConditionalGeneration,
+        DeepseekOcr2Model,
+    )
+    from transformers.models.deepseek_ocr2.configuration_deepseek_ocr2 import (
+        DeepseekOcr2TextConfig,
+        DeepseekOcr2VisionConfig,
+    )
+
+if is_vision_available():
+    from transformers.image_utils import load_image
+
+
+class DeepseekOcr2VisionText2TextModelTester(VLMModelTester):
+    """Model tester that wires tiny DeepseekOcr2 vision/text configs into the shared VLM tester."""
+
+    base_model_class = DeepseekOcr2Model
+    config_class = DeepseekOcr2Config
+    conditional_generation_class = DeepseekOcr2ForConditionalGeneration
+    text_config_class = DeepseekOcr2TextConfig
+    vision_config_class = DeepseekOcr2VisionConfig
+
+    def __init__(self, parent, **kwargs):
+        """Fill in small-model defaults for any option the caller did not pass, then build the sub-configs."""
+        small_model_defaults = {
+            # VisionModel always selects query_768_resolution (144 tokens) for small images + 1 separator
+            "num_image_tokens": 145,
+            "image_token_id": 1,
+            "image_size": 16,
+            "hidden_size": 128,
+            "intermediate_size": 256,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 4,
+            "hidden_act": "silu",
+            "max_position_embeddings": 512,
+            "tie_word_embeddings": False,
+            "bos_token_id": 2,
+            "eos_token_id": 3,
+            "pad_token_id": 4,
+            "n_routed_experts": 8,
+            "n_shared_experts": 1,
+            "mlp_layer_types": ["dense", "sparse"],
+            "moe_intermediate_size": 64,
+            "num_experts_per_tok": 2,
+        }
+        # Caller-supplied values win; only missing options receive the default.
+        for option, default in small_model_defaults.items():
+            kwargs.setdefault(option, default)
+        super().__init__(parent, **kwargs)
+
+        # Settings forwarded as `sam_config` to the vision config.
+        self.sam_config = dict(
+            hidden_size=32,
+            output_channels=16,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_channels=3,
+            image_size=16,
+            patch_size=2,
+            hidden_act="gelu",
+            mlp_ratio=4.0,
+            window_size=4,
+            global_attn_indexes=[1],
+            downsample_channels=[32, 64],
+        )
+        # Settings forwarded as `encoder_config` to the vision config.
+        self.encoder_config = dict(
+            hidden_size=64,
+            intermediate_size=128,
+            num_hidden_layers=2,
+            num_attention_heads=4,
+            num_key_value_heads=4,
+            hidden_act="silu",
+            max_position_embeddings=512,
+            rms_norm_eps=1.0,
+        )
+
+    def get_vision_config(self):
+        """Build the vision config from the stored `sam_config` and `encoder_config` dicts."""
+        vision_kwargs = {"sam_config": self.sam_config, "encoder_config": self.encoder_config}
+        return DeepseekOcr2VisionConfig(**vision_kwargs)
+
+    def get_config(self):
+        """Assemble the composite config from the vision config, the text config and the image token id."""
+        vision_config = self.get_vision_config()
+        text_config = self.get_text_config()
+        return self.config_class(
+            vision_config=vision_config,
+            text_config=text_config,
+            image_token_id=self.image_token_id,
+        )
+
+
+@require_torch
+class DeepseekOcr2ModelTest(VLMModelTest, unittest.TestCase):
+    """Runs the shared VLM model tests against DeepseekOcr2, skipping the ones listed below."""
+
+    model_tester_class = DeepseekOcr2VisionText2TextModelTester
+    test_all_params_have_gradient = False
+    test_torch_exportable = False
+
+    @unittest.skip(
+        reason="DeepseekOcr2VisionModel builds a hybrid bidirectional+causal mask internally, so SDPA is always called with a non-null `attn_mask`."
+    )
+    def test_sdpa_can_dispatch_on_flash(self):
+        """Skipped; see the decorator for the reason."""
+
+    @unittest.skip(
+        reason="DeepseekOcr2VisionModel uses `self.query_*.weight` directly, causing device mismatch when offloading."
+    )
+    def test_cpu_offload(self):
+        """Skipped; see the decorator for the reason."""
+
+    @unittest.skip(
+        reason="DeepseekOcr2VisionModel uses `self.query_*.weight` directly, causing device mismatch when offloading."
+    )
+    def test_disk_offload_bin(self):
+        """Skipped; see the decorator for the reason."""
+
+    @unittest.skip(
+        reason="DeepseekOcr2VisionModel uses `self.query_*.weight` directly, causing device mismatch when offloading."
+    )
+    def test_disk_offload_safetensors(self):
+        """Skipped; see the decorator for the reason."""
+
+    def _image_features_prepare_config_and_inputs(self):
+        """Return the parent's config/inputs with `hidden_size` mirrored onto the vision config."""
+        config, inputs_dict = super()._image_features_prepare_config_and_inputs()
+        vision_config = config.vision_config
+        # test_get_image_features_output expects vision_config.hidden_size, but ours is in encoder_config.
+        vision_config.hidden_size = vision_config.encoder_config.hidden_size
+        return config, inputs_dict
+
+
+@require_torch
+class DeepseekOcr2IntegrationTest(unittest.TestCase):
+    """Slow generation checks that compare greedy output of the released checkpoint with recorded text."""
+
+    model_id = "deepseek-community/DeepSeek-OCR-2"
+
+    # Fixture images used by the tests below.
+    _OCR_IMAGE_URL = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
+    _MULTI_BOX_IMAGE_URL = (
+        "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+    )
+
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained(self.model_id)
+
+    def tearDown(self):
+        cleanup(torch_device, gc_collect=True)
+
+    def _load_model(self):
+        """Load the checkpoint in bfloat16 onto the test device."""
+        return DeepseekOcr2ForConditionalGeneration.from_pretrained(
+            self.model_id, torch_dtype=torch.bfloat16, device_map=torch_device
+        )
+
+    @staticmethod
+    def _load_fixture(url):
+        """Resolve `url` through `url_to_local_path` and load it as an image."""
+        return load_image(url_to_local_path(url))
+
+    @staticmethod
+    def _generate_new_tokens(model, inputs):
+        """Greedily generate 20 tokens and return only the part after the prompt, for every batch row."""
+        generate_ids = model.generate(**inputs, do_sample=False, max_new_tokens=20)
+        prompt_length = inputs["input_ids"].shape[1]
+        return generate_ids[:, prompt_length:]
+
+    @slow
+    def test_small_model_integration_test_free_ocr(self):
+        model = self._load_model()
+        image = self._load_fixture(self._OCR_IMAGE_URL)
+        inputs = self.processor(images=image, text="<image>\nFree OCR.", return_tensors="pt").to(
+            model.device, dtype=torch.bfloat16
+        )
+        new_tokens = self._generate_new_tokens(model, inputs)
+        decoded = self.processor.decode(new_tokens[0], skip_special_tokens=True)
+        EXPECTED_DECODED_TEXT = Expectations(
+            {
+                ("cuda", None): "R&D QUALITY IMPROVEMENT SUGGESTION/SOLUTION FORM\n\nName/",
+            }
+        ).get_expectation()  # fmt: skip
+        self.assertEqual(decoded, EXPECTED_DECODED_TEXT)
+
+    @slow
+    def test_small_model_integration_test_grounding_markdown(self):
+        model = self._load_model()
+        image = self._load_fixture(self._OCR_IMAGE_URL)
+        inputs = self.processor(
+            images=image,
+            text="<image>\n<|grounding|>Convert the document to markdown.",
+            return_tensors="pt",
+        ).to(model.device, dtype=torch.bfloat16)
+        new_tokens = self._generate_new_tokens(model, inputs)
+        # Special tokens are kept because the expected text contains the grounding markup.
+        decoded = self.processor.decode(new_tokens[0], skip_special_tokens=False)
+        EXPECTED_DECODED_TEXT = Expectations(
+            {
+                ("cuda", None): "<|ref|>title<|/ref|><|det|>[[330, 198, 559, 230]]<|/det|>\n# R",
+            }
+        ).get_expectation()  # fmt: skip
+        self.assertEqual(decoded, EXPECTED_DECODED_TEXT)
+
+    @slow
+    def test_small_model_integration_test_batched(self):
+        model = self._load_model()
+        batch_images = [
+            self._load_fixture(self._OCR_IMAGE_URL),
+            self._load_fixture(self._MULTI_BOX_IMAGE_URL),
+        ]
+        inputs = self.processor(
+            images=batch_images,
+            text=["<image>\nFree OCR.", "<image>\nFree OCR."],
+            return_tensors="pt",
+            padding=True,
+        ).to(model.device, dtype=torch.bfloat16)
+        new_tokens = self._generate_new_tokens(model, inputs)
+        decoded = self.processor.batch_decode(new_tokens, skip_special_tokens=True)
+        EXPECTED_DECODED_TEXT = Expectations(
+            {
+                ("cuda", None): [
+                    "R&D QUALITY IMPROVEMENT SUGGESTION/SOLUTION FORM\n\nName/",
+                    "# Reducing the number of images\n\nIt is also believed that the performance of a website is a critical",
+                ],
+            }
+        ).get_expectation()  # fmt: skip
+        self.assertEqual(decoded, EXPECTED_DECODED_TEXT)
--- a/tests/models/deepseek_ocr2/test_processing_deepseek_ocr2.py
+++ b/tests/models/deepseek_ocr2/test_processing_deepseek_ocr2.py
@@ -0,0 +1,90 @@
+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from transformers import DeepseekOcr2Processor
+from transformers.testing_utils import require_vision
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+@require_vision
+class DeepseekOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    """Processor tests for DeepseekOcr2, including image-token expansion for small and large images."""
+
+    processor_class = DeepseekOcr2Processor
+
+    @classmethod
+    def _setup_image_processor(cls):
+        """Create a default image processor, then override its `size` and `tile_size` for the tests."""
+        image_processor = cls._get_component_class_from_processor("image_processor")()
+        image_processor.size = {"height": 64, "width": 64}
+        image_processor.tile_size = 512
+        return image_processor
+
+    @classmethod
+    def _setup_tokenizer(cls):
+        """Load the tokenizer from the `deepseek-community/DeepSeek-OCR-2` checkpoint."""
+        tokenizer_class = cls._get_component_class_from_processor("tokenizer")
+        return tokenizer_class.from_pretrained("deepseek-community/DeepSeek-OCR-2")
+
+    @classmethod
+    def _setup_test_attributes(cls, processor):
+        cls.image_token = processor.image_token
+
+    @unittest.skip("DeepseekOcr2Processor pops the image processor output 'num_local_patches'")
+    def test_image_processor_defaults(self):
+        """Skipped; see the decorator for the reason."""
+
+    def _process_random_image(self, height, width):
+        """Run the processor on one random uint8 image of shape `(3, height, width)`.
+
+        The image processor is reconfigured to a 1024x1024 `size` with `tile_size` 768 first.
+        Returns the processor output and the number of image-token ids found in `input_ids`.
+        """
+        processor = self.get_processor()
+        processor.image_processor.size = {"height": 1024, "width": 1024}
+        processor.image_processor.tile_size = 768
+
+        image = torch.randint(0, 256, (3, height, width), dtype=torch.uint8)
+        inputs = processor(images=image, text="<image>\nFree OCR.", return_tensors="pt")
+
+        is_image_token = inputs["input_ids"] == processor.image_token_id
+        return inputs, is_image_token.sum().item()
+
+    def test_image_token_expansion_small_image(self):
+        """An image smaller than tile_size yields no local patches, hence 257 image tokens."""
+        # max(200, 300) < 768, so only the global view is produced.
+        inputs, num_image_tokens = self._process_random_image(height=300, width=200)
+
+        # 257 = 256 global + 0 local + 1 separator
+        self.assertEqual(num_image_tokens, 257)
+        self.assertNotIn("pixel_values_local", inputs)
+
+    def test_image_token_expansion_large_image(self):
+        """An image larger than tile_size yields local patches, hence more image tokens."""
+        # max(2448, 3264) > 768, so local patches are produced in addition to the global view.
+        inputs, num_image_tokens = self._process_random_image(height=3264, width=2448)
+        num_local_patches = inputs["num_local_patches"][0]
+
+        # 3264x2448 image produces 6 local patches (2x3 grid) + 1 global view = 7 total
+        # num_image_tokens = 256 global + 144*6 local + 1 separator = 1121
+        self.assertEqual(num_local_patches, 6)
+        self.assertEqual(num_image_tokens, 1121)
+        self.assertIn("pixel_values_local", inputs)