first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,216 @@
# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
class DeepseekOcr2ImageProcessingTester:
    """Hold the settings shared by the DeepseekOcr2 image-processor tests.

    The instance stores the constructor arguments as attributes, builds the
    keyword dictionary used to instantiate an image processor, and produces
    image inputs through the module-level `prepare_image_inputs` helper.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=500,
        max_resolution=800,
        do_resize=True,
        size=None,
        tile_size=384,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        do_convert_rgb=True,
    ):
        """Store the test configuration.

        `size`, `image_mean` and `image_std` default to `None` and are replaced
        by a freshly built container (`{"height": 512, "width": 512}` and
        `[0.5, 0.5, 0.5]` respectively), so that no default object is shared
        between tester instances.
        """
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        # Stored for callers; none of the methods defined in this class read it.
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size if size is not None else {"height": 512, "width": 512}
        self.tile_size = tile_size
        self.do_normalize = do_normalize
        # A list literal in the signature would be one object mutated by every
        # instance; build a new list per instance instead.
        self.image_mean = image_mean if image_mean is not None else [0.5, 0.5, 0.5]
        self.image_std = image_std if image_std is not None else [0.5, 0.5, 0.5]
        self.do_convert_rgb = do_convert_rgb

    def prepare_image_processor_dict(self):
        """Return the keyword arguments used to build an image processor."""
        return {
            "do_resize": self.do_resize,
            "size": self.size,
            "tile_size": self.tile_size,
            "do_normalize": self.do_normalize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_convert_rgb": self.do_convert_rgb,
        }

    def expected_output_image_shape(self, images):
        """Return `(num_channels, height, width)` taken from `self.size`.

        `images` is accepted for interface compatibility and is not inspected.
        """
        return self.num_channels, self.size["height"], self.size["width"]

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Delegate to `prepare_image_inputs` using this tester's batch and resolution settings."""
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
class DeepseekOcr2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Image-processor tests for DeepseekOcr2.

    The tests iterate over `self.image_processing_classes`, a mapping from
    backend name to image-processor class that is not defined in this class
    (presumably provided by `ImageProcessingTestMixin` -- confirm there).
    """

    def setUp(self):
        super().setUp()
        self.image_processor_tester = DeepseekOcr2ImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        """Keyword arguments for building an image processor, taken from the tester."""
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        """Every configured option must be exposed as an attribute on the processor."""
        expected_attributes = (
            "do_resize",
            "size",
            "tile_size",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
        )
        for processor_cls in self.image_processing_classes.values():
            processor = processor_cls(**self.image_processor_dict)
            for attribute in expected_attributes:
                self.assertTrue(hasattr(processor, attribute))

    @unittest.skip(reason="Not supported")
    def test_call_numpy_4_channels(self):
        pass

    def test_crop_to_patches(self):
        """`crop_image_to_patches` must yield at least one `tile_size` x `tile_size` patch."""
        tester = self.image_processor_tester
        for backend, processor_cls in self.image_processing_classes.items():
            processor = processor_cls(**self.image_processor_dict)
            patch_side = tester.tile_size
            expected_hw = (patch_side, patch_side)

            if backend != "pil":
                # Non-PIL backends get a batched float tensor and return the
                # stacked patches together with a patch count.
                tensor_image = tester.prepare_image_inputs(equal_resolution=True, torchify=True)[0]
                stacked_patches, n_patches = processor.crop_image_to_patches(
                    tensor_image.unsqueeze(0).float(), min_patches=1, max_patches=6, tile_size=patch_side
                )
                self.assertGreater(n_patches, 0)
                self.assertEqual(stacked_patches.shape[-2:], expected_hw)
                continue

            # The "pil" backend gets a single numpy image and returns a
            # sequence of patches.
            array_image = tester.prepare_image_inputs(equal_resolution=True, numpify=True)[0]
            patches = processor.crop_image_to_patches(
                array_image, min_patches=1, max_patches=6, tile_size=patch_side
            )
            self.assertGreater(len(patches), 0)
            self.assertEqual(patches[0].shape[:2], expected_hw)

    def test_preprocess_global_only(self):
        """With `crop_to_patches=False` every image must report zero local patches."""
        tester = self.image_processor_tester
        for processor_cls in self.image_processing_classes.values():
            processor = processor_cls(**self.image_processor_dict, crop_to_patches=False)
            images = tester.prepare_image_inputs(equal_resolution=True, numpify=False)
            result = processor(images, return_tensors="pt")

            self.assertIn("pixel_values", result)
            self.assertEqual(len(result["num_local_patches"]), len(images))
            for patch_count in result["num_local_patches"]:
                self.assertEqual(patch_count, 0)

    def test_preprocess_with_crop_to_patches(self):
        """With `crop_to_patches=True` at least one image must produce local patches."""
        for processor_cls in self.image_processing_classes.values():
            processor = processor_cls(**self.image_processor_dict, crop_to_patches=True)
            images = prepare_image_inputs(
                batch_size=2, num_channels=3, min_resolution=500, max_resolution=700, equal_resolution=True
            )
            result = processor(images, return_tensors="pt")

            self.assertIn("pixel_values", result)
            has_local = any(count > 0 for count in result["num_local_patches"])
            self.assertTrue(has_local)
            # Reaching this line means `has_local` is true, so the local
            # patches key is always checked.
            self.assertIn("pixel_values_local", result)

    def test_backends_equivalence(self):
        """Override to also compare pixel_values_local and num_local_patches."""
        if len(self.image_processing_classes) < 2:
            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")

        dummy_image = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)[0]
        encodings = {
            backend: processor_cls(**self.image_processor_dict)(dummy_image, return_tensors="pt")
            for backend, processor_cls in self.image_processing_classes.items()
        }

        # The first backend is the reference every other backend is compared with.
        reference, *others = encodings.values()
        for candidate in others:
            self._assert_tensors_equivalence(reference.pixel_values, candidate.pixel_values)
            torch.testing.assert_close(reference.num_local_patches, candidate.num_local_patches)
            if reference.get("pixel_values_local") is not None:
                self._assert_tensors_equivalence(reference.pixel_values_local, candidate.pixel_values_local)

    def test_backends_equivalence_batched(self):
        """Override to also compare pixel_values_local and num_local_patches (variable shape)."""
        if len(self.image_processing_classes) < 2:
            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")

        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
        encodings = {}
        for backend, processor_cls in self.image_processing_classes.items():
            encodings[backend] = processor_cls(**self.image_processor_dict)(dummy_images, return_tensors=None)

        # The "pil" outputs are the reference; they are wrapped with
        # `torch.from_numpy` before being compared.
        reference_backend = "pil"
        reference = encodings[reference_backend]
        for backend, candidate in encodings.items():
            if backend == reference_backend:
                continue

            # Global views
            for index, reference_view in enumerate(reference.pixel_values):
                self._assert_tensors_equivalence(torch.from_numpy(reference_view), candidate.pixel_values[index])

            # num_local_patches
            self.assertEqual(
                list(reference["num_local_patches"]),
                list(candidate["num_local_patches"]),
            )

            # Local patches: compared only when both backends returned them.
            reference_local = reference.get("pixel_values_local")
            candidate_local = candidate.get("pixel_values_local")
            if reference_local is None or candidate_local is None:
                continue
            self.assertEqual(len(reference_local), len(candidate_local))
            for index, reference_patch in enumerate(reference_local):
                self._assert_tensors_equivalence(torch.from_numpy(reference_patch), candidate_local[index])

View File

@@ -0,0 +1,241 @@
# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch DeepseekOcr2 model."""
import unittest
from transformers import (
AutoProcessor,
DeepseekOcr2Config,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import Expectations, cleanup, require_torch, slow, torch_device
from ...test_processing_common import url_to_local_path
from ...vlm_tester import VLMModelTest, VLMModelTester
if is_torch_available():
import torch
from transformers import (
DeepseekOcr2ForConditionalGeneration,
DeepseekOcr2Model,
)
from transformers.models.deepseek_ocr2.configuration_deepseek_ocr2 import (
DeepseekOcr2TextConfig,
DeepseekOcr2VisionConfig,
)
if is_vision_available():
from transformers.image_utils import load_image
class DeepseekOcr2VisionText2TextModelTester(VLMModelTester):
    """Tester that builds a small DeepseekOcr2 configuration for the shared VLM tests."""

    base_model_class = DeepseekOcr2Model
    config_class = DeepseekOcr2Config
    conditional_generation_class = DeepseekOcr2ForConditionalGeneration
    text_config_class = DeepseekOcr2TextConfig
    vision_config_class = DeepseekOcr2VisionConfig

    def __init__(self, parent, **kwargs):
        """Fill in defaults for any keyword the caller omitted, then set the vision sub-configs.

        Values supplied by the caller always win over the defaults below.
        """
        fallback_settings = {
            # VisionModel always selects query_768_resolution (144 tokens) for small images + 1 separator
            "num_image_tokens": 145,
            "image_token_id": 1,
            "image_size": 16,
            "hidden_size": 128,
            "intermediate_size": 256,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "hidden_act": "silu",
            "max_position_embeddings": 512,
            "tie_word_embeddings": False,
            "bos_token_id": 2,
            "eos_token_id": 3,
            "pad_token_id": 4,
            "n_routed_experts": 8,
            "n_shared_experts": 1,
            "mlp_layer_types": ["dense", "sparse"],
            "moe_intermediate_size": 64,
            "num_experts_per_tok": 2,
        }
        for option, value in fallback_settings.items():
            kwargs.setdefault(option, value)
        super().__init__(parent, **kwargs)

        # Passed as `sam_config` to DeepseekOcr2VisionConfig in `get_vision_config`.
        self.sam_config = dict(
            hidden_size=32,
            output_channels=16,
            num_hidden_layers=2,
            num_attention_heads=4,
            num_channels=3,
            image_size=16,
            patch_size=2,
            hidden_act="gelu",
            mlp_ratio=4.0,
            window_size=4,
            global_attn_indexes=[1],
            downsample_channels=[32, 64],
        )
        # Passed as `encoder_config` to DeepseekOcr2VisionConfig in `get_vision_config`.
        self.encoder_config = dict(
            hidden_size=64,
            intermediate_size=128,
            num_hidden_layers=2,
            num_attention_heads=4,
            num_key_value_heads=4,
            hidden_act="silu",
            max_position_embeddings=512,
            rms_norm_eps=1.0,
        )

    def get_vision_config(self):
        """Build the vision config from `self.sam_config` and `self.encoder_config`."""
        return DeepseekOcr2VisionConfig(
            sam_config=self.sam_config,
            encoder_config=self.encoder_config,
        )

    def get_config(self):
        """Assemble the full model config from the vision config, text config and image token id."""
        vision_config = self.get_vision_config()
        text_config = self.get_text_config()
        return self.config_class(
            vision_config=vision_config,
            text_config=text_config,
            image_token_id=self.image_token_id,
        )
@require_torch
class DeepseekOcr2ModelTest(VLMModelTest, unittest.TestCase):
    """Runs the shared VLM model tests against DeepseekOcr2, skipping the unsupported ones."""

    model_tester_class = DeepseekOcr2VisionText2TextModelTester
    test_all_params_have_gradient = False
    test_torch_exportable = False

    # One skip reason shared by the three offloading tests below.
    _OFFLOAD_SKIP_REASON = (
        "DeepseekOcr2VisionModel uses `self.query_*.weight` directly, causing device mismatch when offloading."
    )

    @unittest.skip(
        reason="DeepseekOcr2VisionModel builds a hybrid bidirectional+causal mask internally, so SDPA is always called with a non-null `attn_mask`."
    )
    def test_sdpa_can_dispatch_on_flash(self):
        pass

    @unittest.skip(reason=_OFFLOAD_SKIP_REASON)
    def test_cpu_offload(self):
        pass

    @unittest.skip(reason=_OFFLOAD_SKIP_REASON)
    def test_disk_offload_bin(self):
        pass

    @unittest.skip(reason=_OFFLOAD_SKIP_REASON)
    def test_disk_offload_safetensors(self):
        pass

    def _image_features_prepare_config_and_inputs(self):
        """Return the parent's config and inputs with `vision_config.hidden_size` filled in."""
        config, inputs_dict = super()._image_features_prepare_config_and_inputs()
        # test_get_image_features_output expects vision_config.hidden_size, but ours is in encoder_config.
        vision_config = config.vision_config
        vision_config.hidden_size = vision_config.encoder_config.hidden_size
        return config, inputs_dict
@require_torch
class DeepseekOcr2IntegrationTest(unittest.TestCase):
    """Slow generation checks against the `deepseek-community/DeepSeek-OCR-2` checkpoint."""

    model_id = "deepseek-community/DeepSeek-OCR-2"

    # Fixture images used by the tests below.
    _OCR_IMAGE_URL = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
    _MULTI_BOX_IMAGE_URL = (
        "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
    )

    def setUp(self):
        self.processor = AutoProcessor.from_pretrained(self.model_id)

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)

    def _load_model(self):
        """Load the checkpoint in bfloat16 on the test device."""
        return DeepseekOcr2ForConditionalGeneration.from_pretrained(
            self.model_id, torch_dtype=torch.bfloat16, device_map=torch_device
        )

    @staticmethod
    def _load_fixture(url):
        """Load the image at `url` after passing it through `url_to_local_path`."""
        return load_image(url_to_local_path(url))

    @staticmethod
    def _generate(model, inputs):
        """Generate 20 new tokens without sampling and return them together with the prompt length."""
        generate_ids = model.generate(**inputs, do_sample=False, max_new_tokens=20)
        return generate_ids, inputs["input_ids"].shape[1]

    @slow
    def test_small_model_integration_test_free_ocr(self):
        model = self._load_model()
        image = self._load_fixture(self._OCR_IMAGE_URL)

        inputs = self.processor(images=image, text="<image>\nFree OCR.", return_tensors="pt")
        inputs = inputs.to(model.device, dtype=torch.bfloat16)
        generate_ids, prompt_length = self._generate(model, inputs)
        # Only the newly generated tokens are decoded.
        decoded = self.processor.decode(generate_ids[0, prompt_length:], skip_special_tokens=True)

        EXPECTED_DECODED_TEXT = Expectations(
            {
                ("cuda", None): "R&D QUALITY IMPROVEMENT SUGGESTION/SOLUTION FORM\n\nName/",
            }
        ).get_expectation()  # fmt: skip
        self.assertEqual(decoded, EXPECTED_DECODED_TEXT)

    @slow
    def test_small_model_integration_test_grounding_markdown(self):
        model = self._load_model()
        image = self._load_fixture(self._OCR_IMAGE_URL)

        inputs = self.processor(
            images=image,
            text="<image>\n<|grounding|>Convert the document to markdown.",
            return_tensors="pt",
        )
        inputs = inputs.to(model.device, dtype=torch.bfloat16)
        generate_ids, prompt_length = self._generate(model, inputs)
        # Special tokens are kept here because the expected text contains them.
        decoded = self.processor.decode(generate_ids[0, prompt_length:], skip_special_tokens=False)

        EXPECTED_DECODED_TEXT = Expectations(
            {
                ("cuda", None): "<|ref|>title<|/ref|><|det|>[[330, 198, 559, 230]]<|/det|>\n# R",
            }
        ).get_expectation()  # fmt: skip
        self.assertEqual(decoded, EXPECTED_DECODED_TEXT)

    @slow
    def test_small_model_integration_test_batched(self):
        model = self._load_model()
        image1 = self._load_fixture(self._OCR_IMAGE_URL)
        image2 = self._load_fixture(self._MULTI_BOX_IMAGE_URL)

        inputs = self.processor(
            images=[image1, image2],
            text=["<image>\nFree OCR.", "<image>\nFree OCR."],
            return_tensors="pt",
            padding=True,
        )
        inputs = inputs.to(model.device, dtype=torch.bfloat16)
        generate_ids, prompt_length = self._generate(model, inputs)
        decoded = self.processor.batch_decode(generate_ids[:, prompt_length:], skip_special_tokens=True)

        EXPECTED_DECODED_TEXT = Expectations(
            {
                ("cuda", None): [
                    "R&D QUALITY IMPROVEMENT SUGGESTION/SOLUTION FORM\n\nName/",
                    "# Reducing the number of images\n\nIt is also believed that the performance of a website is a critical",
                ],
            }
        ).get_expectation()  # fmt: skip
        self.assertEqual(decoded, EXPECTED_DECODED_TEXT)

View File

@@ -0,0 +1,90 @@
# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import torch
from transformers import DeepseekOcr2Processor
from transformers.testing_utils import require_vision
from ...test_processing_common import ProcessorTesterMixin
@require_vision
class DeepseekOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for DeepseekOcr2, including checks on the number of expanded image tokens."""

    processor_class = DeepseekOcr2Processor

    @classmethod
    def _setup_image_processor(cls):
        """Instantiate the processor's image-processor class and override its size and tile size."""
        image_processor = cls._get_component_class_from_processor("image_processor")()
        image_processor.size = {"height": 64, "width": 64}
        image_processor.tile_size = 512
        return image_processor

    @classmethod
    def _setup_tokenizer(cls):
        """Load the tokenizer from the `deepseek-community/DeepSeek-OCR-2` checkpoint."""
        tokenizer_cls = cls._get_component_class_from_processor("tokenizer")
        return tokenizer_cls.from_pretrained("deepseek-community/DeepSeek-OCR-2")

    @classmethod
    def _setup_test_attributes(cls, processor):
        cls.image_token = processor.image_token

    @unittest.skip("DeepseekOcr2Processor pops the image processor output 'num_local_patches'")
    def test_image_processor_defaults(self):
        pass

    def _run_free_ocr_on_random_image(self, image_shape):
        """Process a random uint8 image of `image_shape` with the free-OCR prompt.

        The image processor is switched to a 1024x1024 size and a 768 tile size
        first. Returns the processor outputs and the number of image-token ids
        found in `input_ids`.
        """
        processor = self.get_processor()
        processor.image_processor.size = {"height": 1024, "width": 1024}
        processor.image_processor.tile_size = 768

        image = torch.randint(0, 256, image_shape, dtype=torch.uint8)
        prompt = "<image>\nFree OCR."
        inputs = processor(images=image, text=prompt, return_tensors="pt")

        image_token_id = processor.image_token_id
        num_image_tokens = (inputs["input_ids"] == image_token_id).sum().item()
        return inputs, num_image_tokens

    def test_image_token_expansion_small_image(self):
        """An image smaller than the tile size yields no local patches and 257 image tokens."""
        # Small image: max(200, 300) < 768 → no local patches
        inputs, num_image_tokens = self._run_free_ocr_on_random_image((3, 300, 200))

        # 257 = 256 global + 0 local + 1 separator
        self.assertEqual(num_image_tokens, 257)
        self.assertNotIn("pixel_values_local", inputs)

    def test_image_token_expansion_large_image(self):
        """An image larger than the tile size yields local patches and therefore more image tokens."""
        # Large image: max(2448, 3264) > 768 → local patches
        inputs, num_image_tokens = self._run_free_ocr_on_random_image((3, 3264, 2448))
        num_local_patches = inputs["num_local_patches"][0]

        # 3264x2448 image produces 6 local patches (2x3 grid) + 1 global view = 7 total
        # num_image_tokens = 256 global + 144*6 local + 1 separator = 1121
        self.assertEqual(num_local_patches, 6)
        self.assertEqual(num_image_tokens, 1121)
        self.assertIn("pixel_values_local", inputs)