first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/perception_lm/test_processing_perception_lm.py
+++ b/tests/models/perception_lm/test_processing_perception_lm.py
@@ -0,0 +1,157 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import unittest
+
+from transformers import (
+    PerceptionLMProcessor,
+)
+from transformers.testing_utils import require_vision
+from transformers.utils import is_torch_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+
+TEST_MODEL_PATH = "facebook/Perception-LM-1B"
+
+
+@require_vision
+@unittest.skip("Requires read token and we didn't requests access yet. FIXME @ydshieh when you are back :)")
+class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = PerceptionLMProcessor
+
+    @classmethod
+    def _setup_image_processor(cls):
+        image_processor_class = cls._get_component_class_from_processor("image_processor")
+        return image_processor_class(tile_size=448, max_num_tiles=4, vision_input_type="thumb+tile")
+
+    @classmethod
+    def _setup_tokenizer(cls):
+        tokenizer_class = cls._get_component_class_from_processor("tokenizer")
+        tokenizer = tokenizer_class.from_pretrained(TEST_MODEL_PATH)
+        tokenizer.add_special_tokens({"additional_special_tokens": ["<|image|>", "<|video|>"]})
+
+    @classmethod
+    def _setup_test_attributes(cls, processor):
+        cls.image_token_id = processor.image_token_id
+        cls.video_token_id = processor.video_token_id
+
+    @staticmethod
+    def prepare_processor_dict():
+        return {
+            "chat_template": CHAT_TEMPLATE,
+            "patch_size": 14,
+            "pooling_ratio": 2,
+        }  # fmt: skip
+
+    def test_chat_template_is_saved(self):
+        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
+        processor_dict_loaded = json.loads(processor_loaded.to_json_string())
+        # chat templates aren't serialized to json in processors
+        self.assertFalse("chat_template" in processor_dict_loaded)
+
+        # they have to be saved as separate file and loaded back from that file
+        # so we check if the same template is loaded
+        processor_dict = self.prepare_processor_dict()
+        self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))
+
+    def test_image_token_filling(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
+        # Important to check with non square image
+        image = torch.randn((1, 3, 450, 500))
+        #  5 tiles (thumbnail tile + 4 tiles)
+        #  448/patch_size/pooling_ratio = 16 => 16*16 tokens per tile
+        expected_image_tokens = 16 * 16 * 5
+        image_token_index = processor.image_token_id
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        inputs = processor(
+            text=[processor.apply_chat_template(messages)],
+            images=[image],
+            return_tensors="pt",
+        )
+        image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
+        self.assertEqual(expected_image_tokens, image_tokens)
+        self.assertEqual(inputs["pixel_values"].ndim, 5)
+
+    def test_vanilla_image_with_no_tiles_token_filling(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
+        processor.image_processor.vision_input_type = "vanilla"
+        # Important to check with non square image
+        image = torch.randn((1, 3, 450, 500))
+        #  1 tile
+        #  448/patch_size/pooling_ratio = 16 => 16*16 tokens per tile
+        expected_image_tokens = 16 * 16 * 1
+        image_token_index = processor.image_token_id
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        inputs = processor(
+            text=[processor.apply_chat_template(messages)],
+            images=[image],
+            return_tensors="pt",
+        )
+        image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
+        self.assertEqual(expected_image_tokens, image_tokens)
+        self.assertEqual(inputs["pixel_values"].ndim, 5)
+        self.assertEqual(inputs["pixel_values"].shape[1], 1)  # 1 tile
+
+
+CHAT_TEMPLATE = (
+    "{{- bos_token }}"
+    "{%- if messages[0]['role'] == 'system' -%}"
+    "    {%- set system_message = messages[0]['content']|trim %}\n"
+    "    {%- set messages = messages[1:] %}\n"
+    "{%- else %}"
+    "    {%- set system_message = 'You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.' %}"
+    "{%- endif %}"
+    "{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}"
+    "{{- system_message }}"
+    "{{- '<|eot_id|>' }}"
+    "{%- for message in messages %}"
+    "{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' }}"
+    "{%- for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
+    "{{ '<|image|>' }}"
+    "{%- endfor %}"
+    "{%- for content in message['content'] | selectattr('type', 'equalto', 'video') %}"
+    "{{ '<|video|>' }}"
+    "{%- endfor %}"
+    "{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
+    "{{- content['text'] | trim }}"
+    "{%- endfor %}"
+    "{{'<|eot_id|>' }}"
+    "{%- endfor %}"
+    "{%- if add_generation_prompt %}"
+    "{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}"
+    "{%- endif %}"
+)