first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/pipelines/test_pipelines_image_text_to_text.py
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -0,0 +1,506 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import unittest
+
+from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available
+from transformers.pipelines import ImageTextToTextPipeline, pipeline
+from transformers.testing_utils import (
+    Expectations,
+    is_pipeline_test,
+    require_deterministic_for_xpu,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_vision
+class ImageTextToTextPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+    def get_test_pipeline(self, model, tokenizer, processor, image_processor, dtype="float32"):
+        pipe = ImageTextToTextPipeline(model=model, processor=processor, dtype=dtype, max_new_tokens=10)
+        image_token = getattr(processor.tokenizer, "image_token", "")
+        examples = [
+            {
+                "images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "text": f"{image_token}This is a ",
+            },
+            {
+                "images": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "text": f"{image_token}Here I see a ",
+            },
+        ]
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        outputs = pipe(examples[0].get("images"), text=examples[0].get("text"))
+        self.assertEqual(
+            outputs,
+            [
+                {"input_text": ANY(str), "generated_text": ANY(str)},
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt_token_text_only(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        text = "What is the capital of France? Assistant:"
+
+        outputs = pipe(text=text)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "What is the capital of France? Assistant:",
+                    "generated_text": "What is the capital of France? Assistant: The capital of France is Paris.",
+                }
+            ],
+        )
+
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Write a poem on Hugging Face, the company"},
+                    ],
+                },
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is the capital of France?"},
+                    ],
+                },
+            ],
+        ]
+        outputs = pipe(text=messages)
+        EXPECTED_CONTENT = Expectations(
+            {
+                (
+                    "cuda",
+                    8,
+                ): "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and",
+                (
+                    "rocm",
+                    (9, 4),
+                ): "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they do it all\n\nThey help us to create and share\nContent that is both true and true\nAnd make the world a better place\nWith their tools and services, we can do it all\n\nFrom image and video to text and speech\nThey make it all possible\nWith their tools and services, we can do it all\nAnd make the world a better place\n\nSo let us embrace and use\nHugging Face's tools and services\nTo create and share\nContent that is true and true\nAnd make the world a better place.",
+                (
+                    "xpu",
+                    3,
+                ): "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they do it all\n\nThey help us to create and share\nContent that's both engaging and informative\nWith their tools, we can write\nAnd create stories that are both true and true\n\nThey help us to analyze\nAnd make sense of data that's hard to see\nWith their tools, we can see\nAnd make sense of the world around us\n\nThey help us to create\nAnd share content that's both true and true\nWith their tools, we can see\nAnd make sense of the world around us\n\nSo here's to Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they do it all\n\nThank you, Hugging Face, for all you do\nWith tools and services that make our lives easier\nSo here's to you, and all the great things you do",
+            }
+        ).get_expectation()
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {
+                        "input_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            }
+                        ],
+                        "generated_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            },
+                            {
+                                "role": "assistant",
+                                "content": EXPECTED_CONTENT,
+                            },
+                        ],
+                    }
+                ],
+                [
+                    {
+                        "input_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "What is the capital of France?"}],
+                            }
+                        ],
+                        "generated_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "What is the capital of France?"}],
+                            },
+                            {"role": "assistant", "content": "Paris"},
+                        ],
+                    }
+                ],
+            ],
+        )
+
+    @require_torch
+    @require_deterministic_for_xpu
+    def test_small_model_pt_token(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        text = "<image> What this is? Assistant: This is"
+
+        outputs = pipe(image, text=text)
+        EXPECTED_CONTENT = Expectations(
+            {
+                (
+                    "cuda",
+                    8,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable. The photo captures a moment of tranquility and companionship between the two feline friends.",
+                (
+                    "rocm",
+                    (9, 4),
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the cats are positioned in such a way that they are facing the camera. The image captures a peaceful moment between the two cats, and it's a great way to showcase their cuteness and relaxed demeanor.",
+                (
+                    "xpu",
+                    3,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a surface that looks like a couch or a chair, and it is covered with a soft fabric. The cats' fur is a mix of black, white, and brown, and they have a variety of patterns on their bodies. The image captures a moment of tranquility and companionship between the cats.",
+            }
+        ).get_expectation()
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": EXPECTED_CONTENT,
+                }
+            ],
+        )
+
+        outputs = pipe([image, image], text=[text, text])
+        EXPECTED_CONTENT = Expectations(
+            {
+                (
+                    "cuda",
+                    8,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the cats are positioned in such a way that they are facing the camera. The image captures a peaceful moment between the two cats, and it's a great way to showcase their cuteness and relaxed demeanor.",
+                (
+                    "rocm",
+                    (9, 4),
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the overall setting is cozy and comfortable.",
+                (
+                    "xpu",
+                    3,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a surface that looks like a couch or a chair, and it is covered with a soft fabric. The cats' fur is a mix of black, white, and brown, and they have a variety of patterns on their bodies. The image captures a moment of tranquility and companionship between the cats.",
+            }
+        ).get_expectation()
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": EXPECTED_CONTENT,
+                },
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": EXPECTED_CONTENT,
+                },
+            ],
+        )
+
+    @require_torch
+    def test_consistent_batching_behaviour(self):
+        pipe = pipeline("image-text-to-text", model="microsoft/kosmos-2-patch14-224")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        prompt = "a photo of"
+
+        outputs = pipe([image, image], text=[prompt, prompt], max_new_tokens=10)
+        outputs_batched = pipe([image, image], text=[prompt, prompt], batch_size=2, max_new_tokens=10)
+        self.assertEqual(outputs, outputs_batched)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_with_response_parsing(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's the difference between these two images?"},
+                    {
+                        "type": "image",
+                        "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                    },
+                    {
+                        "type": "image",
+                        "url": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                    },
+                ],
+            }
+        ]
+        pipe.tokenizer.response_schema = {
+            # A real response schema should probably have things like "role" and "content"
+            # and "reasoning_content" but it's unlikely we'd get a tiny model to reliably
+            # output anything like that, so let's keep it simple.
+            "type": "object",
+            "properties": {
+                "first_word": {"type": "string", "x-regex": r"^\s*([a-zA-Z]+)"},
+                "last_word": {"type": "string", "x-regex": r"([a-zA-Z]+)\s*$"},
+            },
+        }
+        outputs = pipe(text=messages, do_sample=False, max_new_tokens=10)
+        parsed_message = outputs[0]["generated_text"][-1]
+        # The parsed message should be a dict with the schema keys, not {"role": "assistant", "content": ...}
+        self.assertIn("first_word", parsed_message)
+        self.assertIn("last_word", parsed_message)
+        self.assertNotIn("role", parsed_message)
+        self.assertIsInstance(parsed_message["first_word"], str)
+        self.assertIsInstance(parsed_message["last_word"], str)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image_ny = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        image_chicago = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between these two images?"},
+                    {"type": "image"},
+                    {"type": "image"},
+                ],
+            }
+        ]
+        # Deprecated behavior should raise an error after v5
+        with self.assertRaises(ValueError):
+            outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=True, max_new_tokens=10)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between these two images?"},
+                    {"type": "image", "url": image_ny},
+                    {"type": "image", "url": image_chicago},
+                ],
+            }
+        ]
+        outputs = pipe(text=messages, return_full_text=True, max_new_tokens=10)
+        EXPECTED_CONTENT = Expectations(
+            {
+                ("rocm", (9, 4)): "The first image shows a statue of the Statue of",
+                ("cuda", 8): "The first image shows a statue of Liberty in the",
+                ("xpu", 3): "The first image shows a statue of Liberty in the",
+            }
+        ).get_expectation()
+
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
+                            ],
+                        }
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": EXPECTED_CONTENT,
+                        },
+                    ],
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_continue_final_message(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": "There is a dog and"},
+                ],
+            },
+        ]
+        outputs = pipe(text=messages, max_new_tokens=10)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        },
+                        {"role": "assistant", "content": [{"type": "text", "text": "There is a dog and"}]},
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "There is a dog and a person in the image. The dog is sitting",
+                                }
+                            ],
+                        },
+                    ],
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_new_text(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ]
+        outputs = pipe(text=messages, return_full_text=False, max_new_tokens=10)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image",
+                                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                                },
+                                {"type": "text", "text": "Describe this image."},
+                            ],
+                        }
+                    ],
+                    "generated_text": "In the image, a woman is sitting on the",
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_image_url(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+                        },
+                    },
+                    {"type": "text", "text": "Describe this image in one sentence."},
+                ],
+            }
+        ]
+        outputs = pipe(text=messages, return_full_text=False, max_new_tokens=10)[0]["generated_text"]
+        self.assertEqual(outputs, "A statue of liberty in the foreground of a city")
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_image_url_base64(self):
+        with open("./tests/fixtures/tests_samples/COCO/000000039769.png", "rb") as image_file:
+            base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                    },
+                    {"type": "text", "text": "Describe this image in one sentence."},
+                ],
+            }
+        ]
+        outputs = pipe(text=messages, return_full_text=False, max_new_tokens=10)[0]["generated_text"]
+        self.assertEqual(outputs, "Two cats are sleeping on a pink blanket, with")