first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/fuyu/__init__.py
+++ b/tests/models/fuyu/__init__.py
--- a/tests/models/fuyu/test_image_processing_fuyu.py
+++ b/tests/models/fuyu/test_image_processing_fuyu.py
@@ -0,0 +1,447 @@
+import io
+import unittest
+
+import httpx
+import numpy as np
+import pytest
+
+from transformers.image_utils import SizeDict
+from transformers.testing_utils import (
+    require_torch,
+    require_torch_accelerator,
+    require_torchvision,
+    require_vision,
+    slow,
+    torch_device,
+)
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin
+
+
+if is_torch_available() and is_vision_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+
+class FuyuImageProcessingTester:
+    """Supplies processor kwargs and random image batches for the Fuyu image processor tests.
+
+    Args:
+        parent: Object that creates the tester (the test case); only stored.
+        batch_size: Number of images produced by `prepare_image_inputs`.
+        num_channels: Channel count of every generated image.
+        image_size: Only stored on the instance; not read by this class.
+        min_resolution: Inclusive lower bound for randomly drawn heights/widths.
+        max_resolution: Exclusive upper bound for randomly drawn heights/widths, and the side
+            length of every image when `equal_resolution=True`.
+        do_resize, do_pad, do_normalize, do_rescale: Flags forwarded to the image processor.
+        size: Target size dict; defaults to `{"height": 180, "width": 360}`.
+        image_mean, image_std, rescale_factor: Values forwarded to the image processor.
+        patch_size: Patch size dict; defaults to `{"height": 30, "width": 30}`.
+    """
+
+    def __init__(
+        self,
+        parent,
+        batch_size=3,
+        num_channels=3,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=360,
+        do_resize=True,
+        size=None,
+        do_pad=True,
+        do_normalize=True,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5],
+        do_rescale=True,
+        rescale_factor=1 / 255,
+        patch_size=None,
+    ):
+        size = size if size is not None else {"height": 180, "width": 360}
+        patch_size = patch_size if patch_size is not None else {"height": 30, "width": 30}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        # The bounds used to be hard-coded to 30/360 no matter what was passed in. The arguments
+        # are honoured now, and the defaults carry the old values so default behaviour is the same.
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_pad = do_pad
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.patch_size = patch_size
+
+    def prepare_image_processor_dict(self):
+        """Return the keyword arguments used to instantiate the image processor under test."""
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_pad": self.do_pad,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_rescale": self.do_rescale,
+            "rescale_factor": self.rescale_factor,
+            "patch_size": self.patch_size,
+        }
+
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        """Prepares a batch of images for testing.
+
+        Images are first created as random channels-first `uint8` numpy arrays.
+
+        Args:
+            equal_resolution: When True every image is `max_resolution` x `max_resolution`.
+                Otherwise heights and widths are drawn from `[min_resolution, max_resolution)`
+                and rounded down to a multiple of 30.
+            numpify: Keep the images as numpy arrays.
+            torchify: Convert the images to torch tensors (applied whenever it is set).
+
+        Returns:
+            A list of `batch_size` PIL images (when neither flag is set), numpy arrays or
+            torch tensors.
+        """
+        if equal_resolution:
+            shapes = [(self.max_resolution, self.max_resolution)] * self.batch_size
+        else:
+
+            def snap(value):
+                # Round down to a multiple of 30 (presumably the default patch size - confirm),
+                # but never below 30 so that no dimension collapses to zero.
+                return max(30, value - (value % 30))
+
+            # All heights are drawn before all widths, one RNG call each.
+            heights = [snap(h) for h in np.random.randint(self.min_resolution, self.max_resolution, self.batch_size)]
+            widths = [snap(w) for w in np.random.randint(self.min_resolution, self.max_resolution, self.batch_size)]
+            shapes = list(zip(heights, widths))
+
+        image_inputs = [
+            np.random.randint(0, 256, (self.num_channels, height, width), dtype=np.uint8)
+            for height, width in shapes
+        ]
+
+        if not numpify and not torchify:
+            # PIL expects channels-last data.
+            image_inputs = [Image.fromarray(np.moveaxis(img, 0, -1)) for img in image_inputs]
+
+        if torchify:
+            image_inputs = [torch.from_numpy(img) for img in image_inputs]
+
+        return image_inputs
+
+    def expected_output_image_shape(self, images):
+        """Return `(num_channels, height, width)` of the configured target size; `images` is unused."""
+        return self.num_channels, self.size["height"], self.size["width"]
+
+
+@require_torch
+@require_vision
+@require_torchvision
+class FuyuImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
+    """Tests for the Fuyu image processor backends listed in `self.image_processing_classes`.
+
+    Several mixin tests are overridden because the Fuyu processor returns its results under
+    `images` (plus size/scale metadata) instead of `pixel_values`.
+    """
+
+    # Skip tests that expect pixel_values output
+    test_cast_dtype = None
+
+    def setUp(self):
+        """Create the tester and cache the keyword arguments used to build each processor."""
+        super().setUp()
+        self.image_processor_tester = FuyuImageProcessingTester(self)
+        self.image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
+
+    def _check_call_outputs(self, image_processing, image_inputs):
+        """Encode one image and then the whole batch, checking the `images` entry each time."""
+        encoded_images = image_processing(image_inputs[0], return_tensors="pt")
+        self.assertIn("images", encoded_images)
+        self.assertEqual(len(encoded_images.images), 1)
+
+        encoded_images = image_processing(image_inputs, return_tensors="pt")
+        self.assertIn("images", encoded_images)
+        self.assertEqual(len(encoded_images.images), self.image_processor_tester.batch_size)
+
+    def test_call_pil(self):
+        """Override to handle Fuyu's custom output structure"""
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
+            for image in image_inputs:
+                self.assertIsInstance(image, Image.Image)
+
+            self._check_call_outputs(image_processing, image_inputs)
+
+    def test_call_numpy(self):
+        """Override to handle Fuyu's custom output structure"""
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+            for image in image_inputs:
+                self.assertIsInstance(image, np.ndarray)
+
+            self._check_call_outputs(image_processing, image_inputs)
+
+    def test_call_pytorch(self):
+        """Override to handle Fuyu's custom output structure"""
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+
+            for image in image_inputs:
+                self.assertIsInstance(image, torch.Tensor)
+
+            self._check_call_outputs(image_processing, image_inputs)
+
+    def test_call_numpy_4_channels(self):
+        """Skip this test as Fuyu doesn't support arbitrary channels"""
+        self.skipTest("Fuyu processor is designed for 3-channel RGB images")
+
+    def test_backends_equivalence(self):
+        """Override to handle Fuyu's custom output structure"""
+        if len(self.image_processing_classes) < 2:
+            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
+
+        response = httpx.get("http://images.cocodataset.org/val2017/000000039769.jpg", follow_redirects=True)
+        # Surface a failed download as an HTTP error instead of an opaque image decoding error.
+        response.raise_for_status()
+        dummy_image = Image.open(io.BytesIO(response.content))
+
+        encodings = {}
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            encodings[backend_name] = image_processor(dummy_image, return_tensors="pt")
+
+        # The first backend is the reference every other backend is compared against.
+        backend_names = list(encodings.keys())
+        reference_encoding = encodings[backend_names[0]].images[0][0]
+        for backend_name in backend_names[1:]:
+            self._assert_tensors_equivalence(reference_encoding, encodings[backend_name].images[0][0])
+
+    def test_backends_equivalence_batched(self):
+        """Override to handle Fuyu's custom output structure"""
+        if len(self.image_processing_classes) < 2:
+            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
+
+        dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+
+        encodings = {}
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            encodings[backend_name] = image_processor(dummy_images, return_tensors="pt")
+
+        backend_names = list(encodings.keys())
+        reference_images = encodings[backend_names[0]].images
+        for backend_name in backend_names[1:]:
+            for ref_img, other_img in zip(reference_images, encodings[backend_name].images):
+                self._assert_tensors_equivalence(ref_img[0], other_img[0])
+
+    @slow
+    @require_torch_accelerator
+    @require_vision
+    @pytest.mark.torch_compile_test
+    def test_can_compile_torchvision_backend(self):
+        """Override to handle Fuyu's custom output structure (images instead of pixel_values)."""
+        if "torchvision" not in self.image_processing_classes:
+            self.skipTest("Skipping compilation test as torchvision backend is not available")
+
+        torch.compiler.reset()
+        input_image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
+        image_processor = self.image_processing_classes["torchvision"](**self.image_processor_dict)
+        output_eager = image_processor(input_image, device=torch_device, return_tensors="pt")
+
+        image_processor = torch.compile(image_processor, mode="reduce-overhead")
+        output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
+        self._assert_tensors_equivalence(
+            output_eager.images[0][0], output_compiled.images[0][0], atol=1e-4, rtol=1e-4, mean_atol=1e-5
+        )
+
+    def test_image_processor_properties(self):
+        """Every key of the constructor kwargs is available as an attribute on the processor."""
+        expected_attributes = (
+            "do_resize",
+            "size",
+            "do_pad",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "do_rescale",
+            "rescale_factor",
+            "patch_size",
+        )
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            for attribute in expected_attributes:
+                self.assertTrue(hasattr(image_processor, attribute), f"missing attribute: {attribute}")
+
+    def test_patches(self):
+        """Test that patchify_image produces the expected number of patches."""
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            batch_size = 3
+            channels = 3
+            height = 300
+            width = 300
+            image_input = torch.rand(batch_size, channels, height, width)
+
+            expected_num_patches = image_processor.get_num_patches(image_height=height, image_width=width)
+            patches_final = image_processor.patchify_image(image=image_input)
+
+            self.assertEqual(patches_final.shape[1], expected_num_patches)
+
+    def test_patches_match_backends(self):
+        """Test that backends produce same patches."""
+        if len(self.image_processing_classes) < 2:
+            self.skipTest(reason="Skipping backends patch equivalence test as there are less than 2 backends")
+
+        batch_size = 3
+        channels = 3
+        height = 300
+        width = 300
+        image_input = torch.rand(batch_size, channels, height, width)
+
+        processors = {}
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            processors[backend_name] = image_processing_class(**self.image_processor_dict)
+
+        backend_names = list(processors.keys())
+        reference_patches = processors[backend_names[0]].patchify_image(image=image_input)
+        for backend_name in backend_names[1:]:
+            patches = processors[backend_name].patchify_image(image=image_input)
+            self.assertEqual(reference_patches.shape, patches.shape)
+            torch.testing.assert_close(reference_patches, patches, rtol=1e-4, atol=1e-4)
+
+    def test_scale_to_target_aspect_ratio(self):
+        """Test that resize maintains aspect ratio correctly."""
+        sample_image = np.zeros((3, 450, 210), dtype=np.uint8)
+
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            # The "pil" backend is fed the numpy array, "torchvision" a float tensor.
+            if backend_name == "pil":
+                image_input = sample_image
+            elif backend_name == "torchvision":
+                image_input = torch.from_numpy(sample_image).float()
+            else:
+                continue
+
+            size_dict = SizeDict(
+                height=self.image_processor_dict["size"]["height"],
+                width=self.image_processor_dict["size"]["width"],
+            )
+            scaled_image = image_processor.resize(image_input, size=size_dict)
+            # 450x210 scaled by 180/450 gives 180x84.
+            self.assertEqual(scaled_image.shape[1], 180)
+            self.assertEqual(scaled_image.shape[2], 84)
+
+    def test_apply_transformation_numpy(self):
+        """Test preprocessing with numpy input."""
+        sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
+
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            transformed_image = image_processor.preprocess(sample_image).images[0][0]
+            self.assertEqual(transformed_image.shape[1], 180)
+            self.assertEqual(transformed_image.shape[2], 360)
+
+    def test_apply_transformation_pil(self):
+        """Test preprocessing with PIL input."""
+        sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
+        sample_image_pil = Image.fromarray(sample_image)
+
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            transformed_image = image_processor.preprocess(sample_image_pil).images[0][0]
+            self.assertEqual(transformed_image.shape[1], 180)
+            self.assertEqual(transformed_image.shape[2], 360)
+
+    def test_preprocess_output_structure(self):
+        """Test that preprocess returns correct output structure."""
+        sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
+
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            result = image_processor.preprocess(sample_image)
+
+            self.assertIn("images", result)
+            self.assertIn("image_unpadded_heights", result)
+            self.assertIn("image_unpadded_widths", result)
+            self.assertIn("image_scale_factors", result)
+
+            self.assertEqual(len(result.images), 1)
+            self.assertEqual(len(result.images[0]), 1)
+            self.assertEqual(len(result.image_unpadded_heights), 1)
+            self.assertEqual(len(result.image_unpadded_widths), 1)
+            self.assertEqual(len(result.image_scale_factors), 1)
+
+    def test_batch_processing(self):
+        """Test processing multiple images."""
+        sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
+        sample_image_pil = Image.fromarray(sample_image)
+        images = [sample_image, sample_image_pil]
+
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            result = image_processor.preprocess(images)
+
+            self.assertEqual(len(result.images), 2)
+            for img in result.images:
+                self.assertEqual(len(img), 1)
+                # Checked unconditionally: the former `hasattr`/rank guards allowed the test to
+                # pass without verifying anything. The trailing two dims are (height, width).
+                self.assertEqual(tuple(img[0].shape[-2:]), (180, 360))
+
+    def test_pad_image_torchvision(self):
+        """Test that padding works correctly for torchvision backend."""
+        if "torchvision" not in self.image_processing_classes:
+            self.skipTest(reason="Torchvision backend not available")
+
+        image_processor = self.image_processing_classes["torchvision"](**self.image_processor_dict)
+
+        small_image = torch.rand(3, 100, 100)
+        size_dict = SizeDict(height=180, width=360)
+
+        padded = image_processor.pad([small_image], pad_size=size_dict, fill_value=1.0)[0]
+        self.assertEqual(padded.shape[1], 180)
+        self.assertEqual(padded.shape[2], 360)
+
+        # Everything outside the original 100x100 region must carry the fill value.
+        self.assertTrue(torch.allclose(padded[:, 100:, :], torch.ones_like(padded[:, 100:, :])))
+        self.assertTrue(torch.allclose(padded[:, :, 100:], torch.ones_like(padded[:, :, 100:])))
+
+    def test_preprocess_with_tokenizer_info(self):
+        """Test preprocess_with_tokenizer_info functionality."""
+        batch_size = 2
+        subseq_size = 1
+        channels = 3
+        image_input = torch.rand(batch_size, subseq_size, channels, 180, 360)
+        image_present = torch.ones(batch_size, subseq_size, dtype=torch.bool)
+        image_unpadded_h = torch.tensor([[180], [180]])
+        image_unpadded_w = torch.tensor([[360], [360]])
+
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+
+            result = image_processor.preprocess_with_tokenizer_info(
+                image_input=image_input,
+                image_present=image_present,
+                image_unpadded_h=image_unpadded_h,
+                image_unpadded_w=image_unpadded_w,
+                image_placeholder_id=100,
+                image_newline_id=101,
+                variable_sized=True,
+            )
+
+            # Check output structure
+            self.assertIn("images", result)
+            self.assertIn("image_input_ids", result)
+            self.assertIn("image_patches", result)
+            self.assertIn("image_patch_indices_per_batch", result)
+            self.assertIn("image_patch_indices_per_subsequence", result)
+
+            # Check batch structure
+            self.assertEqual(len(result.images), batch_size)
+            self.assertEqual(len(result.image_input_ids), batch_size)
+            self.assertEqual(len(result.image_patches), batch_size)
+
+    def test_device_handling_torchvision(self):
+        """Test that torchvision backend can handle device placement."""
+        if "torchvision" not in self.image_processing_classes:
+            self.skipTest(reason="Torchvision backend not available")
+
+        sample_image = np.zeros((450, 210, 3), dtype=np.uint8)
+        image_processor = self.image_processing_classes["torchvision"](**self.image_processor_dict)
+
+        # The CUDA placement is only exercised when a CUDA device is present.
+        if torch.cuda.is_available():
+            result_cuda = image_processor.preprocess(sample_image, device="cuda")
+            self.assertEqual(result_cuda.images[0][0].device.type, "cuda")
+
+        result_cpu = image_processor.preprocess(sample_image, device="cpu")
+        self.assertEqual(result_cpu.images[0][0].device.type, "cpu")
+
+    def test_do_not_resize_if_smaller(self):
+        """Test that images smaller than target size are not resized."""
+        if "torchvision" not in self.image_processing_classes:
+            self.skipTest(reason="Torchvision backend not available")
+
+        image_processor = self.image_processing_classes["torchvision"](**self.image_processor_dict)
+
+        small_image = torch.rand(3, 100, 150)
+        size_dict = SizeDict(height=180, width=360)
+
+        resized = image_processor.resize(small_image, size=size_dict)
+
+        self.assertEqual(resized.shape[1], 100)
+        self.assertEqual(resized.shape[2], 150)
--- a/tests/models/fuyu/test_modeling_fuyu.py
+++ b/tests/models/fuyu/test_modeling_fuyu.py
@@ -0,0 +1,359 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Fuyu model."""
+
+import copy
+import io
+import unittest
+from functools import cached_property
+
+import pytest
+import requests
+import torch
+from parameterized import parameterized
+
+from transformers import FuyuConfig, is_torch_available, is_vision_available
+from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+if is_torch_available() and is_vision_available():
+    from transformers import FuyuProcessor
+
+
+if is_torch_available():
+    from transformers import FuyuForCausalLM, FuyuModel
+
+
+class FuyuModelTester:
+    """Produces a small `FuyuConfig` together with matching random inputs for the model tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        num_image_tokens=2,
+        image_size=30,
+        patch_size=15,
+        num_channels=3,
+        is_training=True,
+        use_input_mask=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        pad_token_id=10,
+        image_token_id=1,
+        scope=None,
+    ):
+        # Harness and batch geometry.
+        self.parent = parent
+        self.scope = scope
+        self.batch_size = batch_size
+        self.num_image_tokens = num_image_tokens
+        # The stored sequence length includes the image placeholder tokens.
+        self.seq_length = num_image_tokens + seq_length
+
+        # Image settings.
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+
+        # Switches controlling which inputs get generated.
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_labels = use_labels
+
+        # Model hyper-parameters.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+
+        # Label spaces and special token ids.
+        self.type_sequence_label_size = type_sequence_label_size
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.pad_token_id = pad_token_id
+        self.image_token_id = image_token_id
+
+    def prepare_config_and_inputs(self):
+        """Return `(config, input_ids, input_mask, sequence_labels, token_labels)`.
+
+        The first `num_image_tokens` positions of every row hold the image token id; any other
+        occurrence of that id is replaced by the pad token id. The mask and the labels are
+        `None` when `use_input_mask` / `use_labels` are off.
+        """
+        config = self.get_config()
+        token_grid = [self.batch_size, self.seq_length]
+
+        input_ids = ids_tensor(token_grid, self.vocab_size)
+        stray_image_tokens = input_ids == config.image_token_id
+        input_ids[stray_image_tokens] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = config.image_token_id
+
+        input_mask = random_attention_mask(token_grid) if self.use_input_mask else None
+
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor(token_grid, self.num_labels)
+        else:
+            sequence_labels = None
+            token_labels = None
+
+        return config, input_ids, input_mask, sequence_labels, token_labels
+
+    def get_config(self):
+        """Build the `FuyuConfig` described by the tester attributes."""
+        config_kwargs = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "intermediate_size": self.intermediate_size,
+            "hidden_act": self.hidden_act,
+            "hidden_dropout_prob": self.hidden_dropout_prob,
+            "attention_probs_dropout_prob": self.attention_probs_dropout_prob,
+            "max_position_embeddings": self.max_position_embeddings,
+            "type_vocab_size": self.type_vocab_size,
+            "is_decoder": False,
+            "initializer_range": self.initializer_range,
+            "pad_token_id": self.pad_token_id,
+            "image_token_id": self.image_token_id,
+        }
+        return FuyuConfig(**config_kwargs)
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)` holding `input_ids`, `attention_mask` and `image_patches`."""
+        # The label tensors are not part of the common inputs.
+        config, input_ids, input_mask, _, _ = self.prepare_config_and_inputs()
+        flattened_patch_dim = config.num_channels * config.patch_size**2
+        image_patches = floats_tensor([self.batch_size, self.num_image_tokens, flattened_patch_dim])
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": input_mask,
+            "image_patches": image_patches,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class FuyuModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    """Common model, generation and pipeline tests for `FuyuModel` and `FuyuForCausalLM`."""
+
+    all_model_classes = (
+        (
+            FuyuModel,
+            FuyuForCausalLM,
+        )
+        if is_torch_available()
+        else ()
+    )
+    pipeline_model_mapping = (
+        {"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
+    )
+
+    test_cpu_offload = False
+    test_disk_offload = False
+
+    def setUp(self):
+        """Create the model tester that provides configs and inputs."""
+        self.model_tester = FuyuModelTester(self)
+
+    def test_mismatching_image_patches(self):
+        """The forward pass must raise `ValueError` when image tokens and image patches disagree."""
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            curr_input_dict = copy.deepcopy(input_dict)  # copy so the original inputs stay untouched
+
+            # two image token and two image
+            _ = model(**curr_input_dict)  # successful forward with no modifications
+
+            # remove one image but leave the image token in text
+            input_ids = curr_input_dict["input_ids"]
+            image_patches = curr_input_dict["image_patches"][1:, ...]
+            with self.assertRaises(ValueError):
+                _ = model(input_ids=input_ids, image_patches=image_patches)
+
+            # drop the first two batch rows of text (and their image tokens) but keep every image
+            input_ids = curr_input_dict["input_ids"][2:]
+            image_patches = curr_input_dict["image_patches"]
+            with self.assertRaises(ValueError):
+                _ = model(input_ids=input_ids, image_patches=image_patches)
+
+    # `assistant_type` receives the value injected by `parameterized.expand`; it defaults to None
+    # so the method can still be called without arguments.
+    @parameterized.expand([("random",), ("same",)])
+    @pytest.mark.generate
+    @unittest.skip("Fuyu doesn't support assisted generation due to the need to crop/extend image patches indices")
+    def test_assisted_decoding_matches_greedy_search(self, assistant_type=None):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip("Fuyu doesn't support assisted generation due to the need to crop/extend image patches indices")
+    def test_assisted_decoding_sample(self):
+        pass
+
+    # TODO: Fix me (once this model gets more usage)
+    @unittest.skip(reason="Does not work on the tiny model.")
+    def test_disk_offload_bin(self):
+        # Delegate to the parent test of the same name so un-skipping runs the intended check.
+        super().test_disk_offload_bin()
+
+    # TODO: Fix me (once this model gets more usage)
+    @unittest.skip(reason="Does not work on the tiny model.")
+    def test_disk_offload_safetensors(self):
+        # Delegate to the parent test of the same name so un-skipping runs the intended check.
+        super().test_disk_offload_safetensors()
+
+    # TODO: Fix me (once this model gets more usage)
+    @unittest.skip(reason="Does not work on the tiny model.")
+    def test_model_parallelism(self):
+        super().test_model_parallelism()
+
+    @unittest.skip(reason="Fuyu `prepare_inputs_for_generation` function doesn't have cache position.")
+    def test_generate_continue_from_inputs_embeds(self):
+        pass
+
+    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
+    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
+        pass
+
+    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
+    def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self):
+        pass
+
+    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
+    def test_eager_padding_matches_padding_free_with_position_ids(self):
+        pass
+
+    @unittest.skip("Persimmon backbone applies key/query norm which doesn't work with packing")
+    def test_sdpa_padding_matches_padding_free_with_position_ids(self):
+        pass
+
+    @unittest.skip(reason="Fuyu has no separate base model without a head.")
+    def test_model_base_model_prefix(self):
+        pass
+
+    def _image_features_prepare_config_and_inputs(self):
+        """
+        Helper method to extract only image-related inputs from the full set of inputs, for testing `get_image_features`.
+
+        The Fuyu model uses image_patches, except for get_image_features, where they're called pixel_values.
+        """
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        inputs_dict = {"pixel_values": inputs_dict["image_patches"]}
+        return config, inputs_dict
+
+    @unittest.skip("Skip get_image_features tests as Fuyu's image features originate from a simple Linear")
+    def test_get_image_features_hidden_states(self):
+        pass
+
+    @unittest.skip("Skip get_image_features tests as Fuyu's image features originate from a simple Linear")
+    def test_get_image_features_attentions(self):
+        pass
+
+
+@slow
+@require_torch_accelerator
+class FuyuModelIntegrationTest(unittest.TestCase):
+    """Slow integration test running the released `adept/fuyu-8b` checkpoint on an accelerator."""
+
+    @cached_property
+    def default_processor(self):
+        """Processor of the `adept/fuyu-8b` checkpoint, loaded once per test instance."""
+        return FuyuProcessor.from_pretrained("adept/fuyu-8b")
+
+    @cached_property
+    def default_model(self):
+        """`adept/fuyu-8b` in float16 on `torch_device`, loaded once per test instance."""
+        return FuyuForCausalLM.from_pretrained("adept/fuyu-8b", dtype="float16", device_map=torch_device)
+
+    def test_greedy_generation(self):
+        """Captioning the bus fixture image must yield the expected sentence."""
+        processor = self.default_processor
+        model = self.default_model
+
+        url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
+        # `requests` has no default timeout, so bound the download and fail on an HTTP error
+        # instead of hanging or handing an error page to PIL.
+        response = requests.get(url, timeout=60)
+        response.raise_for_status()
+        image = Image.open(io.BytesIO(response.content))
+
+        text_prompt_coco_captioning = "Generate a coco-style caption.\n"
+
+        inputs = processor(images=image, text=text_prompt_coco_captioning, return_tensors="pt").to(
+            torch_device, torch.float16
+        )
+        generated_ids = model.generate(**inputs, max_new_tokens=10)
+
+        # take the last 8 tokens (in order to skip special \n\x04 characters) and decode them
+        generated_text = processor.batch_decode(generated_ids[:, -8:], skip_special_tokens=True)[0]
+        self.assertEqual(generated_text, "A blue bus parked on the side of a road.")
+
+
+"""
+    @slow
+    @require_torch_accelerator
+    def test_model_8b_chat_greedy_generation_bus_color(self):
+        EXPECTED_TEXT_COMPLETION = "The bus is blue.\n|ENDOFTEXT|"
+        text_prompt_bus_color = "What color is the bus?\n"
+        model_inputs_bus_color = self.processor(text=text_prompt_bus_color, images=self.bus_image_pil)
+
+        generated_tokens = self.model.generate(**model_inputs_bus_color, max_new_tokens=10)
+        text = self.processor.tokenizer.batch_decode(generated_tokens)
+        end_sequence = text[0].split("\x04")[1]
+        clean_sequence = (
+            end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
+            if "|ENDOFTEXT|" in end_sequence
+            else end_sequence
+        )
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence)
+
+    @slow
+    @require_torch_accelerator
+    def test_model_8b_chat_greedy_generation_chart_vqa(self):
+        EXPECTED_TEXT_TOKENS = ["The","life expectancy","at","birth","of male","s in","","20","18","is","","80",".","7",".","\n","|ENDOFTEXT|",]  # fmt: skip
+        expected_text_completion = " ".join(EXPECTED_TEXT_TOKENS)  # TODO make sure the end string matches
+
+        text_prompt_chart_vqa = "What is the highest life expectancy at birth of male?\n"
+
+        chart_image_url = (
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/chart.png"
+        )
+        chart_image_pil = Image.open(io.BytesIO(requests.get(chart_image_url).content))
+
+        model_inputs_chart_vqa = self.processor(text=text_prompt_chart_vqa, images=chart_image_pil)
+        generated_tokens = self.model.generate(**model_inputs_chart_vqa, max_new_tokens=10)
+        text = self.processor.tokenizer.batch_decode(generated_tokens)
+        end_sequence = text[0].split("\x04")[1]
+        clean_sequence = (
+            end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
+            if "|ENDOFTEXT|" in end_sequence
+            else end_sequence
+        )
+        self.assertEqual(expected_text_completion, clean_sequence)
+
+    @slow
+    @require_torch_accelerator
+    def test_model_8b_chat_greedy_generation_bounding_box(self):
+        EXPECTED_TEXT_COMPLETION = "\x00194213202244\x01|ENDOFTEXT|"
+        text_prompt_bbox = "When presented with a box, perform OCR to extract text contained within it. If provided with text, generate the corresponding bounding box.\\nWilliams"  # noqa: E231
+
+        bbox_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bbox_sample_image.png"
+        bbox_image_pil = Image.open(io.BytesIO(requests.get(bbox_image_url).content))
+
+        model_inputs_bbox = self.processor(text=text_prompt_bbox, images=bbox_image_pil)
+        generated_tokens = self.model.generate(**model_inputs_bbox, max_new_tokens=10)
+        text = self.processor.tokenizer.batch_decode(generated_tokens)
+        end_sequence = text[0].split("\x04")[1]
+        clean_sequence = (
+            end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")]
+            if "|ENDOFTEXT|" in end_sequence
+            else end_sequence
+        )
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence)
+"""
--- a/tests/models/fuyu/test_processing_fuyu.py
+++ b/tests/models/fuyu/test_processing_fuyu.py
@@ -0,0 +1,418 @@
+import unittest
+
+from transformers import (
+    FuyuImageProcessor,
+    FuyuProcessor,
+    is_torch_available,
+)
+from transformers.image_utils import load_image
+from transformers.testing_utils import require_torch, require_vision
+
+from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
+
+
+if is_torch_available():
+    import torch
+
+    from transformers.models.fuyu.processing_fuyu import construct_full_unpacked_stream, full_unpacked_stream_to_tensor
+
+
+@require_torch
+@require_vision
+class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = FuyuProcessor
+    model_id = "adept/fuyu-8b"
+
+    @classmethod
+    def _setup_test_attributes(cls, processor):
+        cls.text_prompt = "Generate a coco-style caption.\\n"
+        bus_image_url = url_to_local_path(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
+        )
+        cls.bus_image_pil = load_image(bus_image_url)
+
+    @unittest.skip("FuyuProcessor doesn't return typical pixel values for images")
+    def test_image_processor_defaults(self):
+        # Intentionally empty: the `unittest.skip` decorator on this method keeps it from running.
+        pass
+
+    @unittest.skip("FuyuProcessor doesn't return typical pixel values for images")
+    def test_processor_with_multiple_inputs(self):
+        # Intentionally empty: the `unittest.skip` decorator on this method keeps it from running.
+        pass
+
+    def test_get_num_vision_tokens(self):
+        "Tests general functionality of the helper used internally in vLLM"
+
+        processor = self.get_processor()
+
+        output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
+        self.assertTrue("num_image_tokens" in output)
+        self.assertEqual(len(output["num_image_tokens"]), 3)
+
+        self.assertTrue("num_image_patches" in output)
+        self.assertEqual(len(output["num_image_patches"]), 3)
+
+    def test_fuyu_processing(self):
+        """
+        Test to ensure that the standard processing on a gold example matches adept's code.
+        """
+        # fmt: off
+        EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
+        EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 
71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)
+
+        one_image_bus_model_inputs = self.get_processor()(text=self.text_prompt, images=self.bus_image_pil)
+
+        # fmt: on
+        torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS)
+        torch.testing.assert_close(one_image_bus_model_inputs["input_ids"], EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS)
+
+    def test_fuyu_processing_no_image(self):
+        """
+        Test to check processor works with just text input
+        """
+        processor_outputs = self.get_processor()(text=self.text_prompt)
+        tokenizer_outputs = self.get_component("tokenizer")(self.text_prompt)
+        self.assertEqual(processor_outputs["input_ids"], tokenizer_outputs["input_ids"])
+
+    def test_fuyu_processing_no_text(self):
+        """
+        Test to check processor works with just image input
+        """
+        # fmt: off
+        EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([
+            [ 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
+             14,  15,  16,  17,  18,  19,  20,  21,  -1,  22,  23,  24,  25,  26,
+             27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
+             41,  42,  43,  -1,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
+             54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  -1,  66,
+             67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,
+             81,  82,  83,  84,  85,  86,  87,  -1,  88,  89,  90,  91,  92,  93,
+             94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
+             108, 109,  -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+             121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,  -1, 132, 133,
+             134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
+             148, 149, 150, 151, 152, 153,  -1, 154, 155, 156, 157, 158, 159, 160,
+             161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
+             175,  -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
+             188, 189, 190, 191, 192, 193, 194, 195, 196, 197,  -1, 198, 199, 200,
+             201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
+             215, 216, 217, 218, 219,  -1, 220, 221, 222, 223, 224, 225, 226, 227,
+             228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+              -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+             255, 256, 257, 258, 259, 260, 261, 262, 263,  -1, 264, 265, 266, 267,
+             268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
+             282, 283, 284, 285,  -1, 286, 287, 288, 289, 290, 291, 292, 293, 294,
+             295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307,  -1,
+             -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1]
+        ]).to(torch.int64)
+        # fmt: on
+
+        processor_outputs = self.get_processor()(images=self.bus_image_pil)
+        self.assertTrue((processor_outputs["image_patches_indices"] == EXPECTED_IMAGE_PATCH_INPUTS).all())
+
+    def test_fuyu_processing_multiple_image_sample(self):
+        """
+        Test to check processor works with multiple image inputs for a single text input
+        """
+        # fmt: off
+        SINGLE_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
+        SINGLE_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 
71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)
+
+        SINGLE_RESIZED_IMAGE_PATCH_INPUTS = torch.Tensor([[ 0,  1,  2, -1,  3,  4,  5, -1,  6,  7,  8, -1,  9, 10, 11, -1, 12, 13, 14, -1, 15, 16, 17, -1, 18, 19, 20, -1, 21, 22, 23, -1, 24, 25, 26, -1, 27, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]])
+        SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[ 71011,  71011,  71011,  71019,  71011,  71011,  71011,  71019,  71011, 71011,  71011,  71019,  71011,  71011,  71011,  71019,  71011,  71011, 71011,  71019,  71011,  71011,  71011,  71019,  71011,  71011,  71011, 71019,  71011,  71011,  71011,  71019,  71011,  71011,  71011,  71019, 71011,  71011,  71011,  71019,      1, 128340,  71374,  71389, 120412, 71377,  71835,  71374,  73615,  71375,  71399,  71435,  71122]])
+        # fmt: on
+
+        # Batch of two images - equally sized
+        images = [self.bus_image_pil, self.bus_image_pil]
+        processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)
+
+        self.assertTrue(
+            (
+                processor_outputs["image_patches_indices"]
+                == torch.cat([SINGLE_IMAGE_PATCH_INPUTS, SINGLE_IMAGE_PATCH_INPUTS], dim=0)
+            ).all()
+        )
+        self.assertTrue(
+            (
+                processor_outputs["input_ids"]
+                == torch.cat([SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, SINGLE_PADDED_UNPACKED_TOKEN_INPUTS], dim=0)
+            ).all()
+        )
+
+        # Processes single images with different sizes as expected
+        images = [self.bus_image_pil]
+        processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
+        self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all())
+        self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all())
+
+        images = [self.bus_image_pil.resize((64, 300))]
+        processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
+        self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all())
+        self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all())
+
+        # Batch of two images - different sizes. Left-pads the smaller image inputs
+        images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))]
+        processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)
+
+        padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1]
+        padded_single_resized_image_patch = torch.cat(
+            [torch.ones([1, padding_len_patch]) * -1, SINGLE_RESIZED_IMAGE_PATCH_INPUTS], dim=1
+        )
+        expected_image_patch_inputs = torch.cat([SINGLE_IMAGE_PATCH_INPUTS, padded_single_resized_image_patch], dim=0)
+
+        padding_len_token = (
+            SINGLE_PADDED_UNPACKED_TOKEN_INPUTS.shape[1] - SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS.shape[1]
+        )
+        padded_single_resized_padded_unpacked_token_inputs = torch.cat(
+            [torch.zeros([1, padding_len_token]), SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS], dim=1
+        )
+        expected_padded_unpacked_token_inputs = torch.cat(
+            [SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, padded_single_resized_padded_unpacked_token_inputs], dim=0
+        )
+
+        self.assertTrue((processor_outputs["image_patches_indices"] == expected_image_patch_inputs).all())
+        self.assertTrue((processor_outputs["input_ids"] == expected_padded_unpacked_token_inputs).all())
+
+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_vision
+    @require_torch
+    def test_kwargs_overrides_default_tokenizer_kwargs(self):
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = self.prepare_text_inputs()
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        inputs = processor(
+            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
+        )
+        self.assertEqual(len(inputs["input_ids"][0]), 112)
+
+    @unittest.skip("Fuyu processor does not support image_processor kwargs")
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        # Intentionally empty: the `unittest.skip` decorator on this method keeps it from running.
+        pass
+
+    @unittest.skip("Fuyu processor does not support image_processor kwargs")
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        # Intentionally empty: the `unittest.skip` decorator on this method keeps it from running.
+        pass
+
+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_vision
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs(self):
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = self.prepare_text_inputs()
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(len(inputs["input_ids"][0]), 117)
+
+    # Rewrite as Fuyu image processor does not return pixel values
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # Rewrite as Fuyu image processor does not return pixel values
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = self.prepare_text_inputs()
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # Rewrite as Fuyu supports tokenizer kwargs only when image is None.
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs(batch_size=2)
+        # Fuyu uses tokenizer kwargs only when image is None.
+        image_input = None
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(len(inputs["input_ids"][0]), 7)
+
+    def test_processor_text_has_no_visual(self):
+        # Overwritten: Fuyu has a complicated processing so we don't check id values
+        processor = self.get_processor()
+
+        text = self.prepare_text_inputs(batch_size=3, modalities="image")
+        image_inputs = self.prepare_image_inputs(batch_size=3)
+        processing_kwargs = {"return_tensors": "pt", "padding": True, "multi_page": True}
+
+        # Call with nested list of vision inputs
+        image_inputs_nested = [[image] if not isinstance(image, list) else image for image in image_inputs]
+        inputs_dict_nested = {"text": text, "images": image_inputs_nested}
+        inputs = processor(**inputs_dict_nested, **processing_kwargs)
+        self.assertTrue(self.text_input_name in inputs)
+
+        # Call with one of the samples with no associated vision input
+        plain_text = "lower newer"
+        image_inputs_nested[0] = []
+        text[0] = plain_text
+        inputs_dict_no_vision = {"text": text, "images": image_inputs_nested}
+        inputs_nested = processor(**inputs_dict_no_vision, **processing_kwargs)
+        self.assertTrue(self.text_input_name in inputs_nested)
+
+
+@require_torch
+class TestImageTextProcessingUtils(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 2
+        self.new_seq_len = 8
+        self.num_sub_sequences = 1
+
+        self.all_bi_tokens_to_place = [4, 6]
+        self.full_unpacked_stream = [torch.tensor([1, 2, 3, 4]), torch.tensor([5, 6, 7, 8, 9, 10])]
+        self.fill_value = 0
+
+        self.num_real_text_tokens = [[3, 2], [2, 4]]
+        # Here the input stream is padded to avoid inconsistencies (current model release matches)
+        self.input_stream = torch.tensor([[[1, 2, 3], [4, 5, 0]], [[6, 7, 0], [8, 9, 10]]])
+        self.image_tokens = [
+            [torch.tensor([1, 2]), torch.tensor([3])],
+            [torch.tensor([4, 5, 6]), torch.tensor([7, 8])],
+        ]
+
+    def test_full_unpacked_stream_to_tensor(self):
+        result = full_unpacked_stream_to_tensor(
+            self.all_bi_tokens_to_place,
+            self.full_unpacked_stream,
+            self.fill_value,
+            self.batch_size,
+            self.new_seq_len,
+            offset=0,
+        )
+        EXPECTED_TENSOR = torch.tensor([[1, 2, 3, 4, 0, 0, 0, 0], [5, 6, 7, 8, 9, 10, 0, 0]])
+        self.assertTrue(torch.equal(result, EXPECTED_TENSOR))
+
+    def test_construct_full_unpacked_stream(self):
+        result = construct_full_unpacked_stream(
+            self.num_real_text_tokens, self.input_stream, self.image_tokens, self.batch_size, self.num_sub_sequences
+        )
+        EXPECTED_UNPACKED_STREAM = [torch.tensor([1, 2, 1, 2, 3]), torch.tensor([4, 5, 6, 6, 7])]
+        for i in range(len(result)):
+            self.assertTrue(torch.equal(result[i], EXPECTED_UNPACKED_STREAM[i]))
+
+
+@require_torch
+class TestProcessImagesForModelInput(unittest.TestCase):
+    def setUp(self):
+        """
+        Adding a mix of present and absent images.
+        """
+
+        self.image_input = torch.randn([1, 1, 3, 64, 64])
+        self.image_present = torch.tensor([[1]])
+        self.image_unpadded_h = torch.tensor([[45]])  # Adjusted for subsequence of 1
+        self.image_unpadded_w = torch.tensor([[50]])  # Adjusted for subsequence of 1
+        self.image_patch_dim_h = 16
+        self.image_patch_dim_w = 16
+        self.image_placeholder_id = 999
+        self.image_newline_id = 888
+        self.variable_sized = True
+        self.image_processor = FuyuImageProcessor(
+            patch_size={"height": self.image_patch_dim_h, "width": self.image_patch_dim_w}
+        )
+
+    def test_process_images_for_model_input_fixed_sized(self):
+        self.variable_sized = False
+        result = self.image_processor.preprocess_with_tokenizer_info(
+            image_input=self.image_input,
+            image_present=self.image_present,
+            image_unpadded_h=self.image_unpadded_h,
+            image_unpadded_w=self.image_unpadded_w,
+            image_placeholder_id=self.image_placeholder_id,
+            image_newline_id=self.image_newline_id,
+            variable_sized=self.variable_sized,
+        )
+        self.assertEqual(result["images"][0][0].shape, torch.Size([3, 64, 64]))