first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/pix2struct/__init__.py
+++ b/tests/models/pix2struct/__init__.py
--- a/tests/models/pix2struct/test_image_processing_pix2struct.py
+++ b/tests/models/pix2struct/test_image_processing_pix2struct.py
@@ -0,0 +1,440 @@
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+
+from transformers.image_utils import load_image
+from transformers.testing_utils import require_torch, require_torch_accelerator, require_vision, slow, torch_device
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_processing_common import url_to_local_path
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+
+class Pix2StructImageProcessingTester:
+    """Bundle the settings and input factories used by the Pix2Struct image processor tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=400,
+        size=None,
+        do_normalize=True,
+        do_convert_rgb=True,
+        patch_size=None,
+    ):
+        # Build the dict defaults per instance so that testers never share a mutable default.
+        if size is None:
+            size = {"height": 20, "width": 20}
+        if patch_size is None:
+            patch_size = {"height": 16, "width": 16}
+
+        self.parent = parent
+
+        # Settings forwarded to the shared `prepare_image_inputs` helper.
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+
+        self.image_size = image_size
+        self.size = size
+        self.patch_size = patch_size
+        self.max_patches = [512, 1024, 2048, 4096]
+
+        # Flags returned by `prepare_image_processor_dict`.
+        self.do_normalize = do_normalize
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_image_processor_dict(self):
+        """Return the keyword arguments used to instantiate an image processor."""
+        processor_kwargs = {"do_normalize": self.do_normalize, "do_convert_rgb": self.do_convert_rgb}
+        return processor_kwargs
+
+    def prepare_dummy_image(self):
+        """Load the reference "australia.jpg" picture and return it converted to RGB."""
+        image_location = url_to_local_path(
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
+        )
+        return load_image(image_location).convert("RGB")
+
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        """Build a batch of images with the shared `prepare_image_inputs` helper and this tester's settings."""
+        generation_settings = {
+            "batch_size": self.batch_size,
+            "num_channels": self.num_channels,
+            "min_resolution": self.min_resolution,
+            "max_resolution": self.max_resolution,
+        }
+        return prepare_image_inputs(
+            **generation_settings,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+
+
+@require_torch
+@require_vision
+class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    def setUp(self):
+        # Run the inherited setup first, then attach the Pix2Struct-specific tester.
+        super().setUp()
+        tester = Pix2StructImageProcessingTester(parent=self)
+        self.image_processor_tester = tester
+
+    @property
+    def image_processor_dict(self):
+        """Keyword arguments used to build each image processor under test."""
+        tester = self.image_processor_tester
+        return tester.prepare_image_processor_dict()
+
+    @require_vision
+    @require_torch
+    def test_backends_equivalence(self):
+        """Check that every backend yields equivalent `flattened_patches` for a single image.
+
+        Overrides the inherited test to compare `flattened_patches` instead of `pixel_values`.
+        The test is skipped when fewer than two backends are registered.
+        """
+        if len(self.image_processing_classes) < 2:
+            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
+
+        import io
+
+        import httpx
+        from PIL import Image
+
+        response = httpx.get("http://images.cocodataset.org/val2017/000000039769.jpg", follow_redirects=True)
+        # Surface a failed download as an HTTP error rather than as an opaque PIL decoding
+        # failure on whatever error body the server returned.
+        response.raise_for_status()
+        dummy_image = Image.open(io.BytesIO(response.content))
+
+        # Run the same image through one processor per backend
+        encodings = {}
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            encodings[backend_name] = image_processor(dummy_image, return_tensors="pt", max_patches=2048)
+
+        # Compare all backends to the first one (reference backend)
+        backend_names = list(encodings.keys())
+        reference_backend = backend_names[0]
+        reference_encoding = encodings[reference_backend].flattened_patches
+        for backend_name in backend_names[1:]:
+            current_encoding = encodings[backend_name].flattened_patches
+            self._assert_tensors_equivalence(reference_encoding, current_encoding)
+
+    @require_vision
+    @require_torch
+    def test_backends_equivalence_batched(self):
+        """Check that all backends agree on `flattened_patches` for a batch of images.
+
+        Overrides the inherited test to compare `flattened_patches` instead of `pixel_values`.
+        """
+        backend_classes = self.image_processing_classes
+        if len(backend_classes) < 2:
+            self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
+
+        # Torch inputs of differing resolutions, shared by all backends.
+        batch = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+
+        encodings_by_backend = {}
+        for name, processor_class in backend_classes.items():
+            processor = processor_class(**self.image_processor_dict)
+            encodings_by_backend[name] = processor(batch, return_tensors="pt", max_patches=2048)
+
+        # The first backend serves as the reference for every other one.
+        reference_name, *other_names = encodings_by_backend
+        reference_patches = encodings_by_backend[reference_name].flattened_patches
+        for name in other_names:
+            self._assert_tensors_equivalence(reference_patches, encodings_by_backend[name].flattened_patches)
+
+    def test_image_processor_properties(self):
+        """Every backend's processor must expose the two configured flags as attributes."""
+        for processor_class in self.image_processing_classes.values():
+            processor = processor_class(**self.image_processor_dict)
+            for attribute in ("do_normalize", "do_convert_rgb"):
+                self.assertTrue(hasattr(processor, attribute))
+
+    def test_expected_patches(self):
+        """Pin the mean of `flattened_patches` for the reference image at 2048 patches."""
+        reference_image = self.image_processor_tester.prepare_dummy_image()
+        max_patch = 2048
+
+        for processor_class in self.image_processing_classes.values():
+            processor = processor_class(**self.image_processor_dict)
+            encoding = processor(reference_image, return_tensors="pt", max_patches=max_patch)
+
+            observed_mean = encoding.flattened_patches.mean()
+            torch.testing.assert_close(observed_mean, torch.tensor(0.0606), rtol=1e-3, atol=1e-3)
+
+    def test_call_pil(self):
+        """Check the `flattened_patches` shape for single and batched PIL inputs at every `max_patches` value."""
+        tester = self.image_processor_tester
+
+        for processor_class in self.image_processing_classes.values():
+            processor = processor_class(**self.image_processor_dict)
+
+            # Random PIL images of varying resolution
+            pil_images = tester.prepare_image_inputs(equal_resolution=False)
+            for pil_image in pil_images:
+                self.assertIsInstance(pil_image, Image.Image)
+
+            # Last dimension: patch height * patch width * channels, plus 2
+            # (presumably the per-patch row/column indices -- confirm against the processor).
+            patch_area = tester.patch_size["height"] * tester.patch_size["width"]
+            expected_hidden_dim = patch_area * tester.num_channels + 2
+
+            for max_patch in tester.max_patches:
+                # Single image
+                single_encoding = processor(pil_images[0], return_tensors="pt", max_patches=max_patch)
+                self.assertEqual(
+                    single_encoding.flattened_patches.shape,
+                    (1, max_patch, expected_hidden_dim),
+                )
+
+                # Whole batch
+                batch_encoding = processor(pil_images, return_tensors="pt", max_patches=max_patch)
+                self.assertEqual(
+                    batch_encoding.flattened_patches.shape,
+                    (tester.batch_size, max_patch, expected_hidden_dim),
+                )
+
+    def test_call_vqa(self):
+        """Check VQA mode: a call without `header_text` raises, a call with it yields the usual shapes."""
+        tester = self.image_processor_tester
+        header_text = "Hello"
+
+        for processor_class in self.image_processing_classes.values():
+            processor = processor_class(**self.image_processor_dict)
+
+            # Random PIL images of varying resolution
+            pil_images = tester.prepare_image_inputs(equal_resolution=False)
+            for pil_image in pil_images:
+                self.assertIsInstance(pil_image, Image.Image)
+
+            # Last dimension: patch height * patch width * channels, plus 2
+            # (presumably the per-patch row/column indices -- confirm against the processor).
+            patch_area = tester.patch_size["height"] * tester.patch_size["width"]
+            expected_hidden_dim = patch_area * tester.num_channels + 2
+
+            # Switch the already-built processor into VQA mode.
+            processor.is_vqa = True
+
+            for max_patch in tester.max_patches:
+                # Without a header text the call must be rejected
+                with self.assertRaises(ValueError):
+                    _ = processor(pil_images[0], return_tensors="pt", max_patches=max_patch).flattened_patches
+
+                # Single image with header text
+                single_encoding = processor(
+                    pil_images[0], return_tensors="pt", max_patches=max_patch, header_text=header_text
+                )
+                self.assertEqual(
+                    single_encoding.flattened_patches.shape,
+                    (1, max_patch, expected_hidden_dim),
+                )
+
+                # Whole batch with header text
+                batch_encoding = processor(
+                    pil_images, return_tensors="pt", max_patches=max_patch, header_text=header_text
+                )
+                self.assertEqual(
+                    batch_encoding.flattened_patches.shape,
+                    (tester.batch_size, max_patch, expected_hidden_dim),
+                )
+
+    def test_call_numpy(self):
+        """Check the `flattened_patches` shape for single and batched numpy inputs at every `max_patches` value."""
+        tester = self.image_processor_tester
+
+        for processor_class in self.image_processing_classes.values():
+            processor = processor_class(**self.image_processor_dict)
+
+            # Random numpy arrays of varying resolution
+            numpy_images = tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+            for numpy_image in numpy_images:
+                self.assertIsInstance(numpy_image, np.ndarray)
+
+            # Last dimension: patch height * patch width * channels, plus 2
+            # (presumably the per-patch row/column indices -- confirm against the processor).
+            patch_area = tester.patch_size["height"] * tester.patch_size["width"]
+            expected_hidden_dim = patch_area * tester.num_channels + 2
+
+            for max_patch in tester.max_patches:
+                # Single image
+                single_encoding = processor(numpy_images[0], return_tensors="pt", max_patches=max_patch)
+                self.assertEqual(
+                    single_encoding.flattened_patches.shape,
+                    (1, max_patch, expected_hidden_dim),
+                )
+
+                # Whole batch
+                batch_encoding = processor(numpy_images, return_tensors="pt", max_patches=max_patch)
+                self.assertEqual(
+                    batch_encoding.flattened_patches.shape,
+                    (tester.batch_size, max_patch, expected_hidden_dim),
+                )
+
+    def test_call_numpy_4_channels(self):
+        """Check the `flattened_patches` shape for 4-channel, channels-last numpy inputs.
+
+        The tester's `num_channels` is temporarily switched to 4 and restored afterwards,
+        including when an assertion fails part-way through.
+        """
+        tester = self.image_processor_tester
+        original_num_channels = tester.num_channels
+
+        for image_processing_class in self.image_processing_classes.values():
+            # Initialize image_processor
+            image_processor = image_processing_class(**self.image_processor_dict)
+
+            tester.num_channels = 4
+            try:
+                # create random numpy tensors
+                image_inputs = tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+                for image in image_inputs:
+                    self.assertIsInstance(image, np.ndarray)
+
+                # Computed while `num_channels` is 4, so the expected size covers all four channels.
+                expected_hidden_dim = (
+                    (tester.patch_size["height"] * tester.patch_size["width"]) * tester.num_channels
+                ) + 2
+
+                for max_patch in tester.max_patches:
+                    # Test not batched input
+                    encoded_images = image_processor(
+                        image_inputs[0], return_tensors="pt", max_patches=max_patch, input_data_format="channels_last"
+                    ).flattened_patches
+                    self.assertEqual(
+                        encoded_images.shape,
+                        (1, max_patch, expected_hidden_dim),
+                    )
+
+                    # Test batched
+                    encoded_images = image_processor(
+                        image_inputs, return_tensors="pt", max_patches=max_patch, input_data_format="channels_last"
+                    ).flattened_patches
+                    self.assertEqual(
+                        encoded_images.shape,
+                        (tester.batch_size, max_patch, expected_hidden_dim),
+                    )
+            finally:
+                # Restore the saved value rather than a hard-coded 3, even on failure.
+                tester.num_channels = original_num_channels
+
+    def test_call_pytorch(self):
+        """Check the `flattened_patches` shape for single and batched torch inputs at every `max_patches` value."""
+        tester = self.image_processor_tester
+
+        for processor_class in self.image_processing_classes.values():
+            processor = processor_class(**self.image_processor_dict)
+
+            # Random PyTorch tensors of varying resolution
+            tensor_images = tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+            for tensor_image in tensor_images:
+                self.assertIsInstance(tensor_image, torch.Tensor)
+
+            # Last dimension: patch height * patch width * channels, plus 2
+            # (presumably the per-patch row/column indices -- confirm against the processor).
+            patch_area = tester.patch_size["height"] * tester.patch_size["width"]
+            expected_hidden_dim = patch_area * tester.num_channels + 2
+
+            for max_patch in tester.max_patches:
+                # Single image
+                single_encoding = processor(tensor_images[0], return_tensors="pt", max_patches=max_patch)
+                self.assertEqual(
+                    single_encoding.flattened_patches.shape,
+                    (1, max_patch, expected_hidden_dim),
+                )
+
+                # Whole batch
+                batch_encoding = processor(tensor_images, return_tensors="pt", max_patches=max_patch)
+                self.assertEqual(
+                    batch_encoding.flattened_patches.shape,
+                    (tester.batch_size, max_patch, expected_hidden_dim),
+                )
+
+    @slow
+    @require_torch_accelerator
+    @require_vision
+    def test_can_compile_torchvision_backend(self):
+        """Compare eager and `torch.compile`d outputs of the torchvision-backend processor."""
+        torchvision_class = self.image_processing_classes.get("torchvision") if False else None
+        if "torchvision" not in self.image_processing_classes:
+            self.skipTest("Skipping compilation test as torchvision backend is not available")
+        torchvision_class = self.image_processing_classes["torchvision"]
+
+        torch.compiler.reset()
+        sample = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
+        call_kwargs = {"device": torch_device, "return_tensors": "pt"}
+
+        eager_processor = torchvision_class(**self.image_processor_dict)
+        eager_output = eager_processor(sample, **call_kwargs)
+
+        compiled_processor = torch.compile(eager_processor, mode="reduce-overhead")
+        compiled_output = compiled_processor(sample, **call_kwargs)
+
+        # Pix2Struct uses flattened_patches instead of pixel_values
+        self._assert_tensors_equivalence(
+            eager_output.flattened_patches, compiled_output.flattened_patches, atol=1e-4, rtol=1e-4, mean_atol=1e-5
+        )
+
+
+@require_torch
+@require_vision
+class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
+    """Run the image processor tests with a tester configured for 4-channel inputs.
+
+    Only the properties check and the PIL test are active; the remaining overrides are
+    skipped, each with its own reason.
+    """
+
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = Pix2StructImageProcessingTester(self, num_channels=4)
+        # NOTE(review): presumably read by the inherited mixin tests -- not used in this class.
+        self.expected_encoded_image_num_channels = 3
+
+    @property
+    def image_processor_dict(self):
+        """Keyword arguments used to build each image processor under test."""
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def test_image_processor_properties(self):
+        """Every backend's processor must expose the two configured flags as attributes."""
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processor, "do_normalize"))
+            self.assertTrue(hasattr(image_processor, "do_convert_rgb"))
+
+    def test_call_pil(self):
+        """Check the `flattened_patches` shape for single and batched 4-channel PIL inputs."""
+        for image_processing_class in self.image_processing_classes.values():
+            # Initialize image_processor
+            image_processor = image_processing_class(**self.image_processor_dict)
+            # create random PIL images
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
+            for image in image_inputs:
+                self.assertIsInstance(image, Image.Image)
+
+            # The expected size uses one channel fewer than the inputs have (presumably because
+            # `do_convert_rgb` drops the fourth channel -- confirm against the processor).
+            expected_hidden_dim = (
+                (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"])
+                * (self.image_processor_tester.num_channels - 1)
+            ) + 2
+
+            for max_patch in self.image_processor_tester.max_patches:
+                # Test not batched input
+                encoded_images = image_processor(
+                    image_inputs[0], return_tensors="pt", max_patches=max_patch
+                ).flattened_patches
+                self.assertEqual(
+                    encoded_images.shape,
+                    (1, max_patch, expected_hidden_dim),
+                )
+
+                # Test batched
+                encoded_images = image_processor(
+                    image_inputs, return_tensors="pt", max_patches=max_patch
+                ).flattened_patches
+                self.assertEqual(
+                    encoded_images.shape,
+                    (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim),
+                )
+
+    @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet")  # FIXME Amy
+    def test_call_numpy(self):
+        return super().test_call_numpy()
+
+    @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet")  # FIXME Amy
+    def test_call_pytorch(self):
+        # Delegate to the parent test this override replaces, so removing the skip runs the right test.
+        return super().test_call_pytorch()
+
+    @unittest.skip(
+        reason="Pix2StructImageProcessor does not treat numpy and PIL 4 channel images consistently"
+    )  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        # Delegate to the parent test this override replaces, so removing the skip runs the right test.
+        return super().test_call_numpy_4_channels()
+
+    @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet")
+    def test_backends_equivalence(self):
+        pass
+
+    @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet")
+    def test_backends_equivalence_batched(self):
+        pass
+
+    @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet")
+    def test_can_compile_torchvision_backend(self):
+        pass
--- a/tests/models/pix2struct/test_modeling_pix2struct.py
+++ b/tests/models/pix2struct/test_modeling_pix2struct.py
@@ -0,0 +1,762 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Pix2Struct model."""
+
+import copy
+import inspect
+import tempfile
+import unittest
+
+import numpy as np
+import pytest
+import requests
+
+from transformers import Pix2StructConfig, Pix2StructTextConfig, Pix2StructVisionConfig
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    ModelTesterMixin,
+    floats_tensor,
+    ids_tensor,
+    random_attention_mask,
+)
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+    from transformers import (
+        Pix2StructForConditionalGeneration,
+        Pix2StructProcessor,
+        Pix2StructTextModel,
+        Pix2StructVisionModel,
+    )
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+class Pix2StructVisionModelTester:
+    """Create small `Pix2StructVisionConfig` objects and matching random inputs for the vision model tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        image_size=30,
+        patch_size=2,
+        num_channels=3,
+        is_training=True,
+        hidden_size=12,
+        patch_embed_hidden_size=12,
+        projection_dim=32,
+        max_patches=64,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        initializer_range=1e-10,
+        scope=None,
+    ):
+        self.parent = parent
+        self.scope = scope
+        self.is_training = is_training
+        self.batch_size = batch_size
+
+        # Input geometry
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.max_patches = max_patches
+        # The patch count doubles as the sequence length checked in `create_and_check_model`.
+        self.seq_length = max_patches
+        # Size of the last dimension of the random `flattened_patches` built by this tester.
+        self.patch_proj_dim = patch_size**2 * num_channels + 2
+
+        # Model dimensions
+        self.hidden_size = hidden_size
+        self.patch_embed_hidden_size = patch_embed_hidden_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+
+        # Regularisation and initialisation
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+
+    def prepare_config_and_inputs(self):
+        """Return `(config, flattened_patches)` with random patches of shape (batch, max_patches, patch_proj_dim)."""
+        patches_shape = [self.batch_size, self.max_patches, self.patch_proj_dim]
+        flattened_patches = floats_tensor(patches_shape)
+        return self.get_config(), flattened_patches
+
+    def get_config(self):
+        """Build a `Pix2StructVisionConfig` from this tester's attributes."""
+        config_kwargs = {
+            "image_size": self.image_size,
+            "patch_size": self.patch_size,
+            "num_channels": self.num_channels,
+            "hidden_size": self.hidden_size,
+            "projection_dim": self.projection_dim,
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "intermediate_size": self.intermediate_size,
+            "dropout": self.dropout,
+            "attention_dropout": self.attention_dropout,
+            "initializer_range": self.initializer_range,
+            "patch_embed_hidden_size": self.patch_embed_hidden_size,
+        }
+        return Pix2StructVisionConfig(**config_kwargs)
+
+    def create_and_check_model(self, config, flattened_patches):
+        """Run a forward pass in eval mode and assert the shape of `last_hidden_state`."""
+        model = Pix2StructVisionModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            outputs = model(flattened_patches)
+
+        expected_shape = (self.batch_size, self.seq_length, self.hidden_size)
+        self.parent.assertEqual(outputs.last_hidden_state.shape, expected_shape)
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)`, adding a random 0/1 attention mask to the patches."""
+        config, flattened_patches = self.prepare_config_and_inputs()
+        random_mask = torch.randint(0, 2, (self.batch_size, self.max_patches))
+        inputs_dict = {
+            "flattened_patches": flattened_patches,
+            "attention_mask": random_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class Pix2StructVisionModelTest(ModelTesterMixin, unittest.TestCase):
+    """
+    Common model tests for `Pix2StructVisionModel`. Several tests from test_modeling_common.py are overwritten
+    here, as Pix2Struct does not use input_ids, inputs_embeds, attention_mask and seq_length.
+    """
+
+    all_model_classes = (Pix2StructVisionModel,) if is_torch_available() else ()
+
+    test_resize_embeddings = False
+
+    def setUp(self):
+        self.model_tester = Pix2StructVisionModelTester(self)
+        config_tester = ConfigTester(
+            self, config_class=Pix2StructVisionConfig, has_text_modality=False, hidden_size=32
+        )
+        self.config_tester = config_tester
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @unittest.skip(reason="Pix2StructVision does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    def test_model_get_set_embeddings(self):
+        # Input embeddings must be a module; output embeddings are either absent or a linear layer.
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), nn.Module)
+            output_embeddings = model.get_output_embeddings()
+            self.assertTrue(output_embeddings is None or isinstance(output_embeddings, nn.Linear))
+
+    def test_forward_signature(self):
+        # `flattened_patches` must be the first parameter of `forward`.
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            # Parameters keep their declaration order, so the first name is deterministic.
+            parameter_names = list(inspect.signature(model.forward).parameters)
+            self.assertListEqual(parameter_names[:1], ["flattened_patches"])
+
+    def test_model(self):
+        config, flattened_patches = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(config, flattened_patches)
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        pass
+
+    @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        # Loading the published checkpoint must yield a model object.
+        checkpoint = "google/pix2struct-textcaps-base"
+        self.assertIsNotNone(Pix2StructVisionModel.from_pretrained(checkpoint))
+
+
+class Pix2StructTextModelTester:
+    """Create small `Pix2StructTextConfig` objects and matching random inputs for the text model tests."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=12,
+        projection_dim=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        max_position_embeddings=512,
+        initializer_range=0.02,
+        bos_token_id=0,
+        scope=None,
+    ):
+        self.parent = parent
+        self.scope = scope
+
+        # Test-control switches
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_labels = use_labels
+
+        # Input geometry
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.vocab_size = vocab_size
+        self.bos_token_id = bos_token_id
+        self.max_position_embeddings = max_position_embeddings
+
+        # Model dimensions
+        self.hidden_size = hidden_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        # Passed to the config as `d_kv`: the hidden size split evenly across the heads.
+        self.d_kv = hidden_size // num_attention_heads
+
+        # Regularisation and initialisation
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+
+    def prepare_config_and_inputs(self):
+        """Return `(config, input_ids, input_mask)`; the mask is `None` when `use_input_mask` is false."""
+        ids_shape = [self.batch_size, self.seq_length]
+        input_ids = ids_tensor(ids_shape, self.vocab_size)
+
+        input_mask = random_attention_mask([self.batch_size, self.seq_length]) if self.use_input_mask else None
+
+        if input_mask is not None:
+            # Overwrite every row with a run of ones followed by zeros, cut at a random index.
+            num_rows, num_cols = input_mask.shape
+            cut_points = np.random.randint(1, num_cols - 1, size=(num_rows,))
+            for row, cut in enumerate(cut_points):
+                input_mask[row, :cut] = 1
+                input_mask[row, cut:] = 0
+
+        return self.get_config(), input_ids, input_mask
+
+    def get_config(self):
+        """Build a `Pix2StructTextConfig` from this tester's attributes."""
+        config_kwargs = {
+            "vocab_size": self.vocab_size,
+            "hidden_size": self.hidden_size,
+            "projection_dim": self.projection_dim,
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "intermediate_size": self.intermediate_size,
+            "dropout": self.dropout,
+            "attention_dropout": self.attention_dropout,
+            "max_position_embeddings": self.max_position_embeddings,
+            "initializer_range": self.initializer_range,
+            "bos_token_id": self.bos_token_id,
+            "d_kv": self.d_kv,
+        }
+        return Pix2StructTextConfig(**config_kwargs)
+
+    def create_and_check_model(self, config, input_ids, input_mask):
+        """Run the model with and without a mask and assert the logits shape of the unmasked call."""
+        model = Pix2StructTextModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            # The masked call's output is discarded; only the unmasked output is checked.
+            model(input_ids, attention_mask=input_mask)
+            outputs = model(input_ids)
+
+        expected_shape = (self.batch_size, self.seq_length, self.vocab_size)
+        self.parent.assertEqual(outputs.logits.shape, expected_shape)
+
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)` with `input_ids` and `attention_mask` entries."""
+        config, input_ids, input_mask = self.prepare_config_and_inputs()
+        return config, {"input_ids": input_ids, "attention_mask": input_mask}
+
+
+@require_torch
+class Pix2StructTextModelTest(ModelTesterMixin, unittest.TestCase):
+    """Common model tests for `Pix2StructTextModel`; the standalone training tests are skipped."""
+
+    all_model_classes = (Pix2StructTextModel,) if is_torch_available() else ()
+
+    def setUp(self):
+        self.model_tester = Pix2StructTextModelTester(self)
+        config_tester = ConfigTester(self, config_class=Pix2StructTextConfig, hidden_size=32)
+        self.config_tester = config_tester
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config, input_ids, input_mask = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(config, input_ids, input_mask)
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        pass
+
+    @unittest.skip(reason="Pix2Struct does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        # Loading the published checkpoint must yield a model object.
+        checkpoint = "google/pix2struct-textcaps-base"
+        self.assertIsNotNone(Pix2StructTextModel.from_pretrained(checkpoint))
+
+
+class Pix2StructModelTester:
+    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
+        if text_kwargs is None:
+            text_kwargs = {}
+        if vision_kwargs is None:
+            vision_kwargs = {}
+
+        self.parent = parent
+        self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs)
+        self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs)
+        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
+        self.seq_length = self.text_model_tester.seq_length  # need seq_length for common tests
+        self.is_training = is_training
+        self.max_patches = self.vision_model_tester.max_patches
+
+    def prepare_config_and_inputs(self):
+        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
+        vision_config, flattened_patches = self.vision_model_tester.prepare_config_and_inputs()
+
+        config = self.get_config(text_config, vision_config)
+
+        return config, input_ids, attention_mask, flattened_patches
+
+    def get_config(self, text_config, vision_config):
+        return Pix2StructConfig(
+            text_config=self.text_model_tester.get_config().to_dict(),
+            vision_config=self.vision_model_tester.get_config().to_dict(),
+            projection_dim=64,
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, decoder_attention_mask, flattened_patches = config_and_inputs
+
+        attention_mask = (flattened_patches.sum(dim=-1) != 0).float()
+
+        inputs_dict = {
+            "decoder_input_ids": input_ids,
+            "labels": input_ids,
+            "decoder_attention_mask": decoder_attention_mask,
+            "flattened_patches": flattened_patches,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class Pix2StructModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    # Common model / generation / pipeline tests for `Pix2StructForConditionalGeneration`, with overrides
+    # where the composite (text + vision) config or the patch-based inputs differ from the mixin defaults.
+    all_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-text-to-text": Pix2StructForConditionalGeneration} if is_torch_available() else {}
+
+    test_resize_embeddings = True
+    test_attention_outputs = False
+
+    def setUp(self):
+        self.model_tester = Pix2StructModelTester(self)
+
+    def test_model(self):
+        # Forward pass with labels: index 1 of the output is compared against the
+        # (batch, seq_length, vocab_size) logits shape.
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+
+            output = model(**input_dict)
+            self.assertEqual(
+                output[1].shape,
+                (
+                    self.model_tester.vision_model_tester.batch_size,
+                    self.model_tester.text_model_tester.seq_length,
+                    self.model_tester.text_model_tester.vocab_size,
+                ),
+            )
+
+    def test_generative_model(self):
+        # Generation must produce the same tokens with and without the KV cache.
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_generative_model_classes:
+            model = model_class(config).eval().to(torch_device)
+
+            output = model.generate(**input_dict, use_cache=False, min_new_tokens=10, max_new_tokens=10)
+            output_use_cache = model.generate(**input_dict, use_cache=True, min_new_tokens=10, max_new_tokens=10)
+
+            torch.testing.assert_close(output, output_use_cache)
+
+    @unittest.skip(reason="Hidden_states is tested in individual model tests")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="Retain_grad is tested in individual model tests")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @unittest.skip(reason="Pix2StructModel does not have input/output embeddings")
+    def test_model_get_set_embeddings(self):
+        pass
+
+    @pytest.mark.generate
+    @unittest.skip(reason="`Pix2Struct` cannot generate with no inputs provided")
+    def test_generate_without_input_ids(self):
+        pass
+
+    def test_forward_signature(self):
+        # Pins the leading positional parameters of `forward`; any extra trailing parameters are not checked.
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = [
+                "flattened_patches",
+                "attention_mask",
+                "decoder_input_ids",
+                "decoder_attention_mask",
+                "encoder_outputs",
+                "past_key_values",
+                "labels",
+                "decoder_inputs_embeds",
+                "use_cache",
+            ]
+
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    def test_training(self):
+        if not self.model_tester.is_training:
+            self.skipTest(reason="model_tester.is_training is set to False")
+
+        # NOTE(review): `all_model_classes` above holds a single class, so `[:-1]` is an empty tuple and
+        # this loop body never executes — the test currently passes without training anything. Confirm
+        # whether that is intentional.
+        for model_class in self.all_model_classes[:-1]:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            config.return_dict = True
+
+            model = model_class(config)
+            model.to(torch_device)
+            model.train()
+            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+
+            # hardcode labels to be the same as input_ids
+            # NOTE(review): the model tester builds `decoder_input_ids`, not `input_ids`; unless
+            # `_prepare_for_class` adds that key this lookup would raise KeyError if the loop ever ran.
+            inputs["labels"] = inputs["input_ids"]
+
+            loss = model(**inputs).loss
+            loss.backward()
+
+    def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
+        # Helper taking optional `gradient_checkpointing_kwargs`, forwarded to `gradient_checkpointing_enable`.
+        if not self.model_tester.is_training:
+            self.skipTest(reason="model_tester.is_training is set to False")
+
+        # NOTE(review): same as `test_training` — with a single model class, `[:-1]` is empty and the body
+        # below is never reached.
+        for model_class in self.all_model_classes[:-1]:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            config.use_cache = False
+            config.return_dict = True
+
+            model = model_class(config)
+            model.to(torch_device)
+            model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
+            model.train()
+            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+
+            # hardcode labels to be the same as input_ids
+            inputs["labels"] = inputs["input_ids"]
+
+            loss = model(**inputs).loss
+            loss.backward()
+
+    # overwrite because `vocab_size` is not an attribute of `Pix2StructConfig` but rather `Pix2StructTextConfig`
+    def test_resize_tokens_embeddings(self):
+        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.test_resize_embeddings:
+            self.skipTest(reason="test_resize_embeddings is set to False")
+
+        for model_class in self.all_model_classes:
+            # Work on a copy so resizing one model class does not leak into the next iteration's config.
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            if self.model_tester.is_training is False:
+                model.eval()
+
+            model_vocab_size = config.text_config.vocab_size
+            # Retrieve the embeddings and clone them
+            model_embed = model.resize_token_embeddings(model_vocab_size)
+            cloned_embeddings = model_embed.weight.clone()
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Decoder input ids should be clamped to the maximum size of the vocabulary
+            if "decoder_input_ids" in inputs_dict:
+                # In-place clamp: this mutates the tensor stored in `inputs_dict` itself.
+                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+            # `zip` stops at the shorter (shrunk) matrix, so only the surviving leading rows are compared.
+            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
+
+            self.assertTrue(models_equal)
+
+    # overwrite because `vocab_size` is not an attribute of `Pix2StructConfig` but rather `Pix2StructTextConfig`
+    def test_resize_embeddings_untied(self):
+        original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.test_resize_embeddings:
+            self.skipTest(reason="test_resize_embeddings is set to False")
+
+        original_config.tie_word_embeddings = False
+
+        # if the model cannot untie its embeddings -> leave test
+        # (only reachable if the config does not keep the `False` assigned just above)
+        if original_config.tie_word_embeddings:
+            self.skipTest(reason="Model cannot untie embeddings")
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config).to(torch_device)
+            model.eval()
+
+            # if no output embeddings -> leave test
+            if model.get_output_embeddings() is None:
+                continue
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_vocab_size = config.text_config.vocab_size
+            model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Decoder input ids should be clamped to the maximum size of the vocabulary
+            if "decoder_input_ids" in inputs_dict:
+                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+    def test_load_vision_text_config(self):
+        # Each sub-config must be loadable on its own from a directory holding the saved composite config.
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Save Pix2StructConfig and check if we can load Pix2StructVisionConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            vision_config = Pix2StructVisionConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
+
+        # Save Pix2StructConfig and check if we can load Pix2StructTextConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            text_config = Pix2StructTextConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
+
+    def _check_encoder_attention_for_generate(self, attentions, batch_size, config, prompt_length):
+        # overwrite because the pix2struct encoder sequence length depends on the image inputs:
+        # the incoming `prompt_length` is replaced by the tester's `max_patches`
+        prompt_length = self.model_tester.max_patches
+        encoder_expected_shape = (batch_size, config.num_attention_heads, prompt_length, prompt_length)
+        self.assertIsInstance(attentions, tuple)
+        self.assertListEqual(
+            [layer_attentions.shape for layer_attentions in attentions],
+            [encoder_expected_shape] * len(attentions),
+        )
+
+    def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, config, prompt_length):
+        # overwrite because the pix2struct encoder sequence length depends on the image inputs:
+        # the incoming `prompt_length` is replaced by the tester's `max_patches`
+        prompt_length = self.model_tester.max_patches
+        encoder_expected_shape = (batch_size, prompt_length, config.hidden_size)
+        self.assertIsInstance(hidden_states, tuple)
+        self.assertListEqual(
+            [layer_hidden_states.shape for layer_hidden_states in hidden_states],
+            [encoder_expected_shape] * len(hidden_states),
+        )
+
+    @unittest.skip("Pix2Struct has no base model, it was implemented before standardization")
+    def test_model_base_model_prefix(self):
+        pass
+
+
+# We will verify our results on an image of a stop sign
+def prepare_img():
+    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    return im
+
+
+@require_vision
+@require_torch
+@slow
+class Pix2StructIntegrationTest(unittest.TestCase):
+    def test_inference_image_captioning(self):
+        model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base").to(torch_device)
+        processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
+        image = prepare_img()
+
+        # image only
+        inputs = processor(images=image, return_tensors="pt").to(torch_device)
+
+        predictions = model.generate(**inputs)
+
+        self.assertEqual(
+            processor.decode(predictions[0], skip_special_tokens=True), "A stop sign is on a street corner."
+        )
+
+    def test_batched_inference_image_captioning(self):
+        model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base").to(torch_device)
+        processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
+        image_1 = prepare_img()
+
+        second_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg"
+        image_2 = Image.open(requests.get(second_url, stream=True).raw)
+
+        # image only
+        inputs = processor(images=[image_1, image_2], return_tensors="pt").to(torch_device)
+
+        predictions = model.generate(**inputs)
+
+        self.assertEqual(
+            processor.decode(predictions[0], skip_special_tokens=True), "A stop sign is on a street corner."
+        )
+
+        self.assertEqual(
+            processor.decode(predictions[1], skip_special_tokens=True),
+            "A row of books including The Temple Bar and Guiness.",
+        )
+
+    def test_batched_inference_image_captioning_conditioned(self):
+        model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base").to(torch_device)
+        processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
+        image_1 = prepare_img()
+
+        second_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg"
+        image_2 = Image.open(requests.get(second_url, stream=True).raw)
+        texts = ["A picture of", "An photography of"]
+
+        # image only
+        inputs = processor(images=[image_1, image_2], text=texts, return_tensors="pt", add_special_tokens=False).to(
+            torch_device
+        )
+
+        predictions = model.generate(**inputs)
+
+        self.assertEqual(
+            processor.decode(predictions[0], skip_special_tokens=True),
+            "A picture of a stop sign with a red stop sign",
+        )
+
+        self.assertEqual(
+            processor.decode(predictions[1], skip_special_tokens=True),
+            "An photography of the Temple Bar and other places in the city.",
+        )
+
+    def test_vqa_model(self):
+        model_id = "google/pix2struct-ai2d-base"
+
+        image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
+        image = Image.open(requests.get(image_url, stream=True).raw)
+
+        model = Pix2StructForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
+        processor = Pix2StructProcessor.from_pretrained(model_id)
+
+        # image only
+        text = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"
+
+        inputs = processor(images=image, return_tensors="pt", text=text).to(torch_device, torch.bfloat16)
+
+        predictions = model.generate(**inputs)
+        self.assertEqual(processor.decode(predictions[0], skip_special_tokens=True), "ash cloud")
+
+    def test_vqa_model_batched(self):
+        model_id = "google/pix2struct-ai2d-base"
+
+        image_urls = [
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo-2.png",
+        ]
+
+        images = [Image.open(requests.get(image_url, stream=True).raw) for image_url in image_urls]
+
+        texts = [
+            "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
+            "What is the producer in the diagram? (1) Phytoplankton (2) Zooplankton (3) Large fish (4) Small fish",
+        ]
+
+        model = Pix2StructForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
+        processor = Pix2StructProcessor.from_pretrained(model_id)
+
+        inputs = processor(images=images, return_tensors="pt", text=texts).to(torch_device, torch.bfloat16)
+
+        predictions = model.generate(**inputs)
+        self.assertEqual(processor.decode(predictions[0], skip_special_tokens=True), "ash cloud")
+        self.assertEqual(processor.decode(predictions[1], skip_special_tokens=True), "Phytoplankton")
--- a/tests/models/pix2struct/test_processing_pix2struct.py
+++ b/tests/models/pix2struct/test_processing_pix2struct.py
@@ -0,0 +1,206 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from transformers import (
+        Pix2StructProcessor,
+    )
+
+
+@require_vision
+@require_torch
+class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = Pix2StructProcessor
+    text_input_name = "decoder_input_ids"
+    images_input_name = "flattened_patches"
+
+    @classmethod
+    def _setup_tokenizer(cls):
+        tokenizer_class = cls._get_component_class_from_processor("tokenizer")
+        return tokenizer_class.from_pretrained("google-t5/t5-small")
+
+    def test_processor_max_patches(self):
+        processor = self.get_processor()
+
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+
+        max_patches = [512, 1024, 2048, 4096]
+        expected_hidden_size = [770, 770, 770, 770]
+        # with text
+        for i, max_patch in enumerate(max_patches):
+            inputs = processor(text=input_str, images=image_input, max_patches=max_patch)
+            self.assertEqual(inputs["flattened_patches"][0].shape[0], max_patch)
+            self.assertEqual(inputs["flattened_patches"][0].shape[1], expected_hidden_size[i])
+
+        # without text input
+        for i, max_patch in enumerate(max_patches):
+            inputs = processor(images=image_input, max_patches=max_patch)
+            self.assertEqual(inputs["flattened_patches"][0].shape[0], max_patch)
+            self.assertEqual(inputs["flattened_patches"][0].shape[1], expected_hidden_size[i])
+
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values"
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8})
+        print("image_processor", image_processor)
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["flattened_patches"][0][0]), 194)
+
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values"
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", max_patches=4096)
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, max_patches=1024)
+        self.assertEqual(len(inputs["flattened_patches"][0]), 1024)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            max_patches=1024,
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs(batch_size=2)
+        image_input = self.prepare_image_inputs(batch_size=2)
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            max_patches=1024,
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 5)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"max_patches": 1024},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids"
+        if "image_processor" not in self.processor_class.get_attributes():
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = self.prepare_text_inputs()
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"max_patches": 1024},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["flattened_patches"].shape[1], 1024)
+
+        self.assertEqual(len(inputs["decoder_input_ids"][0]), 76)