first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/pipelines/init.py
+++ b/tests/pipelines/init.py
--- a/tests/pipelines/test_pipelines_any_to_any.py
+++ b/tests/pipelines/test_pipelines_any_to_any.py
@@ -0,0 +1,472 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+
+import numpy as np
+
+from transformers import MODEL_FOR_MULTIMODAL_LM_MAPPING, is_vision_available
+from transformers.pipelines import AnyToAnyPipeline, pipeline
+from transformers.testing_utils import (
+    Expectations,
+    is_pipeline_test,
+    require_librosa,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+sys.path.append(".")
+from utils.fetch_hub_objects_for_ci import url_to_local_path
+
+
+if is_vision_available():
+    import PIL
+
+
+@is_pipeline_test
+@require_vision
+@require_librosa
+@require_torch
+class AnyToAnyPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_MULTIMODAL_LM_MAPPING
+
+    # We only need `processor` but the Mixin will pass all possible preprocessing classes for a model.
+    # So we add them all in signature
+    def get_test_pipeline(
+        self, model, tokenizer, processor, image_processor=None, feature_extractor=None, dtype="float32"
+    ):
+        _is_images_supported = hasattr(processor, "image_processor")
+        _is_videos_supported = hasattr(processor, "video_processor")
+        _is_audios_supported = hasattr(processor, "feature_extractor")
+
+        image_token = getattr(processor.tokenizer, "image_token", "")
+        video_token = getattr(processor.tokenizer, "video_token", "")
+        audio_token = getattr(processor.tokenizer, "audio_token", "")
+
+        images_examples = [
+            {
+                "images": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "text": f"{image_token}This is a ",
+            },
+            {
+                "images": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "text": f"{image_token}Here I see a ",
+            },
+        ]
+
+        videos_examples = [
+            {
+                "videos": url_to_local_path(
+                    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
+                ),
+                "text": f"{video_token}This video shows a ",
+            },
+            {
+                "video": url_to_local_path(
+                    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4"
+                ),
+                "text": f"{video_token}In the video I see a ",
+            },
+        ]
+
+        audio_examples = [
+            {
+                "audio": url_to_local_path(
+                    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/glass-breaking-151256.mp3"
+                ),
+                "text": f"{audio_token}This is sound of a ",
+            },
+            {
+                "audio": url_to_local_path(
+                    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav"
+                ),
+                "text": f"{audio_token}Here I hear a ",
+            },
+        ]
+
+        examples = []
+        if _is_images_supported:
+            examples.extend(images_examples)
+        if _is_videos_supported:
+            examples.extend(videos_examples)
+        if _is_audios_supported:
+            examples.extend(audio_examples)
+
+        pipe = AnyToAnyPipeline(model=model, processor=processor, dtype=dtype, max_new_tokens=10)
+
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        # Single
+        outputs = pipe(examples[0])
+        self.assertEqual(
+            outputs,
+            [
+                {"input_text": ANY(str), "generated_text": ANY(str)},
+            ],
+        )
+
+        # Batched but limit to last 2 examples
+        outputs = pipe(examples[:2])
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {"input_text": ANY(str), "generated_text": ANY(str)},
+                ],
+                [
+                    {"input_text": ANY(str), "generated_text": ANY(str)},
+                ],
+            ],
+        )
+
+        # `generation_mode` raises errors when dosn't match with other params
+        with self.assertRaises(ValueError):
+            pipe(examples, generation_mode="video")
+
+        with self.assertRaises(ValueError):
+            pipe(examples, generation_mode="audio", return_full_text=True)
+
+        with self.assertRaises(ValueError):
+            pipe(examples, generation_mode="image", return_type=1)
+
+        # Chat template
+        if getattr(pipe.processor, "chat_template", None) is not None:
+            messages = []
+            for example in examples[:2]:
+                example.pop("text")
+                modality_type, modality_data = list(example.items())[0]
+                message = {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "This is a "},
+                        {"type": modality_type, "path": modality_data},
+                    ],
+                }
+                messages.append([message])
+            outputs = pipe(messages, return_full_text=True, max_new_tokens=10)
+
+            self.assertEqual(
+                outputs,
+                [
+                    [
+                        {"input_text": ANY(str), "generated_text": ANY(str)},
+                    ],
+                    [
+                        {"input_text": ANY(str), "generated_text": ANY(str)},
+                    ],
+                ],
+            )
+
+    @slow
+    def test_small_model_pt_token_text_only(self):
+        # Text-only generation with sampling disabled, first from a raw string prompt and then from a
+        # batch of two chats. Expected outputs are selected through `Expectations`; the keys are
+        # presumably (device type, version) pairs - confirm against `Expectations`.
+        pipe = pipeline("any-to-any", model="google/gemma-3n-E4B-it")
+        text = "What is the capital of France? Assistant:"
+
+        # Raw string prompt: "generated_text" contains the prompt followed by the completion.
+        outputs = pipe(text=text, generate_kwargs={"do_sample": False})
+        EXPECTED_OUTPUT = Expectations(
+            {
+                ("cuda", 8): [
+                    {
+                        "input_text": "What is the capital of France? Assistant:",
+                        "generated_text": "What is the capital of France? Assistant: The capital of France is Paris.",
+                    }
+                ],
+                ("rocm", (9, 4)): [
+                    {
+                        "input_text": "What is the capital of France? Assistant:",
+                        "generated_text": "What is the capital of France? Assistant: The capital of France is Paris.\n",
+                    }
+                ],
+            }
+        ).get_expectation()
+        self.assertEqual(outputs, EXPECTED_OUTPUT)
+
+        # Batch of two single-turn chats: each result echoes the chat and appends an assistant message.
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Write a poem on Hugging Face, the company"},
+                    ],
+                },
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is the capital of France?"},
+                    ],
+                },
+            ],
+        ]
+        outputs = pipe(text=messages, generate_kwargs={"do_sample": False})
+        # The two recorded variants differ only in the newlines kept in the assistant content.
+        EXPECTED_OUTPUT = Expectations(
+            {
+                ("cuda", 8): [
+                    [
+                        {
+                            "input_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                                }
+                            ],
+                            "generated_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                                },
+                                {
+                                    "role": "assistant",
+                                    "content": "A digital embrace, a friendly face,Hugging Face, a vibrant space.Where models bloom and knowledge flows,And AI's potential brightly glows.From transformers deep, a powerful core,To datasets vast, and so much more.A community thrives, a helping hand,Sharing insights across the land.Pipelines built with elegant ease,For NLP tasks, designed to please.Fine-tuning models, a joyful art,To tailor AI to play its part.Spaces open wide, for demos bright,Showcasing wonders, day and night.From text to image, code to sound,Innovation's fertile ground.A platform built on open grace,Democratizing AI's embrace.For researchers, builders, and all who seek,To unlock the future, bold and sleek.So raise a glass to the Face so kind,Hugging Face, expanding the mind.Connecting minds, with code and care,A future of AI, beyond compare.",
+                                },
+                            ],
+                        }
+                    ],
+                    [
+                        {
+                            "input_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "What is the capital of France?"}],
+                                }
+                            ],
+                            "generated_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "What is the capital of France?"}],
+                                },
+                                {"role": "assistant", "content": "The capital of France is **Paris**. "},
+                            ],
+                        }
+                    ],
+                ],
+                ("rocm", (9, 4)): [
+                    [
+                        {
+                            "input_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                                }
+                            ],
+                            "generated_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                                },
+                                {
+                                    "role": "assistant",
+                                    "content": "A digital embrace, a friendly face,\nHugging Face, a vibrant space.\nWhere models bloom and knowledge flows,\nAnd AI's potential brightly glows.\n\nFrom transformers deep, a powerful core,\nTo datasets vast, and so much more.\nA community thrives, a helping hand,\nSharing insights across the land.\n\nPipelines built with elegant ease,\nFor NLP tasks, designed to please.\nFine-tuning models, a joyful art,\nTo tailor AI to play its part.\n\nSpaces open wide, for demos bright,\nShowcasing wonders, day and night.\nFrom text to image, code to sound,\nInnovation's fertile ground.\n\nA platform built on open grace,\nDemocratizing AI's embrace.\nFor researchers, builders, and all who seek,\nTo unlock the future, bold and sleek.\n\nSo raise a glass to the Face so kind,\nHugging Face, expanding the mind.\nConnecting minds, with code and care,\nA future of AI, beyond compare.\n\n\n\n",
+                                },
+                            ],
+                        }
+                    ],
+                    [
+                        {
+                            "input_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "What is the capital of France?"}],
+                                }
+                            ],
+                            "generated_text": [
+                                {
+                                    "role": "user",
+                                    "content": [{"type": "text", "text": "What is the capital of France?"}],
+                                },
+                                {"role": "assistant", "content": "The capital of France is **Paris**. \n"},
+                            ],
+                        }
+                    ],
+                ],
+            }
+        ).get_expectation()
+        self.assertEqual(outputs, EXPECTED_OUTPUT)
+
+    @slow
+    @require_torch
+    def test_small_model_pt_chat_with_response_parsing(self):
+        pipe = pipeline("any-to-any", model="google/gemma-3n-E4B-it")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is the capital of France?"},
+                ],
+            },
+        ]
+        pipe.tokenizer.response_schema = {
+            # A real response schema should probably have things like "role" and "content"
+            # and "reasoning_content" but it's unlikely we'd get a tiny model to reliably
+            # output anything like that, so let's keep it simple.
+            "type": "object",
+            "properties": {
+                "first_word": {"type": "string", "x-regex": r"^\s*([a-zA-Z]+)"},
+                "last_word": {"type": "string", "x-regex": r"([a-zA-Z]+)\s*$"},
+            },
+        }
+        outputs = pipe(text=messages, generate_kwargs={"do_sample": False})
+        parsed_message = outputs[0]["generated_text"][-1]
+        # The parsed message should be a dict with the schema keys, not {"role": "assistant", "content": ...}
+        self.assertIn("first_word", parsed_message)
+        self.assertIn("last_word", parsed_message)
+        self.assertNotIn("role", parsed_message)
+        self.assertIsInstance(parsed_message["first_word"], str)
+        self.assertIsInstance(parsed_message["last_word"], str)
+
+    @slow
+    def test_small_model_pt_token_audio_input(self):
+        # Chat with one audio attachment, generated with sampling disabled and `return_type=1`.
+        pipe = pipeline("any-to-any", model="google/gemma-3n-E4B-it")
+
+        audio_path = url_to_local_path(
+            "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav"
+        )
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What do you hear in this audio?"},
+                    {"type": "audio", "url": audio_path},
+                ],
+            },
+        ]
+        outputs = pipe(text=messages, return_type=1, generate_kwargs={"do_sample": False})  # return new text
+        # NOTE(review): the expected "url" below is the remote URL while the message was built from
+        # `url_to_local_path(...)`; presumably this only matches when that helper returns the URL
+        # unchanged - verify.
+        # NOTE(review): the expected key is "generated_None" (not "generated_text") and its value still
+        # contains the prompt turn - confirm this is what the pipeline emits for `return_type=1`.
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What do you hear in this audio?"},
+                                {
+                                    "type": "audio",
+                                    "url": "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav",
+                                },
+                            ],
+                        }
+                    ],
+                    "generated_None": "user\nWhat do you hear in this audio?\n\n\n\n\nmodel\nThe audio contains the repeated sound of someone **coughing**. It's a fairly consistent, forceful cough throughout the duration.",
+                }
+            ],
+        )
+
+    @slow
+    def test_small_model_pt_token_audio_gen(self):
+        pipe = pipeline("any-to-any", model="Qwen/Qwen2.5-Omni-3B", dtype="bfloat16")
+
+        video_path = url_to_local_path(
+            "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Cooking_cake.mp4"
+        )
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this video."},
+                    {"type": "video", "video": video_path},
+                ],
+            },
+        ]
+        outputs = pipe(
+            text=messages,
+            num_frames=16,
+            max_new_tokens=50,
+            load_audio_from_video=True,
+            generate_kwargs={"use_audio_in_video": True, "talker_do_sample": False, "do_sample": False},
+        )
+        EXPECTED_CONTENT = Expectations(
+            {
+                (
+                    "cuda",
+                    8,
+                ): "system\nYou are a helpful assistant.\nuser\nDescribe this video.\nassistant\nThe video begins with a man standing in a kitchen, wearing a black shirt. He is holding a large glass bowl filled with flour and a spoon. The man starts to mix the flour in the bowl, creating a dough. As he mixes, he continues to talk to the camera, explaining the process. The kitchen has wooden cabinets and a white refrigerator in the background. The man's movements are deliberate and focused as he works with the dough. The video ends with the man still mixing the dough in the bowl. Overall, the video provides a clear and detailed demonstration of how to make dough using flour and a spoon.",
+                (
+                    "rocm",
+                    (9, 4),
+                ): "The video begins with a man standing in a kitchen, wearing a black shirt. He is positioned in front of a refrigerator and wooden cabinets. The man is speaking and gesturing with his hands, possibly explaining something or giving instructions. The kitchen appears to be well-lit and has a clean, organized appearance.\n\nAs the video progresses, the man continues to speak and gesture, maintaining his position in the kitchen. The camera remains focused on him, capturing his upper body and face. The background remains consistent, showing the refrigerator and wooden cabinets. The lighting in the kitchen stays bright, and the overall atmosphere remains calm and focused.\n\nThroughout the video, the man's movements and expressions suggest that he is engaged in a conversation or presentation. His gestures and facial expressions indicate that he is actively communicating and possibly demonstrating something related to cooking or food preparation. The kitchen setting provides a practical and relatable backdrop for his actions and words.\n\nOverall, the video depicts a man in a kitchen, speaking and gesturing while possibly explaining or demonstrating something related to cooking or food preparation. The kitchen setting, with its well-lit and organized appearance, serves as a suitable environment for his actions and words.",
+            }
+        ).get_expectation()
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "Describe this video."},
+                                {
+                                    "type": "video",
+                                    "video": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Cooking_cake.mp4",
+                                },
+                            ],
+                        }
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "Describe this video."},
+                                {
+                                    "type": "video",
+                                    "video": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Cooking_cake.mp4",
+                                },
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": EXPECTED_CONTENT,
+                        },
+                    ],
+                }
+            ],
+        )
+
+        outputs = pipe(text=messages, generation_mode="audio", num_frames=16, max_new_tokens=20)
+
+        self.assertEqual(len(outputs), len(messages))
+        self.assertIsInstance(outputs[0], dict)
+        for out in outputs:
+            self.assertTrue("input_text" in out)
+            self.assertTrue("generated_audio" in out)
+            self.assertIsInstance(out["generated_audio"], np.ndarray)
+
+    @slow
+    def test_small_model_pt_image_gen(self):
+        pipe = pipeline("any-to-any", model="deepseek-community/Janus-Pro-1B")
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "A dog running under the rain."},
+                ],
+            },
+        ]
+        outputs = pipe(text=messages, generation_mode="image")
+
+        self.assertEqual(len(outputs), len(messages))
+        self.assertIsInstance(outputs[0], dict)
+        for out in outputs:
+            self.assertTrue("input_text" in out)
+            self.assertTrue("generated_image" in out)
+            self.assertIsInstance(out["generated_image"], PIL.Image.Image)
--- a/tests/pipelines/test_pipelines_audio_classification.py
+++ b/tests/pipelines/test_pipelines_audio_classification.py
@@ -0,0 +1,248 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import datasets
+import numpy as np
+from huggingface_hub import AudioClassificationOutputElement
+
+from transformers import (
+    MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
+    is_torch_available,
+)
+from transformers.pipelines import AudioClassificationPipeline, pipeline
+from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
+    is_pipeline_test,
+    nested_simplify,
+    require_torch,
+    require_torchaudio,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_torch_available():
+    import torch
+
+
+@is_pipeline_test
+class AudioClassificationPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
+    _dataset = None
+
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+            )
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        audio_classifier = AudioClassificationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+
+        # test with a raw waveform
+        audio = np.zeros((34000,))
+        audio2 = np.zeros((14000,))
+        return audio_classifier, [audio2, audio]
+
+    def run_pipeline_test(self, audio_classifier, examples):
+        audio2, audio = examples
+        output = audio_classifier(audio)
+        # by default a model is initialized with num_labels=2
+        self.assertEqual(
+            output,
+            [
+                {"score": ANY(float), "label": ANY(str)},
+                {"score": ANY(float), "label": ANY(str)},
+            ],
+        )
+        output = audio_classifier(audio, top_k=1)
+        self.assertEqual(
+            output,
+            [
+                {"score": ANY(float), "label": ANY(str)},
+            ],
+        )
+
+        self.run_torchaudio(audio_classifier)
+
+        for single_output in output:
+            compare_pipeline_output_to_hub_spec(single_output, AudioClassificationOutputElement)
+
+    @require_torchaudio
+    def run_torchaudio(self, audio_classifier):
+        self._load_dataset()
+        # test with a local file
+        audio = self._dataset[0]["audio"]["array"]
+        output = audio_classifier(audio)
+        self.assertEqual(
+            output,
+            [
+                {"score": ANY(float), "label": ANY(str)},
+                {"score": ANY(float), "label": ANY(str)},
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt(self):
+        model = "anton-l/wav2vec2-random-tiny-classifier"
+
+        audio_classifier = pipeline("audio-classification", model=model)
+
+        audio = np.ones((8000,))
+        output = audio_classifier(audio, top_k=4)
+
+        EXPECTED_OUTPUT = [
+            {"score": 0.0842, "label": "no"},
+            {"score": 0.0838, "label": "up"},
+            {"score": 0.0837, "label": "go"},
+            {"score": 0.0834, "label": "right"},
+        ]
+        EXPECTED_OUTPUT_PT_2 = [
+            {"score": 0.0845, "label": "stop"},
+            {"score": 0.0844, "label": "on"},
+            {"score": 0.0841, "label": "right"},
+            {"score": 0.0834, "label": "left"},
+        ]
+        self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2])
+
+        audio_dict = {"array": np.ones((8000,)), "sampling_rate": audio_classifier.feature_extractor.sampling_rate}
+        output = audio_classifier(audio_dict, top_k=4)
+        self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2])
+
+    @require_torch
+    def test_small_model_pt_fp16(self):
+        model = "anton-l/wav2vec2-random-tiny-classifier"
+
+        audio_classifier = pipeline("audio-classification", model=model, dtype=torch.float16)
+
+        audio = np.ones((8000,))
+        output = audio_classifier(audio, top_k=4)
+
+        # Expected outputs are collected running the test on torch 2.6 in few scenarios.
+        # Running on CUDA T4/A100 and on XPU PVC:
+        EXPECTED_OUTPUT = [
+            {"score": 0.0833, "label": "go"},
+            {"score": 0.0833, "label": "off"},
+            {"score": 0.0833, "label": "stop"},
+            {"score": 0.0833, "label": "on"},
+        ]
+        # Running on CPU:
+        EXPECTED_OUTPUT_PT_2 = [
+            {"score": 0.0839, "label": "no"},
+            {"score": 0.0837, "label": "go"},
+            {"score": 0.0836, "label": "yes"},
+            {"score": 0.0835, "label": "right"},
+        ]
+        self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2])
+
+        audio_dict = {"array": np.ones((8000,)), "sampling_rate": audio_classifier.feature_extractor.sampling_rate}
+        output = audio_classifier(audio_dict, top_k=4)
+        self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2])
+
+    @require_torch
+    @slow
+    def test_large_model_pt(self):
+        model = "superb/wav2vec2-base-superb-ks"
+
+        audio_classifier = pipeline("audio-classification", model=model)
+        dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test")
+
+        audio = np.array(dataset[3]["speech"], dtype=np.float32)
+        output = audio_classifier(audio, top_k=4)
+        self.assertEqual(
+            nested_simplify(output, decimals=3),
+            [
+                {"score": 0.981, "label": "go"},
+                {"score": 0.007, "label": "up"},
+                {"score": 0.006, "label": "_unknown_"},
+                {"score": 0.001, "label": "down"},
+            ],
+        )
+
+    @require_torch
+    @slow
+    def test_top_k_none_returns_all_labels(self):
+        model_name = "superb/wav2vec2-base-superb-ks"  # model with more than 5 labels
+        classification_pipeline = pipeline(
+            "audio-classification",
+            model=model_name,
+            top_k=None,
+        )
+
+        # Create dummy input
+        sampling_rate = 16000
+        signal = np.zeros((sampling_rate,), dtype=np.float32)
+
+        result = classification_pipeline(signal)
+        num_labels = classification_pipeline.model.config.num_labels
+
+        self.assertEqual(len(result), num_labels, "Should return all labels when top_k is None")
+
+    @require_torch
+    @slow
+    def test_top_k_none_with_few_labels(self):
+        model_name = "superb/hubert-base-superb-er"  # model with fewer labels
+        classification_pipeline = pipeline(
+            "audio-classification",
+            model=model_name,
+            top_k=None,
+        )
+
+        # Create dummy input
+        sampling_rate = 16000
+        signal = np.zeros((sampling_rate,), dtype=np.float32)
+
+        result = classification_pipeline(signal)
+        num_labels = classification_pipeline.model.config.num_labels
+
+        self.assertEqual(len(result), num_labels, "Should handle models with fewer labels correctly")
+
+    @require_torch
+    @slow
+    def test_top_k_greater_than_labels(self):
+        model_name = "superb/hubert-base-superb-er"
+        classification_pipeline = pipeline(
+            "audio-classification",
+            model=model_name,
+            top_k=100,  # intentionally large number
+        )
+
+        # Create dummy input
+        sampling_rate = 16000
+        signal = np.zeros((sampling_rate,), dtype=np.float32)
+
+        result = classification_pipeline(signal)
+        num_labels = classification_pipeline.model.config.num_labels
+
+        self.assertEqual(len(result), num_labels, "Should cap top_k to number of labels")
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -0,0 +1,982 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import logging
+import os
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+import datasets
+from huggingface_hub import delete_repo, snapshot_download
+from huggingface_hub.errors import HfHubHTTPError
+
+from transformers import (
+    AutomaticSpeechRecognitionPipeline,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DistilBertForSequenceClassification,
+    MaskGenerationPipeline,
+    TextClassificationPipeline,
+    TextGenerationPipeline,
+    pipeline,
+)
+from transformers.pipelines import PIPELINE_REGISTRY, get_task
+from transformers.pipelines.base import Pipeline, _pad
+from transformers.testing_utils import (
+    TOKEN,
+    USER,
+    CaptureLogger,
+    RequestCounter,
+    backend_empty_cache,
+    is_pipeline_test,
+    is_staging_test,
+    nested_simplify,
+    require_peft,
+    require_torch,
+    require_torch_accelerator,
+    require_torch_multi_accelerator,
+    slow,
+    torch_device,
+)
+from transformers.utils import direct_transformers_import, is_torch_available
+from transformers.utils import logging as transformers_logging
+
+
+# Put the `utils` directory (three levels up from this file) on `sys.path`; presumably this is where
+# `test_module` imported below lives -- verify against the repository layout.
+sys.path.append(str(Path(__file__).parent.parent.parent / "utils"))
+
+from test_module.custom_pipeline import PairClassificationPipeline  # noqa E402
+
+
+logger = logging.getLogger(__name__)
+
+
+# `src/transformers`, resolved relative to the directory three levels up from this file.
+PATH_TO_TRANSFORMERS = os.path.join(Path(__file__).parent.parent.parent, "src/transformers")
+
+
+# Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
+transformers_module = direct_transformers_import(PATH_TO_TRANSFORMERS)
+
+
+class ANY:
+    """Equality wildcard: compares equal to any object that is an instance of one of the given types."""
+
+    def __init__(self, *_types):
+        # Kept as the tuple received so it can be handed straight to `isinstance`.
+        self._types = _types
+
+    def __eq__(self, other):
+        matches = isinstance(other, self._types)
+        return matches
+
+    def __repr__(self):
+        type_names = [accepted.__name__ for accepted in self._types]
+        return "ANY(" + ", ".join(type_names) + ")"
+
+
+@is_pipeline_test
+class CommonPipelineTest(unittest.TestCase):
+    @require_torch
+    def test_pipeline_iteration(self):
+        """Calling a pipeline on a torch `Dataset` yields one prediction dict per dataset item."""
+        from torch.utils.data import Dataset
+
+        class MyDataset(Dataset):
+            data = [
+                "This is a test",
+                "This restaurant is great",
+                "This restaurant is awful",
+            ]
+
+            def __len__(self):
+                # Derived from the data so the reported length cannot drift from the actual content.
+                return len(self.data)
+
+            def __getitem__(self, i):
+                return self.data[i]
+
+        text_classifier = pipeline(task="text-classification", model="hf-internal-testing/tiny-random-distilbert")
+        dataset = MyDataset()
+
+        num_outputs = 0
+        for output in text_classifier(dataset):
+            self.assertEqual(output, {"label": ANY(str), "score": ANY(float)})
+            num_outputs += 1
+
+        # Without this check the loop above would pass vacuously if nothing was yielded.
+        self.assertEqual(num_outputs, len(dataset))
+
+    @require_torch
+    def test_check_task_auto_inference(self):
+        """Building a pipeline from a model id alone (no task) gives a `TextClassificationPipeline` here."""
+        inferred = pipeline(model="hf-internal-testing/tiny-random-distilbert")
+        self.assertIsInstance(inferred, TextClassificationPipeline)
+
+    @require_torch
+    def test_pipeline_batch_size_global(self):
+        """Constructor `batch_size` / `num_workers` are stored on the pipeline and are `None` when omitted."""
+        pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert")
+        # `assertIsNone` checks identity and reports a clearer failure than comparing against `None`.
+        self.assertIsNone(pipe._batch_size)
+        self.assertIsNone(pipe._num_workers)
+
+        pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert", batch_size=2, num_workers=1)
+        self.assertEqual(pipe._batch_size, 2)
+        self.assertEqual(pipe._num_workers, 1)
+
+    @require_torch
+    def test_pipeline_pathlike(self):
+        """A pipeline saved to disk can be reloaded when `model` is given as a `pathlib.Path`."""
+        original = pipeline(model="hf-internal-testing/tiny-random-distilbert")
+        with tempfile.TemporaryDirectory() as save_dir:
+            original.save_pretrained(save_dir)
+            # Reload while the temporary directory still exists.
+            reloaded = pipeline(task="text-classification", model=Path(save_dir))
+        self.assertIsInstance(reloaded, TextClassificationPipeline)
+
+    @require_torch
+    def test_pipeline_override(self):
+        """`pipeline_class` makes the factory instantiate the supplied subclass."""
+
+        class MyPipeline(TextClassificationPipeline):
+            pass
+
+        overridden = pipeline(model="hf-internal-testing/tiny-random-distilbert", pipeline_class=MyPipeline)
+        self.assertIsInstance(overridden, MyPipeline)
+
+    @require_torch
+    def test_pipeline_tokenizer_tuple_respects_use_fast_override(self):
+        """A `(name, kwargs)` tokenizer tuple with `use_fast=False` results in a `BertTokenizer`."""
+        checkpoint = "hf-internal-testing/tiny-random-bert"
+        tokenizer_spec = (checkpoint, {"use_fast": False})
+
+        text_classifier = pipeline(task="text-classification", model=checkpoint, tokenizer=tokenizer_spec)
+
+        self.assertIsInstance(text_classifier, TextClassificationPipeline)
+        tokenizer_class_name = type(text_classifier.tokenizer).__name__
+        self.assertEqual(tokenizer_class_name, "BertTokenizer")
+
+    def test_check_task(self):
+        """`get_task` returns the task of a known checkpoint and raises `RuntimeError` for the second repo."""
+        self.assertEqual(get_task("openai-community/gpt2"), "text-generation")
+
+        unsupported_repo = "espnet/siddhana_slurp_entity_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best"
+        with self.assertRaises(RuntimeError):
+            # Wrong framework
+            get_task(unsupported_repo)
+
+    @require_torch
+    def test_iterator_data(self):
+        """A generator input is consumed item by item, with and without a `num_workers` request."""
+
+        def data(n: int):
+            remaining = n
+            while remaining > 0:
+                yield "This is a test"
+                remaining -= 1
+
+        pipe = pipeline(model="hf-internal-testing/tiny-random-distilbert")
+        expected = {"label": "LABEL_0", "score": 0.504}
+
+        # Second round: when using multiple workers on streamable data it should still work.
+        # This will force using `num_workers=1` with a warning for now.
+        for call_kwargs in ({}, {"num_workers": 2}):
+            results = []
+            for out in pipe(data(10), **call_kwargs):
+                self.assertEqual(nested_simplify(out), expected)
+                results.append(out)
+            self.assertEqual(len(results), 10)
+
+    @require_torch
+    def test_unbatch_attentions_hidden_states(self):
+        """Batched inference works when the model also returns hidden states and attentions."""
+        checkpoint = "hf-internal-testing/tiny-random-distilbert"
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        model = DistilBertForSequenceClassification.from_pretrained(
+            checkpoint, output_hidden_states=True, output_attentions=True
+        )
+        text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
+
+        # Used to throw an error because `hidden_states` are a tuple of tensors
+        # instead of the expected tensor.
+        sentences = ["This is great !"] * 20
+        outputs = text_classifier(sentences, batch_size=32)
+        self.assertEqual(len(outputs), 20)
+
+    @require_torch
+    def test_dtype_property(self):
+        """`Pipeline.dtype` follows the underlying model's dtype and is `None` when there is no model."""
+        import torch
+
+        checkpoint = "hf-internal-testing/tiny-random-distilbert"
+
+        # If dtype is specified in the pipeline constructor, the property should return that type
+        half_pipe = pipeline(model=checkpoint, dtype=torch.float16)
+        self.assertEqual(half_pipe.dtype, torch.float16)
+
+        # If the underlying model changes dtype, the property should return the new type
+        half_pipe.model.to(torch.bfloat16)
+        self.assertEqual(half_pipe.dtype, torch.bfloat16)
+
+        # If dtype is NOT specified in the pipeline constructor, the property should just return
+        # the dtype of the underlying model (default)
+        default_pipe = pipeline(model=checkpoint)
+        self.assertEqual(default_pipe.dtype, torch.float32)
+
+        # If underlying model doesn't have dtype property, simply return None
+        default_pipe.model = None
+        self.assertIsNone(default_pipe.dtype)
+
+    @require_torch
+    def test_auto_model_pipeline_registration_from_local_dir(self):
+        """A custom-architecture checkpoint downloaded to a local directory loads with `trust_remote_code=True`."""
+        repo_id = "hf-internal-testing/tiny-random-custom-architecture"
+        with tempfile.TemporaryDirectory() as local_dir:
+            snapshot_download(repo_id, local_dir=local_dir)
+            local_pipe = pipeline("text-generation", local_dir, trust_remote_code=True)
+
+            # Assert successful load
+            self.assertIsInstance(local_pipe, TextGenerationPipeline)
+
+    @require_peft
+    @require_torch
+    def test_pipeline_from_local_with_embedded_adapter(self):
+        """
+        Regression test for issue #43746: only overwrite the pretrained_model_name_or_path with the adapter's
+        base model when that is actually needed.
+
+        A local directory holding both a base model (it has a config.json file) and an adapter is loaded through
+        `pipeline`. The adapter config's `base_model_name_or_path` is deliberately pointed at a repository that
+        does not exist, so the load only succeeds if the local path is NOT replaced by that value. The fix is
+        applied in src/transformers/pipelines/__init__.py in the pipeline function.
+        """
+        import json
+
+        from transformers import AutoConfig, AutoModel
+        from transformers.utils import ADAPTER_CONFIG_NAME
+
+        peft_test_model = "peft-internal-testing/tiny-OPTForCausalLM-lora"
+        transformers_test_model = "hf-internal-testing/tiny-random-OPTForCausalLM"
+
+        # Create a temporary directory with a complete adapter model structure
+        with tempfile.TemporaryDirectory() as raw_tmp_dir:
+            tmp_dir = Path(raw_tmp_dir)
+
+            # Save the model and adapter locally
+            config = AutoConfig.from_pretrained(transformers_test_model)
+            model = AutoModel.from_pretrained(transformers_test_model)
+            adapter_model = AutoModel.from_pretrained(peft_test_model)
+            for artefact in (config, model, adapter_model):
+                artefact.save_pretrained(tmp_dir)
+
+            # Overwrite the base_model_name_or_path to an invalid value that
+            # would cause the pipeline load to fail later
+            adapter_config_path = tmp_dir / ADAPTER_CONFIG_NAME
+            adapter_config = json.loads(adapter_config_path.read_text())
+            adapter_config["base_model_name_or_path"] = "some/model/that/does/not/exist"
+            adapter_config_path.write_text(json.dumps(adapter_config))
+
+            # Load from the saved path and make sure it actually loads despite
+            # the invalid adapter config path
+            pipeline("text-generation", tmp_dir)
+
+
+@is_pipeline_test
+@require_torch
+class PipelineScikitCompatTest(unittest.TestCase):
+    """Checks the `predict` and `transform` entry points of a text-classification pipeline."""
+
+    def test_pipeline_predict(self):
+        classifier = pipeline(task="text-classification", model="hf-internal-testing/tiny-random-distilbert")
+
+        predictions = classifier.predict(["This is a test"])
+
+        # The expected value goes first so that `ANY.__eq__` drives the comparison.
+        self.assertEqual([{"label": ANY(str), "score": ANY(float)}], predictions)
+
+    def test_pipeline_transform(self):
+        classifier = pipeline(task="text-classification", model="hf-internal-testing/tiny-random-distilbert")
+
+        transformed = classifier.transform(["This is a test"])
+
+        # The expected value goes first so that `ANY.__eq__` drives the comparison.
+        self.assertEqual([{"label": ANY(str), "score": ANY(float)}], transformed)
+
+
+@is_pipeline_test
+class PipelinePadTest(unittest.TestCase):
+    """Unit tests for the `_pad` helper imported from `transformers.pipelines.base`."""
+
+    @require_torch
+    def test_pipeline_padding(self):
+        import torch
+
+        short_item = {
+            "label": "label1",
+            "input_ids": torch.LongTensor([[1, 23, 24, 2]]),
+            "attention_mask": torch.LongTensor([[0, 1, 1, 0]]),
+        }
+        long_item = {
+            "label": "label2",
+            "input_ids": torch.LongTensor([[1, 23, 24, 43, 44, 2]]),
+            "attention_mask": torch.LongTensor([[0, 1, 1, 1, 1, 0]]),
+        }
+        items = [short_item, long_item]
+
+        # Non-tensor values come back as a plain list.
+        self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
+
+        # The shorter row is filled with the padding value on the requested side.
+        padded_right = _pad(items, "input_ids", 10, "right")
+        expected_right = torch.LongTensor([[1, 23, 24, 2, 10, 10], [1, 23, 24, 43, 44, 2]])
+        self.assertTrue(torch.allclose(padded_right, expected_right))
+
+        padded_left = _pad(items, "input_ids", 10, "left")
+        expected_left = torch.LongTensor([[10, 10, 1, 23, 24, 2], [1, 23, 24, 43, 44, 2]])
+        self.assertTrue(torch.allclose(padded_left, expected_left))
+
+        padded_mask = _pad(items, "attention_mask", 0, "right")
+        expected_mask = torch.LongTensor([[0, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 0]])
+        self.assertTrue(torch.allclose(padded_mask, expected_mask))
+
+    @require_torch
+    def test_pipeline_image_padding(self):
+        import torch
+
+        # Two items whose `pixel_values` already share the same shape.
+        items = [
+            {"label": name, "pixel_values": torch.zeros((1, 3, 10, 10))} for name in ("label1", "label2")
+        ]
+
+        self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
+
+        batched_pixels = _pad(items, "pixel_values", 10, "right")
+        self.assertTrue(torch.allclose(batched_pixels, torch.zeros((2, 3, 10, 10))))
+
+    @require_torch
+    def test_pipeline_offset_mapping(self):
+        import torch
+
+        # Lengths 11 and 4 along the second dimension; the expected batch is (2, 11, 2).
+        items = [
+            {"offset_mappings": torch.zeros([1, length, 2], dtype=torch.long)} for length in (11, 4)
+        ]
+
+        batched_offsets = _pad(items, "offset_mappings", 0, "right")
+        expected_offsets = torch.zeros((2, 11, 2), dtype=torch.long)
+        self.assertTrue(torch.allclose(batched_offsets, expected_offsets))
+
+
+@is_pipeline_test
+class PipelineUtilsTest(unittest.TestCase):
+    @require_torch
+    def test_pipeline_dataset(self):
+        """`PipelineDataset` applies the function, with its params, to each indexed item."""
+        from transformers.pipelines.pt_utils import PipelineDataset
+
+        def shift(number, extra=0):
+            return number + extra
+
+        source = [0, 1, 2, 3]
+        dataset = PipelineDataset(source, shift, {"extra": 2})
+        self.assertEqual(len(dataset), 4)
+
+        outputs = []
+        for index in range(4):
+            outputs.append(dataset[index])
+        self.assertEqual(outputs, [2, 3, 4, 5])
+
+    @require_torch
+    def test_pipeline_iterator(self):
+        """`PipelineIterator` exposes the loader's length and yields the transformed items in order."""
+        from transformers.pipelines.pt_utils import PipelineIterator
+
+        def shift(number, extra=0):
+            return number + extra
+
+        source = [0, 1, 2, 3]
+        iterator = PipelineIterator(source, shift, {"extra": 2})
+        self.assertEqual(len(iterator), 4)
+
+        self.assertEqual(list(iterator), [2, 3, 4, 5])
+
+    @require_torch
+    def test_pipeline_iterator_no_len(self):
+        """With a generator as loader, `len()` raises `TypeError` but iteration still works."""
+        from transformers.pipelines.pt_utils import PipelineIterator
+
+        def shift(number, extra=0):
+            return number + extra
+
+        def numbers():
+            for value in range(4):
+                yield value
+
+        iterator = PipelineIterator(numbers(), shift, {"extra": 2})
+        with self.assertRaises(TypeError):
+            len(iterator)
+
+        self.assertEqual(list(iterator), [2, 3, 4, 5])
+
+    @require_torch
+    def test_pipeline_batch_unbatch_iterator(self):
+        """With `loader_batch_size` set, batched list outputs are yielded back one element at a time."""
+        from transformers.pipelines.pt_utils import PipelineIterator
+
+        def shift_ids(number, extra=0):
+            shifted = []
+            for value in number["id"]:
+                shifted.append(value + extra)
+            return {"id": shifted}
+
+        batches = [{"id": [0, 1, 2]}, {"id": [3]}]
+        iterator = PipelineIterator(batches, shift_ids, {"extra": 2}, loader_batch_size=3)
+
+        expected = [{"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}]
+        self.assertEqual(list(iterator), expected)
+
+    @require_torch
+    def test_pipeline_batch_unbatch_iterator_tensors(self):
+        """Same as the list variant, but the batched values are tensors and each row is yielded on its own."""
+        import torch
+
+        from transformers.pipelines.pt_utils import PipelineIterator
+
+        def shift_ids(number, extra=0):
+            return {"id": number["id"] + extra}
+
+        first_batch = {"id": torch.LongTensor([[10, 20], [0, 1], [0, 2]])}
+        second_batch = {"id": torch.LongTensor([[3]])}
+        iterator = PipelineIterator([first_batch, second_batch], shift_ids, {"extra": 2}, loader_batch_size=3)
+
+        outputs = list(iterator)
+        expected = [{"id": [[12, 22]]}, {"id": [[2, 3]]}, {"id": [[2, 4]]}, {"id": [[5]]}]
+        self.assertEqual(nested_simplify(outputs), expected)
+
+    @require_torch
+    def test_pipeline_chunk_iterator(self):
+        """`PipelineChunkIterator` flattens the sub-iterators produced for each input item."""
+        from transformers.pipelines.pt_utils import PipelineChunkIterator
+
+        def preprocess_chunk(n: int):
+            for value in range(n):
+                yield value
+
+        chunk_sizes = [2, 3]
+        iterator = PipelineChunkIterator(chunk_sizes, preprocess_chunk, {}, loader_batch_size=3)
+
+        # range(2) followed by range(3)
+        self.assertEqual(list(iterator), [0, 1, 0, 1, 2])
+
+    @require_torch
+    def test_pipeline_pack_iterator(self):
+        """`PipelinePackIterator` groups consecutive items up to each `is_last` marker and drops that key."""
+        from transformers.pipelines.pt_utils import PipelinePackIterator
+
+        def pack(item):
+            return {"id": item["id"] + 1, "is_last": item["is_last"]}
+
+        # Two groups: ids (0, 1) and ids (0, 1, 2); the final item of each group carries `is_last=True`.
+        items = [
+            {"id": 0, "is_last": False},
+            {"id": 1, "is_last": True},
+            {"id": 0, "is_last": False},
+            {"id": 1, "is_last": False},
+            {"id": 2, "is_last": True},
+        ]
+        iterator = PipelinePackIterator(items, pack, {})
+
+        first_group = [{"id": 1}, {"id": 2}]
+        second_group = [{"id": 1}, {"id": 2}, {"id": 3}]
+        self.assertEqual(list(iterator), [first_group, second_group])
+
+    @require_torch
+    def test_pipeline_pack_unbatch_iterator(self):
+        """Packing also works on batched input, including a group that spans two loader batches."""
+        from transformers.pipelines.pt_utils import PipelinePackIterator
+
+        def add(number, extra=0):
+            shifted = [value + extra for value in number["id"]]
+            return {"id": shifted, "is_last": number["is_last"]}
+
+        # A group ends inside the first batch.
+        batches = [{"id": [0, 1, 2], "is_last": [False, True, False]}, {"id": [3], "is_last": [True]}]
+        iterator = PipelinePackIterator(batches, add, {"extra": 2}, loader_batch_size=3)
+        self.assertEqual(list(iterator), [[{"id": 2}, {"id": 3}], [{"id": 4}, {"id": 5}]])
+
+        # `is_last` is False for the whole first batch, so the single group spans both batches.
+        batches = [{"id": [0, 1, 2], "is_last": [False, False, False]}, {"id": [3], "is_last": [True]}]
+        iterator = PipelinePackIterator(batches, add, {"extra": 2}, loader_batch_size=3)
+        self.assertEqual(list(iterator), [[{"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}]])
+
+    @require_torch
+    def test_pipeline_negative_device(self):
+        """`device=-1` is still accepted by `pipeline` and the resulting pipeline can generate text."""
+        # To avoid regressing, pipeline used to accept device=-1
+        # The test loads and runs a model, so it is guarded like the other tests of this class to be skipped
+        # (rather than error) when torch is unavailable.
+        generator = pipeline("text-generation", "hf-internal-testing/tiny-random-bert", device=-1)
+
+        expected_output = [{"generated_text": ANY(str)}]
+        actual_output = generator("Test input.")
+        self.assertEqual(expected_output, actual_output)
+
+    @require_torch_accelerator
+    def test_pipeline_no_device(self):
+        """When no `device` is passed, the pipeline's model stays on the device the model was given."""
+        # Test when no device is passed to pipeline
+        import torch
+
+        from transformers import AutoModelForCausalLM
+
+        checkpoint = "hf-internal-testing/tiny-random-bert"
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+        # Case 1: Model is manually moved to device
+        moved_model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.float16).to(torch_device)
+        expected_device = moved_model.device
+        pipe = pipeline("text-generation", model=moved_model, tokenizer=tokenizer)
+        self.assertEqual(pipe.model.device, expected_device)
+
+        # Case 2: Model is loaded by accelerate
+        mapped_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map=torch_device, dtype=torch.float16)
+        expected_device = mapped_model.device
+        pipe = pipeline("text-generation", model=mapped_model, tokenizer=tokenizer)
+        self.assertEqual(pipe.model.device, expected_device)
+
+        # Case 3: device_map is passed to model and device is passed to pipeline
+        conflicting_model = AutoModelForCausalLM.from_pretrained(
+            checkpoint, device_map=torch_device, dtype=torch.float16
+        )
+        with self.assertRaises(ValueError):
+            pipe = pipeline("text-generation", model=conflicting_model, device="cpu", tokenizer=tokenizer)
+
+    @require_torch_multi_accelerator
+    def test_pipeline_device_not_equal_model_device(self):
+        """A model sitting on accelerator 1 ends up on accelerator 0 when the pipeline is asked for it."""
+        # Test when device ids are different, pipeline should move the model to the passed device id
+        import torch
+
+        from transformers import AutoModelForCausalLM
+
+        checkpoint = "hf-internal-testing/tiny-random-bert"
+        model_device = f"{torch_device}:1"
+        target_device = f"{torch_device}:0"
+
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.float16).to(model_device)
+        self.assertNotEqual(model_device, target_device)
+
+        pipe = pipeline("text-generation", model=model, device=target_device, tokenizer=tokenizer)
+        self.assertEqual(pipe.model.device, torch.device(target_device))
+
+    @slow
+    @require_torch
+    def test_load_default_pipelines_pt(self):
+        """For every supported task (except table QA), `pipeline(task)` matches the task's default checkpoint."""
+        import torch
+
+        from transformers.pipelines import SUPPORTED_TASKS
+
+        def set_seed_fn():
+            return torch.manual_seed(0)
+
+        for task in SUPPORTED_TASKS:
+            if task == "table-question-answering":
+                # test table in separate test due to more dependencies
+                continue
+
+            # `check_default_pipeline` may call `self.skipTest`, and any assertion in it may fail. Outside a
+            # sub-test either one ends the whole loop, so later tasks are never checked and the clean-up below is
+            # skipped. Inside `subTest` the outcome is recorded for this task only when the runner supports
+            # sub-tests; otherwise `subTest` is a no-op and the previous behaviour is kept.
+            with self.subTest(task=task):
+                self.check_default_pipeline(task, set_seed_fn, self.check_models_equal_pt)
+
+            # clean-up as much as possible GPU memory occupied by PyTorch
+            gc.collect()
+            backend_empty_cache(torch_device)
+
+    @slow
+    @require_torch
+    def test_load_default_pipelines_pt_table_qa(self):
+        """Default-checkpoint check for "table-question-answering", kept apart from the other tasks."""
+        import torch
+
+        def set_seed_fn():
+            return torch.manual_seed(0)
+
+        self.check_default_pipeline("table-question-answering", set_seed_fn, self.check_models_equal_pt)
+
+        # clean-up as much as possible GPU memory occupied by PyTorch
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    @slow
+    @require_torch
+    @require_torch_accelerator
+    def test_pipeline_accelerator(self):
+        """The default text-generation pipeline runs when created with `device=torch_device`."""
+        generator = pipeline("text-generation", device=torch_device)
+        # Only checks that the call completes; the output is not inspected.
+        generator("Hello")
+
+    @slow
+    @require_torch
+    @require_torch_accelerator
+    def test_pipeline_accelerator_indexed(self):
+        """The default text-generation pipeline runs on an accelerator addressed with an explicit index."""
+        # Previously this body was identical to `test_pipeline_accelerator` and never used an indexed device.
+        # The index is appended the same way `test_pipeline_device_not_equal_model_device` builds its devices;
+        # `torch_device` is left untouched if it already carries an index.
+        indexed_device = torch_device if ":" in torch_device else f"{torch_device}:0"
+        pipe = pipeline("text-generation", device=indexed_device)
+        _ = pipe("Hello")
+
+    @slow
+    @require_torch
+    def test_bc_torch_device(self):
+        """For every supported task, `torch_dtype` and `dtype` produce identical parameter names and dtypes."""
+        import torch
+
+        from transformers.pipelines import get_supported_tasks
+
+        def assert_same_parameter_dtypes(first_pipe, second_pipe):
+            # Make sure all parameters have the same dtype
+            paired = zip(first_pipe.model.named_parameters(), second_pipe.model.named_parameters())
+            for (k1, v1), (k2, v2) in paired:
+                self.assertEqual(k1, k2)
+                self.assertEqual(v1.dtype, v2.dtype)
+
+        # Check that it works for all dtypes
+        dtypes = ["float16", "bfloat16", "float32", "auto", torch.float16, torch.bfloat16, torch.float32]
+
+        for task in get_supported_tasks():
+            for dtype in dtypes:
+                # Given directly as a keyword argument of `pipeline` ...
+                pipe_torch_dtype = pipeline(task, torch_dtype=dtype)
+                pipe_dtype = pipeline(task, dtype=dtype)
+                assert_same_parameter_dtypes(pipe_torch_dtype, pipe_dtype)
+
+                # ... and given through `model_kwargs`.
+                pipe_torch_dtype = pipeline(task, model_kwargs={"torch_dtype": dtype})
+                pipe_dtype = pipeline(task, model_kwargs={"dtype": dtype})
+                assert_same_parameter_dtypes(pipe_torch_dtype, pipe_dtype)
+
+    def check_default_pipeline(self, task, set_seed_fn, check_models_equal_fn):
+        """
+        Check that `pipeline(task)` loads the same weights as the task's default checkpoint loaded by hand.
+
+        Args:
+            task: Key of `SUPPORTED_TASKS` to check.
+            set_seed_fn: Zero-argument callable invoked before every model or pipeline load.
+            check_models_equal_fn: Callable receiving `(pipeline_model, reference_model)` and returning a truthy
+                value when both models match.
+
+        Skips the current test when the task lists no PyTorch model class. If none of the task's model classes
+        can load the default checkpoint, the `ValueError` of the last attempt is raised.
+        """
+        from transformers.pipelines import SUPPORTED_TASKS, pipeline
+
+        task_dict = SUPPORTED_TASKS[task]
+        relevant_auto_classes = task_dict["pt"]
+
+        if len(relevant_auto_classes) == 0:
+            # task has no default
+            self.skipTest(f"{task} in pytorch has no default")
+
+        # retrieve correct model ids
+        model_id, revision = task_dict["default"]["model"]
+
+        # Load the default model by hand. The first auto class is possibly not compatible with the model, so the
+        # candidates are tried in order. Walking the whole tuple, instead of hard-coding index 1 as the fallback,
+        # avoids an `IndexError` hiding the real `ValueError` when a task only lists one class.
+        model = None
+        last_error = None
+        for auto_model_cls in relevant_auto_classes:
+            set_seed_fn()
+            try:
+                model = auto_model_cls.from_pretrained(model_id, revision=revision)
+            except ValueError as error:
+                last_error = error
+            else:
+                break
+        if model is None:
+            raise last_error
+
+        # load default pipeline
+        set_seed_fn()
+        default_pipeline = pipeline(task)
+
+        # compare pipeline model with default model
+        models_are_equal = check_models_equal_fn(default_pipeline.model, model)
+        self.assertTrue(models_are_equal, f"{task} model doesn't match pipeline.")
+
+    def check_models_equal_pt(self, model1, model2):
+        """
+        Return `True` when both models have the same number of parameters and every pair, taken in
+        `parameters()` order, holds identical values.
+        """
+        params1 = list(model1.parameters())
+        params2 = list(model2.parameters())
+
+        # `zip` stops at the shorter sequence, so a model with extra parameters would otherwise compare equal.
+        if len(params1) != len(params2):
+            return False
+
+        for model1_p, model2_p in zip(params1, params2):
+            if model1_p.data.ne(model2_p.data).sum() > 0:
+                # One differing element is enough; no need to look at the remaining parameters.
+                return False
+
+        return True
+
+
+class CustomPipeline(Pipeline):
+    """Minimal `Pipeline` subclass: tokenizes text, calls the model and returns softmaxed logits as numpy."""
+
+    def _sanitize_parameters(self, **kwargs):
+        # Only `maybe_arg` is forwarded, and only to `preprocess`; the other two kwarg dicts stay empty.
+        preprocess_kwargs = {key: kwargs[key] for key in ("maybe_arg",) if key in kwargs}
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, text, maybe_arg=2):
+        # `maybe_arg` is accepted but not used.
+        return self.tokenizer(text, return_tensors="pt")
+
+    def _forward(self, model_inputs):
+        return self.model(**model_inputs)
+
+    def postprocess(self, model_outputs):
+        probabilities = model_outputs["logits"].softmax(-1)
+        return probabilities.numpy()
+
+
+@is_pipeline_test
+class CustomPipelineTest(unittest.TestCase):
+    def test_warning_logs(self):
+        """Registering a pipeline under an already registered task logs an "already registered" message."""
+        # The verbosity is a process-wide setting: remember it so the debug level does not leak into other tests.
+        previous_verbosity = transformers_logging.get_verbosity()
+        transformers_logging.set_verbosity_debug()
+        try:
+            logger_ = transformers_logging.get_logger("transformers.pipelines.base")
+
+            alias = "text-classification"
+            # Get the original task, so we can restore it at the end.
+            # (otherwise the subsequent tests in `TextClassificationPipelineTests` will fail)
+            _, original_task, _ = PIPELINE_REGISTRY.check_task(alias)
+
+            try:
+                with CaptureLogger(logger_) as cm:
+                    PIPELINE_REGISTRY.register_pipeline(alias, PairClassificationPipeline)
+                self.assertIn(f"{alias} is already registered", cm.out)
+            finally:
+                # restore
+                PIPELINE_REGISTRY.supported_tasks[alias] = original_task
+        finally:
+            transformers_logging.set_verbosity(previous_verbosity)
+
+    def test_register_pipeline(self):
+        """`register_pipeline` stores the implementation, model classes, type and default of a new task."""
+        task_name = "custom-text-classification"
+        default = {"model": ("hf-internal-testing/tiny-random-distilbert", "2ef615d")}
+
+        try:
+            PIPELINE_REGISTRY.register_pipeline(
+                task_name,
+                pipeline_class=PairClassificationPipeline,
+                pt_model=AutoModelForSequenceClassification if is_torch_available() else None,
+                default=default,
+                type="text",
+            )
+            # `assertIn` instead of a bare `assert`: it is not stripped under `-O` and reports both operands.
+            self.assertIn(task_name, PIPELINE_REGISTRY.get_supported_tasks())
+
+            _, task_def, _ = PIPELINE_REGISTRY.check_task(task_name)
+            self.assertEqual(task_def["pt"], (AutoModelForSequenceClassification,) if is_torch_available() else ())
+            self.assertEqual(task_def["type"], "text")
+            self.assertEqual(task_def["impl"], PairClassificationPipeline)
+            self.assertEqual(task_def["default"], default)
+        finally:
+            # Clean registry for next tests, even when one of the assertions above failed.
+            PIPELINE_REGISTRY.supported_tasks.pop(task_name, None)
+
+    @require_torch
+    def test_dynamic_pipeline(self):
+        """
+        A custom pipeline saved with `save_pretrained` can be reloaded from disk with `trust_remote_code=True`,
+        while the same directory still loads as a regular text-classification pipeline without it.
+        """
+        PIPELINE_REGISTRY.register_pipeline(
+            "pair-classification",
+            pipeline_class=PairClassificationPipeline,
+            pt_model=AutoModelForSequenceClassification if is_torch_available() else None,
+        )
+
+        try:
+            classifier = pipeline("pair-classification", model="hf-internal-testing/tiny-random-bert")
+        finally:
+            # Clean registry as we won't need the pipeline to be in it for the rest to work. Done in `finally`
+            # so the custom task does not stay registered for other tests if the load above fails.
+            PIPELINE_REGISTRY.supported_tasks.pop("pair-classification", None)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            classifier.save_pretrained(tmp_dir)
+            # checks
+            self.assertDictEqual(
+                classifier.model.config.custom_pipelines,
+                {
+                    "pair-classification": {
+                        "impl": "custom_pipeline.PairClassificationPipeline",
+                        "pt": ("AutoModelForSequenceClassification",) if is_torch_available() else (),
+                    }
+                },
+            )
+            # Fails if the user forgets to pass along `trust_remote_code=True`
+            with self.assertRaises(ValueError):
+                _ = pipeline(model=tmp_dir)
+
+            new_classifier = pipeline(model=tmp_dir, trust_remote_code=True)
+            # Using trust_remote_code=False forces the traditional pipeline tag
+            old_classifier = pipeline("text-classification", model=tmp_dir, trust_remote_code=False)
+        # Can't make an isinstance check because the new_classifier is from the PairClassificationPipeline class of a
+        # dynamic module
+        self.assertEqual(new_classifier.__class__.__name__, "PairClassificationPipeline")
+        self.assertEqual(new_classifier.task, "pair-classification")
+        results = new_classifier("I hate you", second_text="I love you")
+        self.assertDictEqual(
+            nested_simplify(results),
+            {"label": "LABEL_0", "score": 0.505, "logits": [-0.003, -0.024]},
+        )
+
+        self.assertEqual(old_classifier.__class__.__name__, "TextClassificationPipeline")
+        self.assertEqual(old_classifier.task, "text-classification")
+        results = old_classifier("I hate you", text_pair="I love you")
+        self.assertListEqual(
+            nested_simplify(results),
+            [{"label": "LABEL_0", "score": 0.505}],
+        )
+
+    @require_torch
+    def test_cached_pipeline_has_minimum_calls_to_head(self):
+        """Re-creating an already cached pipeline issues a single HEAD request and no GET request."""
+        checkpoint = "hf-internal-testing/tiny-random-bert"
+
+        # Make sure we have cached the pipeline.
+        pipeline("text-classification", model=checkpoint)
+
+        with RequestCounter() as request_counter:
+            pipeline("text-classification", model=checkpoint)
+
+        self.assertEqual(request_counter["GET"], 0)
+        self.assertEqual(request_counter["HEAD"], 1)
+        self.assertEqual(request_counter.total_calls, 1)
+
+    @require_torch
+    def test_chunk_pipeline_batching_single_file(self):
+        # Make sure we have cached the pipeline.
+        pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC")
+        ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        audio = ds[40]["audio"]["array"]
+
+        pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC")
+        # For some reason scoping doesn't work if not using `self.`
+        self.COUNT = 0
+        forward = pipe.model.forward
+
+        def new_forward(*args, **kwargs):
+            self.COUNT += 1
+            return forward(*args, **kwargs)
+
+        pipe.model.forward = new_forward
+
+        for out in pipe(audio, return_timestamps="char", chunk_length_s=3, stride_length_s=[1, 1], batch_size=1024):
+            pass
+
+        self.assertEqual(self.COUNT, 1)
+
+    @require_torch
+    def test_custom_code_with_string_tokenizer(self):
+        # This test checks for an edge case - tokenizer loading used to fail when using a custom code model
+        # with a separate tokenizer that was passed as a repo name rather than a tokenizer object.
+        # See https://github.com/huggingface/transformers/issues/31669
+        text_generator = pipeline(
+            "text-generation",
+            model="hf-internal-testing/tiny-random-custom-architecture",
+            tokenizer="hf-internal-testing/tiny-random-custom-architecture",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(text_generator, TextGenerationPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_feature_extractor(self):
+        speech_recognizer = pipeline(
+            "automatic-speech-recognition",
+            model="hf-internal-testing/fake-custom-wav2vec2",
+            feature_extractor="hf-internal-testing/fake-custom-wav2vec2",
+            tokenizer="facebook/wav2vec2-base-960h",  # Test workaround - the pipeline requires a tokenizer
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(speech_recognizer, AutomaticSpeechRecognitionPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_preprocessor(self):
+        mask_generator = pipeline(
+            "mask-generation",
+            model="hf-internal-testing/fake-custom-sam",
+            processor="hf-internal-testing/fake-custom-sam",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(mask_generator, MaskGenerationPipeline)  # Assert successful loading
+
+
+@require_torch
+@is_staging_test
+class DynamicPipelineTester(unittest.TestCase):
+    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "I", "love", "hate", "you"]
+
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            delete_repo(token=cls._token, repo_id="test-dynamic-pipeline")
+        except HfHubHTTPError:
+            pass
+
+    @unittest.skip("Broken, TODO @Yih-Dar")
+    def test_push_to_hub_dynamic_pipeline(self):
+        from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
+
+        PIPELINE_REGISTRY.register_pipeline(
+            "pair-classification",
+            pipeline_class=PairClassificationPipeline,
+            pt_model=AutoModelForSequenceClassification,
+        )
+
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        model = BertForSequenceClassification(config).eval()
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            vocab_file = os.path.join(tmp_dir, "vocab.txt")
+            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
+            tokenizer = BertTokenizer(vocab_file)
+
+            classifier = pipeline("pair-classification", model=model, tokenizer=tokenizer)
+
+            # Clean registry as we won't need the pipeline to be in it for the rest to work.
+            del PIPELINE_REGISTRY.supported_tasks["pair-classification"]
+
+            classifier.save_pretrained(tmp_dir)
+            # checks if the configuration has been added after calling the save_pretrained method
+            self.assertDictEqual(
+                classifier.model.config.custom_pipelines,
+                {
+                    "pair-classification": {
+                        "impl": "custom_pipeline.PairClassificationPipeline",
+                        "pt": ("AutoModelForSequenceClassification",),
+                    }
+                },
+            )
+            # use push_to_hub method to push the pipeline
+            classifier.push_to_hub(f"{USER}/test-dynamic-pipeline", token=self._token)
+
+        # Fails if the user forget to pass along `trust_remote_code=True`
+        with self.assertRaises(ValueError):
+            _ = pipeline(model=f"{USER}/test-dynamic-pipeline")
+
+        new_classifier = pipeline(model=f"{USER}/test-dynamic-pipeline", trust_remote_code=True)
+        # Can't make an isinstance check because the new_classifier is from the PairClassificationPipeline class of a
+        # dynamic module
+        self.assertEqual(new_classifier.__class__.__name__, "PairClassificationPipeline")
+        # check for tag exitence, tag needs to be added when we are calling a custom pipeline from the hub
+        # useful for cases such as finetuning
+        self.assertDictEqual(
+            new_classifier.model.config.custom_pipelines,
+            {
+                "pair-classification": {
+                    "impl": f"{USER}/test-dynamic-pipeline--custom_pipeline.PairClassificationPipeline",
+                    "pt": ("AutoModelForSequenceClassification",),
+                }
+            },
+        )
+        # test if the pipeline still works after the model is finetuned
+        # (we are actually testing if the pipeline still works from the final repo)
+        # this is where the user/repo--module.class is used for
+        new_classifier.model.push_to_hub(repo_name=f"{USER}/test-pipeline-for-a-finetuned-model", token=self._token)
+        del new_classifier  # free up memory
+        new_classifier = pipeline(model=f"{USER}/test-pipeline-for-a-finetuned-model", trust_remote_code=True)
+
+        results = classifier("I hate you", second_text="I love you")
+        new_results = new_classifier("I hate you", second_text="I love you")
+        self.assertDictEqual(nested_simplify(results), nested_simplify(new_results))
+
+        # Using trust_remote_code=False forces the traditional pipeline tag
+        old_classifier = pipeline(
+            "text-classification", model=f"{USER}/test-dynamic-pipeline", trust_remote_code=False
+        )
+        self.assertEqual(old_classifier.__class__.__name__, "TextClassificationPipeline")
+        self.assertEqual(old_classifier.task, "text-classification")
+        new_results = old_classifier("I hate you", text_pair="I love you")
+        self.assertListEqual(
+            nested_simplify([{"label": results["label"], "score": results["score"]}]), nested_simplify(new_results)
+        )
--- a/tests/pipelines/test_pipelines_depth_estimation.py
+++ b/tests/pipelines/test_pipelines_depth_estimation.py
@@ -0,0 +1,161 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import datasets
+from huggingface_hub import DepthEstimationOutput
+from huggingface_hub.utils import insecure_hashlib
+
+from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
+from transformers.pipelines import DepthEstimationPipeline, pipeline
+from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
+    is_pipeline_test,
+    nested_simplify,
+    require_timm,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    # Placeholder used when `is_vision_available()` is false, so that module-level references to `Image`
+    # (such as the annotation on `hashimage`) still resolve at import time.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+def hashimage(image: Image) -> str:
+    m = insecure_hashlib.md5(image.tobytes())
+    return m.hexdigest()
+
+
+@is_pipeline_test
+@require_vision
+@require_timm
+@require_torch
+class DepthEstimationPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING
+    _dataset = None
+
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            # we use revision="refs/pr/1" until the PR is merged
+            # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
+            )
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        depth_estimator = DepthEstimationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        return depth_estimator, [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+
+    def run_pipeline_test(self, depth_estimator, examples):
+        self._load_dataset()
+        outputs = depth_estimator("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        self.assertEqual({"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)}, outputs)
+
+        outputs = depth_estimator(
+            [
+                Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+                # RGBA
+                self._dataset[0]["image"],
+                # LA
+                self._dataset[1]["image"],
+                # L
+                self._dataset[2]["image"],
+            ]
+        )
+        self.assertEqual(
+            [
+                {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)},
+                {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)},
+                {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)},
+                {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)},
+                {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)},
+            ],
+            outputs,
+        )
+
+        for single_output in outputs:
+            compare_pipeline_output_to_hub_spec(single_output, DepthEstimationOutput)
+
+    @slow
+    @require_torch
+    def test_large_model_pt(self):
+        model_id = "Intel/dpt-large"
+        depth_estimator = pipeline("depth-estimation", model=model_id)
+        outputs = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg")
+        outputs["depth"] = hashimage(outputs["depth"])
+
+        # This seems flaky.
+        # self.assertEqual(outputs["depth"], "1a39394e282e9f3b0741a90b9f108977")
+        self.assertEqual(nested_simplify(outputs["predicted_depth"].max().item()), 29.306)
+        self.assertEqual(nested_simplify(outputs["predicted_depth"].min().item()), 2.662)
+
+    @require_torch
+    def test_small_model_pt(self):
+        # This is highly irregular to have no small tests.
+        self.skipTest(reason="There is not hf-internal-testing tiny model for either GLPN nor DPT")
+
+    @require_torch
+    def test_multiprocess(self):
+        depth_estimator = pipeline(
+            model="hf-internal-testing/tiny-random-DepthAnythingForDepthEstimation",
+            num_workers=2,
+        )
+        outputs = depth_estimator(
+            [
+                "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            ]
+        )
+        self.assertEqual(
+            [
+                {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)},
+                {"predicted_depth": ANY(torch.Tensor), "depth": ANY(Image.Image)},
+            ],
+            outputs,
+        )
--- a/tests/pipelines/test_pipelines_document_question_answering.py
+++ b/tests/pipelines/test_pipelines_document_question_answering.py
@@ -0,0 +1,424 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import (
+    MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
+    AutoTokenizer,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.pipelines import DocumentQuestionAnsweringPipeline, pipeline
+from transformers.pipelines.document_question_answering import apply_tesseract
+from transformers.testing_utils import (
+    is_pipeline_test,
+    nested_simplify,
+    require_detectron2,
+    require_pytesseract,
+    require_torch,
+    require_torch_bf16,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers.image_utils import load_image
+else:
+
+    # Stand-ins defined when `is_vision_available()` is false, so that the names `Image` and `load_image`
+    # exist at module level either way.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+    def load_image(_):
+        return None
+
+
+# This is a pinned image from a specific revision of a document question answering space, hosted by HuggingFace,
+# so we can expect it to be available.
+INVOICE_URL = (
+    "https://huggingface.co/spaces/impira/docquery/resolve/2f6c96314dc84dfda62d40de9da55f2f5165d403/invoice.png"
+)
+
+
+@is_pipeline_test
+@require_torch
+@require_vision
+class DocumentQuestionAnsweringPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING
+
+    @require_pytesseract
+    @require_vision
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        dqa_pipeline = DocumentQuestionAnsweringPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+            max_new_tokens=20,
+        )
+
+        image = INVOICE_URL
+        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
+        question = "What is the placebo?"
+        examples = [
+            {
+                "image": load_image(image),
+                "question": question,
+            },
+            {
+                "image": image,
+                "question": question,
+            },
+            {
+                "image": image,
+                "question": question,
+                "word_boxes": word_boxes,
+            },
+        ]
+        return dqa_pipeline, examples
+
+    def run_pipeline_test(self, dqa_pipeline, examples):
+        outputs = dqa_pipeline(examples, top_k=2)
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)},
+                    {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)},
+                ]
+            ]
+            * 3,
+        )
+
+    @require_torch
+    @require_detectron2
+    @require_pytesseract
+    def test_small_model_pt(self):
+        dqa_pipeline = pipeline(
+            "document-question-answering", model="hf-internal-testing/tiny-random-layoutlmv2-for-dqa-test"
+        )
+        image = INVOICE_URL
+        question = "How many cats are there?"
+
+        expected_output = [
+            {"score": 0.0001, "answer": "oy 2312/2019", "start": 38, "end": 39},
+            {"score": 0.0001, "answer": "oy 2312/2019 DUE", "start": 38, "end": 40},
+        ]
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
+
+        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
+
+        # This image does not detect ANY text in it, meaning layoutlmv2 should fail.
+        # Empty answer probably
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(outputs, [])
+
+        # We can optionally pass directly the words and bounding boxes
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        words = []
+        boxes = []
+        outputs = dqa_pipeline(image=image, question=question, words=words, boxes=boxes, top_k=2)
+        self.assertEqual(outputs, [])
+
+    @require_torch
+    @require_torch_bf16
+    @require_detectron2
+    @require_pytesseract
+    def test_small_model_pt_bf16(self):
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="hf-internal-testing/tiny-random-layoutlmv2-for-dqa-test",
+            dtype=torch.bfloat16,
+        )
+        image = INVOICE_URL
+        question = "How many cats are there?"
+
+        expected_output = [
+            {"score": 0.0001, "answer": "oy 2312/2019", "start": 38, "end": 39},
+            {"score": 0.0001, "answer": "oy 2312/2019 DUE", "start": 38, "end": 40},
+        ]
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
+
+        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
+
+        # This image does not detect ANY text in it, meaning layoutlmv2 should fail.
+        # Empty answer probably
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(outputs, [])
+
+        # We can optionally pass directly the words and bounding boxes
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        words = []
+        boxes = []
+        outputs = dqa_pipeline(image=image, question=question, words=words, boxes=boxes, top_k=2)
+        self.assertEqual(outputs, [])
+
+    # 	 TODO: Enable this once hf-internal-testing/tiny-random-donut is implemented
+    #    @require_torch
+    #    def test_small_model_pt_donut(self):
+    #        dqa_pipeline = pipeline("document-question-answering", model="hf-internal-testing/tiny-random-donut")
+    #        # dqa_pipeline = pipeline("document-question-answering", model="../tiny-random-donut")
+    #        image = "https://templates.invoicehome.com/invoice-template-us-neat-750px.png"
+    #        question = "How many cats are there?"
+    #
+    #        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+    #        self.assertEqual(
+    #            nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
+    #        )
+
+    @slow
+    @require_torch
+    @require_detectron2
+    @require_pytesseract
+    def test_large_model_pt(self):
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa",
+            revision="9977165",
+        )
+        image = INVOICE_URL
+        question = "What is the invoice number?"
+
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9944, "answer": "us-001", "start": 16, "end": 16},
+                {"score": 0.0009, "answer": "us-001", "start": 16, "end": 16},
+            ],
+        )
+
+        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9944, "answer": "us-001", "start": 16, "end": 16},
+                {"score": 0.0009, "answer": "us-001", "start": 16, "end": 16},
+            ],
+        )
+
+        outputs = dqa_pipeline(
+            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [
+                    {"score": 0.9944, "answer": "us-001", "start": 16, "end": 16},
+                    {"score": 0.0009, "answer": "us-001", "start": 16, "end": 16},
+                ],
+            ]
+            * 2,
+        )
+
+    @slow
+    @require_torch
+    @require_detectron2
+    @require_pytesseract
+    def test_large_model_pt_chunk(self):
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa",
+            revision="9977165",
+            max_seq_len=50,
+        )
+        image = INVOICE_URL
+        question = "What is the invoice number?"
+
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9974, "answer": "1110212019", "start": 23, "end": 23},
+                {"score": 0.9948, "answer": "us-001", "start": 16, "end": 16},
+            ],
+        )
+
+        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9974, "answer": "1110212019", "start": 23, "end": 23},
+                {"score": 0.9948, "answer": "us-001", "start": 16, "end": 16},
+            ],
+        )
+
+        outputs = dqa_pipeline(
+            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [
+                    {"score": 0.9974, "answer": "1110212019", "start": 23, "end": 23},
+                    {"score": 0.9948, "answer": "us-001", "start": 16, "end": 16},
+                ]
+            ]
+            * 2,
+        )
+
+    @slow
+    @require_torch
+    @require_pytesseract
+    @require_vision
+    def test_large_model_pt_layoutlm(self):
+        tokenizer = AutoTokenizer.from_pretrained(
+            "impira/layoutlm-document-qa", revision="3dc6de3", add_prefix_space=True
+        )
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="impira/layoutlm-document-qa",
+            tokenizer=tokenizer,
+            revision="3dc6de3",
+        )
+        image = INVOICE_URL
+        question = "What is the invoice number?"
+
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=3),
+            [
+                {"score": 0.425, "answer": "us-001", "start": 16, "end": 16},
+                {"score": 0.082, "answer": "1110212019", "start": 23, "end": 23},
+            ],
+        )
+
+        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=3),
+            [
+                {"score": 0.425, "answer": "us-001", "start": 16, "end": 16},
+                {"score": 0.082, "answer": "1110212019", "start": 23, "end": 23},
+            ],
+        )
+
+        outputs = dqa_pipeline(
+            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=3),
+            [
+                [
+                    {"score": 0.425, "answer": "us-001", "start": 16, "end": 16},
+                    {"score": 0.082, "answer": "1110212019", "start": 23, "end": 23},
+                ]
+            ]
+            * 2,
+        )
+
+        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
+
+        # This model should also work if `image` is set to None
+        outputs = dqa_pipeline({"image": None, "word_boxes": word_boxes, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=3),
+            [
+                {"score": 0.425, "answer": "us-001", "start": 16, "end": 16},
+                {"score": 0.082, "answer": "1110212019", "start": 23, "end": 23},
+            ],
+        )
+
+    @slow
+    @require_torch
+    @require_pytesseract
+    @require_vision
+    def test_large_model_pt_layoutlm_chunk(self):
+        tokenizer = AutoTokenizer.from_pretrained(
+            "impira/layoutlm-document-qa", revision="3dc6de3", add_prefix_space=True
+        )
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="impira/layoutlm-document-qa",
+            tokenizer=tokenizer,
+            revision="3dc6de3",
+            max_seq_len=50,
+        )
+        image = INVOICE_URL
+        question = "What is the invoice number?"
+
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9999, "answer": "us-001", "start": 16, "end": 16},
+                {"score": 0.9998, "answer": "us-001", "start": 16, "end": 16},
+            ],
+        )
+
+        outputs = dqa_pipeline(
+            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [
+                    {"score": 0.9999, "answer": "us-001", "start": 16, "end": 16},
+                    {"score": 0.9998, "answer": "us-001", "start": 16, "end": 16},
+                ]
+            ]
+            * 2,
+        )
+
+        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
+
+        # This model should also work if `image` is set to None
+        outputs = dqa_pipeline({"image": None, "word_boxes": word_boxes, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9999, "answer": "us-001", "start": 16, "end": 16},
+                {"score": 0.9998, "answer": "us-001", "start": 16, "end": 16},
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_large_model_pt_donut(self):
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="naver-clova-ix/donut-base-finetuned-docvqa",
+            tokenizer=AutoTokenizer.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa"),
+            image_processor="naver-clova-ix/donut-base-finetuned-docvqa",
+        )
+
+        image = INVOICE_URL
+        question = "What is the invoice number?"
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(nested_simplify(outputs, decimals=4), [{"answer": "us-001"}])
--- a/tests/pipelines/test_pipelines_feature_extraction.py
+++ b/tests/pipelines/test_pipelines_feature_extraction.py
@@ -0,0 +1,158 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from transformers import (
+    FEATURE_EXTRACTOR_MAPPING,
+    IMAGE_PROCESSOR_MAPPING,
+    MODEL_MAPPING,
+    FeatureExtractionPipeline,
+    LxmertConfig,
+    is_torch_available,
+    pipeline,
+)
+from transformers.testing_utils import is_pipeline_test, nested_simplify, require_torch
+
+
+if is_torch_available():
+    import torch
+
+
+@is_pipeline_test
+class FeatureExtractionPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_MAPPING
+
+    @require_torch
+    def test_small_model_pt(self):
+        """Compare the features of `hf-internal-testing/tiny-random-distilbert` for a fixed sentence with reference values."""
+        feature_extractor = pipeline(task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert")
+        outputs = feature_extractor("This is a test")
+        # `nested_simplify` is applied to the raw output before comparing it with the hard-coded reference below.
+        self.assertEqual(
+            nested_simplify(outputs),
+            [[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 
0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]])  # fmt: skip
+
+    @require_torch
+    def test_tokenization_small_model_pt(self):
+        """Check the output for several combinations of `tokenize_kwargs` and the `truncation` argument."""
+        feature_extractor = pipeline(task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert")
+        # No tokenizer arguments at all: the output must match the hard-coded reference values.
+        outputs = feature_extractor("This is a test")
+        self.assertEqual(
+            nested_simplify(outputs),
+            [[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 
0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]])  # fmt: skip
+
+        # Tokenizer arguments forwarded through `tokenize_kwargs`: with `max_length=3` the squeezed output
+        # is expected to have 3 rows of 32 values.
+        tokenize_kwargs = {"max_length": 3}
+        outputs = feature_extractor("This is a test", tokenize_kwargs=tokenize_kwargs)
+        self.assertEqual(np.squeeze(outputs).shape, (3, 32))
+
+        # Five inputs with truncation, padding and `max_length=4` all given inside `tokenize_kwargs`.
+        tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 4}
+        outputs = feature_extractor(
+            ["This is a test", "This", "This is", "This is a", "This is a test test test test"],
+            tokenize_kwargs=tokenize_kwargs,
+        )
+        self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32))
+
+        # Same inputs, but `truncation` is passed as a direct argument instead: the expected shape is unchanged.
+        tokenize_kwargs = {"padding": True, "max_length": 4}
+        outputs = feature_extractor(
+            ["This is a test", "This", "This is", "This is a", "This is a test test test test"],
+            truncation=True,
+            tokenize_kwargs=tokenize_kwargs,
+        )
+        self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32))
+
+        # Passing `truncation` both directly and inside `tokenize_kwargs` must raise a ValueError.
+        tokenize_kwargs = {"truncation": True}
+        with self.assertRaises(ValueError):
+            _ = feature_extractor(
+                ["This is a test", "This", "This is", "This is a", "This is a test test test test"],
+                truncation=True,
+                tokenize_kwargs=tokenize_kwargs,
+            )
+
+    @require_torch
+    def test_return_tensors_pt(self):
+        feature_extractor = pipeline(task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert")
+        outputs = feature_extractor("This is a test", return_tensors=True)
+        self.assertTrue(torch.is_tensor(outputs))
+
+    def get_shape(self, input_, shape=None):
+        if shape is None:
+            shape = []
+        if isinstance(input_, list):
+            subshapes = [self.get_shape(in_, shape) for in_ in input_]
+            if all(s == 0 for s in subshapes):
+                shape.append(len(input_))
+            else:
+                subshape = subshapes[0]
+                shape = [len(input_), *subshape]
+        elif isinstance(input_, float):
+            return 0
+        else:
+            raise TypeError("We expect lists of floats, nothing else")
+        return shape
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        if tokenizer is None:
+            self.skipTest(reason="No tokenizer")
+        elif (
+            type(model.config) in FEATURE_EXTRACTOR_MAPPING
+            or isinstance(model.config, LxmertConfig)
+            or type(model.config) in IMAGE_PROCESSOR_MAPPING
+        ):
+            self.skipTest(
+                reason="This is a bimodal model, we need to find a more consistent way to switch on those models."
+            )
+        elif model.config.is_encoder_decoder:
+            self.skipTest(
+                """encoder_decoder models are trickier for this pipeline.
+                Do we want encoder + decoder inputs to get some features?
+                Do we want encoder only features ?
+                For now ignore those.
+                """
+            )
+        feature_extractor_pipeline = FeatureExtractionPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        return feature_extractor_pipeline, ["This is a test", "This is another test"]
+
+    def run_pipeline_test(self, feature_extractor, examples):
+        outputs = feature_extractor("This is a test")
+
+        shape = self.get_shape(outputs)
+        self.assertEqual(shape[0], 1)
+
+        # If we send too small input
+        # there's a bug within FunnelModel (output with shape [1, 4, 2, 1] doesn't match the broadcast shape [1, 4, 2, 2])
+        outputs = feature_extractor(["This is a test", "Another longer test"])
+        shape = self.get_shape(outputs)
+        self.assertEqual(shape[0], 2)
+
+        outputs = feature_extractor("This is a test" * 100, truncation=True)
+        shape = self.get_shape(outputs)
+        self.assertEqual(shape[0], 1)
--- a/tests/pipelines/test_pipelines_fill_mask.py
+++ b/tests/pipelines/test_pipelines_fill_mask.py
@@ -0,0 +1,412 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+from transformers import MODEL_FOR_MASKED_LM_MAPPING, FillMaskPipeline, pipeline
+from transformers.pipelines import PipelineException
+from transformers.testing_utils import (
+    backend_empty_cache,
+    is_pipeline_test,
+    is_torch_available,
+    nested_simplify,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+
+from .test_pipelines_common import ANY
+
+
+@is_pipeline_test
+class FillMaskPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_MASKED_LM_MAPPING
+
+    def tearDown(self):
+        super().tearDown()
+        # clean-up as much as possible GPU memory occupied by PyTorch
+        gc.collect()
+        if is_torch_available():
+            backend_empty_cache(torch_device)
+
+    @require_torch
+    def test_small_model_pt(self):
+        """Exact-output checks for `sshleifer/tiny-distilroberta-base` with a pipeline-level `top_k=2`."""
+        unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2)
+
+        # Single mask: a flat list with two predictions is expected.
+        outputs = unmasker("My name is <mask>")
+        self.assertEqual(
+            nested_simplify(outputs, decimals=6),
+            [
+                {"sequence": "My name is Maul", "score": 2.2e-05, "token": 35676, "token_str": " Maul"},
+                {"sequence": "My name isELS", "score": 2.2e-05, "token": 16416, "token_str": "ELS"},
+            ],
+        )
+
+        # A different sentence is expected to yield the same two tokens and scores.
+        outputs = unmasker("The largest city in France is <mask>")
+        self.assertEqual(
+            nested_simplify(outputs, decimals=6),
+            [
+                {
+                    "sequence": "The largest city in France is Maul",
+                    "score": 2.2e-05,
+                    "token": 35676,
+                    "token_str": " Maul",
+                },
+                {"sequence": "The largest city in France isELS", "score": 2.2e-05, "token": 16416, "token_str": "ELS"},
+            ],
+        )
+
+        # With `targets` and a call-level `top_k=3`, three predictions are expected. The target " Teven"
+        # shows up as the token " Te" in the expected output.
+        outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=6),
+            [
+                {"sequence": "My name is Patrick", "score": 2.1e-05, "token": 3499, "token_str": " Patrick"},
+                {"sequence": "My name is Te", "score": 2e-05, "token": 2941, "token_str": " Te"},
+                {"sequence": "My name is Clara", "score": 2e-05, "token": 13606, "token_str": " Clara"},
+            ],
+        )
+
+        # Two masks: one list of predictions per mask is expected, and each `sequence` keeps the special
+        # tokens as well as the other, still unfilled, mask.
+        outputs = unmasker("My name is <mask> <mask>", top_k=2)
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=6),
+            [
+                [
+                    {
+                        "score": 2.2e-05,
+                        "token": 35676,
+                        "token_str": " Maul",
+                        "sequence": "<s>My name is Maul<mask></s>",
+                    },
+                    {"score": 2.2e-05, "token": 16416, "token_str": "ELS", "sequence": "<s>My name isELS<mask></s>"},
+                ],
+                [
+                    {
+                        "score": 2.2e-05,
+                        "token": 35676,
+                        "token_str": " Maul",
+                        "sequence": "<s>My name is<mask> Maul</s>",
+                    },
+                    {"score": 2.2e-05, "token": 16416, "token_str": "ELS", "sequence": "<s>My name is<mask>ELS</s>"},
+                ],
+            ],
+        )
+
+    @require_torch_accelerator
+    def test_fp16_casting(self):
+        pipe = pipeline(
+            "fill-mask",
+            model="hf-internal-testing/tiny-random-distilbert",
+            device=torch_device,
+        )
+
+        # convert model to fp16
+        pipe.model.half()
+
+        response = pipe("Paris is the [MASK] of France.")
+        # We actually don't care about the result, we just want to make sure
+        # it works, meaning the float16 tensor got casted back to float32
+        # for postprocessing.
+        self.assertIsInstance(response, list)
+
+    @slow
+    @require_torch
+    def test_large_model_pt(self):
+        unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2)
+        self.run_large_test(unmasker)
+
+    def run_large_test(self, unmasker):
+        """
+        Exact-output checks for a fill-mask pipeline.
+
+        Within this file the method is called from `test_large_model_pt`, which passes a
+        `distilbert/distilroberta-base` pipeline created with `top_k=2`.
+        """
+        outputs = unmasker("My name is <mask>")
+        self.assertEqual(
+            nested_simplify(outputs),
+            [
+                {"sequence": "My name is John", "score": 0.008, "token": 610, "token_str": " John"},
+                {"sequence": "My name is Chris", "score": 0.007, "token": 1573, "token_str": " Chris"},
+            ],
+        )
+        outputs = unmasker("The largest city in France is <mask>")
+        self.assertEqual(
+            nested_simplify(outputs),
+            [
+                {
+                    "sequence": "The largest city in France is Paris",
+                    "score": 0.251,
+                    "token": 2201,
+                    "token_str": " Paris",
+                },
+                {
+                    "sequence": "The largest city in France is Lyon",
+                    "score": 0.214,
+                    "token": 12790,
+                    "token_str": " Lyon",
+                },
+            ],
+        )
+
+        # With `targets` and a call-level `top_k=3`, three predictions are expected. The target " Teven"
+        # shows up as the token " Te" in the expected output.
+        outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3)
+        self.assertEqual(
+            nested_simplify(outputs),
+            [
+                {"sequence": "My name is Patrick", "score": 0.005, "token": 3499, "token_str": " Patrick"},
+                {"sequence": "My name is Clara", "score": 0.000, "token": 13606, "token_str": " Clara"},
+                {"sequence": "My name is Te", "score": 0.000, "token": 2941, "token_str": " Te"},
+            ],
+        )
+
+        # Very long input (the filler sentence repeated 100 times), passed with truncation enabled
+        # through `tokenizer_kwargs`.
+        dummy_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit," * 100
+        outputs = unmasker(
+            "My name is <mask>" + dummy_str,
+            tokenizer_kwargs={"truncation": True},
+        )
+        simplified = nested_simplify(outputs, decimals=4)
+        # Only the first 100 characters of each returned sequence are compared.
+        self.assertEqual(
+            [{"sequence": x["sequence"][:100]} for x in simplified],
+            [
+                {"sequence": f"My name is,{dummy_str}"[:100]},
+                {"sequence": f"My name is:,{dummy_str}"[:100]},
+            ],
+        )
+        # The remaining keys (everything except "sequence") are compared in full.
+        self.assertEqual(
+            [{k: x[k] for k in x if k != "sequence"} for x in simplified],
+            [
+                {"score": 0.2819, "token": 6, "token_str": ","},
+                {"score": 0.0954, "token": 46686, "token_str": ":,"},
+            ],
+        )
+
+    @require_torch
+    def test_model_no_pad_pt(self):
+        unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base")
+        unmasker.tokenizer.pad_token_id = None
+        unmasker.tokenizer.pad_token = None
+        self.run_pipeline_test(unmasker, [])
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        if tokenizer is None or tokenizer.mask_token_id is None:
+            self.skipTest(reason="The provided tokenizer has no mask token, (probably reformer or wav2vec2)")
+
+        fill_masker = FillMaskPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        examples = [
+            f"This is another {tokenizer.mask_token} test",
+        ]
+        return fill_masker, examples
+
+    def run_pipeline_test(self, fill_masker, examples):
+        tokenizer = fill_masker.tokenizer
+        model = fill_masker.model
+
+        outputs = fill_masker(
+            f"This is a {tokenizer.mask_token}",
+        )
+        self.assertEqual(
+            outputs,
+            [
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+            ],
+        )
+
+        outputs = fill_masker([f"This is a {tokenizer.mask_token}"])
+        self.assertEqual(
+            outputs,
+            [
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+            ],
+        )
+
+        outputs = fill_masker([f"This is a {tokenizer.mask_token}", f"Another {tokenizer.mask_token} great test."])
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                ],
+                [
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                ],
+            ],
+        )
+
+        with self.assertRaises(ValueError):
+            fill_masker([None])
+        # No mask_token is not supported
+        with self.assertRaises(PipelineException):
+            fill_masker("This is")
+
+        self.run_test_top_k(model, tokenizer)
+        self.run_test_targets(model, tokenizer)
+        self.run_test_top_k_targets(model, tokenizer)
+        self.fill_mask_with_duplicate_targets_and_top_k(model, tokenizer)
+        self.fill_mask_with_multiple_masks(model, tokenizer)
+
+    def run_test_targets(self, model, tokenizer):
+        vocab = tokenizer.get_vocab()
+        targets = sorted(vocab.keys())[:2]
+        # Pipeline argument
+        fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, targets=targets)
+        outputs = fill_masker(f"This is a {tokenizer.mask_token}")
+        self.assertEqual(
+            outputs,
+            [
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+            ],
+        )
+        target_ids = {vocab[el] for el in targets}
+        self.assertEqual({el["token"] for el in outputs}, target_ids)
+        processed_targets = [tokenizer.decode([x]) for x in target_ids]
+        self.assertEqual({el["token_str"] for el in outputs}, set(processed_targets))
+
+        # Call argument
+        fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
+        outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets)
+        self.assertEqual(
+            outputs,
+            [
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+            ],
+        )
+        target_ids = {vocab[el] for el in targets}
+        self.assertEqual({el["token"] for el in outputs}, target_ids)
+        processed_targets = [tokenizer.decode([x]) for x in target_ids]
+        self.assertEqual({el["token_str"] for el in outputs}, set(processed_targets))
+
+        # Score equivalence
+        outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets)
+        tokens = [top_mask["token_str"] for top_mask in outputs]
+        scores = [top_mask["score"] for top_mask in outputs]
+
+        # For some BPE tokenizers, `</w>` is removed during decoding, so `token_str` won't be the same as in `targets`.
+        if set(tokens) == set(targets):
+            unmasked_targets = fill_masker(f"This is a {tokenizer.mask_token}", targets=tokens)
+            target_scores = [top_mask["score"] for top_mask in unmasked_targets]
+            self.assertEqual(nested_simplify(scores), nested_simplify(target_scores))
+
+        # Raises with invalid
+        with self.assertRaises(ValueError):
+            outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[])
+        # For some tokenizers, `""` is actually in the vocabulary and the expected error won't raised
+        if "" not in tokenizer.get_vocab():
+            with self.assertRaises(ValueError):
+                outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[""])
+            with self.assertRaises(ValueError):
+                outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets="")
+
+    def run_test_top_k(self, model, tokenizer):
+        fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, top_k=2)
+        outputs = fill_masker(f"This is a {tokenizer.mask_token}")
+        self.assertEqual(
+            outputs,
+            [
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+            ],
+        )
+
+        fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
+        outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=2)
+        self.assertEqual(
+            outputs2,
+            [
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+            ],
+        )
+        self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2))
+
+    def run_test_top_k_targets(self, model, tokenizer):
+        vocab = tokenizer.get_vocab()
+        fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
+
+        # top_k=2, ntargets=3
+        targets = sorted(vocab.keys())[:3]
+        outputs = fill_masker(f"This is a {tokenizer.mask_token}", top_k=2, targets=targets)
+
+        # If we use the most probably targets, and filter differently, we should still
+        # have the same results
+        targets2 = [el["token_str"] for el in sorted(outputs, key=lambda x: x["score"], reverse=True)]
+        # For some BPE tokenizers, `</w>` is removed during decoding, so `token_str` won't be the same as in `targets`.
+        if set(targets2).issubset(targets):
+            outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=3, targets=targets2)
+            # They should yield exactly the same result
+            self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2))
+
+    def fill_mask_with_duplicate_targets_and_top_k(self, model, tokenizer):
+        fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
+        vocab = tokenizer.get_vocab()
+        # String duplicates + id duplicates
+        targets = sorted(vocab.keys())[:3]
+        targets = [targets[0], targets[1], targets[0], targets[2], targets[1]]
+        outputs = fill_masker(f"My name is {tokenizer.mask_token}", targets=targets, top_k=10)
+
+        # The target list contains duplicates, so we can't output more
+        # than them
+        self.assertEqual(len(outputs), 3)
+
+    def fill_mask_with_multiple_masks(self, model, tokenizer):
+        fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
+
+        outputs = fill_masker(
+            f"This is a {tokenizer.mask_token} {tokenizer.mask_token} {tokenizer.mask_token}", top_k=2
+        )
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                ],
+                [
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                ],
+                [
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                    {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
+                ],
+            ],
+        )
--- a/tests/pipelines/test_pipelines_image_classification.py
+++ b/tests/pipelines/test_pipelines_image_classification.py
@@ -0,0 +1,297 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import datasets
+from huggingface_hub import ImageClassificationOutputElement
+
+from transformers import (
+    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+    PreTrainedTokenizerBase,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.pipelines import ImageClassificationPipeline, pipeline
+from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
+    is_pipeline_test,
+    nested_simplify,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    # Stand-in that keeps the name `Image` defined when the vision dependencies are not
+    # available. Its `open` accepts any arguments and returns None.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_torch
+@require_vision
+class ImageClassificationPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
+    _dataset = None
+
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            # we use revision="refs/pr/1" until the PR is merged
+            # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
+            )
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        image_classifier = ImageClassificationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+            top_k=2,
+        )
+        examples = [
+            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+        ]
+        return image_classifier, examples
+
+    def run_pipeline_test(self, image_classifier, examples):
+        self._load_dataset()
+        outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png")
+
+        self.assertEqual(
+            outputs,
+            [
+                {"score": ANY(float), "label": ANY(str)},
+                {"score": ANY(float), "label": ANY(str)},
+            ],
+        )
+
+        # Accepts URL + PIL.Image + lists
+        outputs = image_classifier(
+            [
+                Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+                # RGBA
+                self._dataset[0]["image"],
+                # LA
+                self._dataset[1]["image"],
+                # L
+                self._dataset[2]["image"],
+            ]
+        )
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {"score": ANY(float), "label": ANY(str)},
+                    {"score": ANY(float), "label": ANY(str)},
+                ],
+                [
+                    {"score": ANY(float), "label": ANY(str)},
+                    {"score": ANY(float), "label": ANY(str)},
+                ],
+                [
+                    {"score": ANY(float), "label": ANY(str)},
+                    {"score": ANY(float), "label": ANY(str)},
+                ],
+                [
+                    {"score": ANY(float), "label": ANY(str)},
+                    {"score": ANY(float), "label": ANY(str)},
+                ],
+                [
+                    {"score": ANY(float), "label": ANY(str)},
+                    {"score": ANY(float), "label": ANY(str)},
+                ],
+            ],
+        )
+
+        for single_output in outputs:
+            for output_element in single_output:
+                compare_pipeline_output_to_hub_spec(output_element, ImageClassificationOutputElement)
+
+    @require_torch
+    def test_small_model_pt(self):
+        small_model = "hf-internal-testing/tiny-random-vit"
+        image_classifier = pipeline("image-classification", model=small_model)
+
+        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
+        )
+
+        outputs = image_classifier(
+            [
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+            ],
+            top_k=2,
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
+                [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
+            ],
+        )
+
+    def test_custom_tokenizer(self):
+        tokenizer = PreTrainedTokenizerBase()
+
+        # Assert that the pipeline can be initialized with a feature extractor that is not in any mapping
+        image_classifier = pipeline(
+            "image-classification", model="hf-internal-testing/tiny-random-vit", tokenizer=tokenizer
+        )
+
+        self.assertIs(image_classifier.tokenizer, tokenizer)
+
+    @require_torch
+    def test_torch_float16_pipeline(self):
+        image_classifier = pipeline(
+            "image-classification", model="hf-internal-testing/tiny-random-vit", dtype=torch.float16
+        )
+        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=3),
+            [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
+        )
+
+    @require_torch
+    def test_torch_bfloat16_pipeline(self):
+        image_classifier = pipeline(
+            "image-classification", model="hf-internal-testing/tiny-random-vit", dtype=torch.bfloat16
+        )
+        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=3),
+            [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}],
+        )
+
+    @slow
+    @require_torch
+    def test_perceiver(self):
+        # Perceiver is not tested by `run_pipeline_test` properly.
+        # That is because the type of feature_extractor and model preprocessor need to be kept
+        # in sync, which is not the case in the current design
+        image_classifier = pipeline("image-classification", model="deepmind/vision-perceiver-conv")
+        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.4385, "label": "tabby, tabby cat"},
+                {"score": 0.321, "label": "tiger cat"},
+                {"score": 0.0502, "label": "Egyptian cat"},
+                {"score": 0.0137, "label": "crib, cot"},
+                {"score": 0.007, "label": "radiator"},
+            ],
+        )
+
+        image_classifier = pipeline("image-classification", model="deepmind/vision-perceiver-fourier")
+        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.5658, "label": "tabby, tabby cat"},
+                {"score": 0.1309, "label": "tiger cat"},
+                {"score": 0.0722, "label": "Egyptian cat"},
+                {"score": 0.0707, "label": "remote control, remote"},
+                {"score": 0.0082, "label": "computer keyboard, keypad"},
+            ],
+        )
+
+        image_classifier = pipeline("image-classification", model="deepmind/vision-perceiver-learned")
+        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.3022, "label": "tabby, tabby cat"},
+                {"score": 0.2362, "label": "Egyptian cat"},
+                {"score": 0.1856, "label": "tiger cat"},
+                {"score": 0.0324, "label": "remote control, remote"},
+                {"score": 0.0096, "label": "quilt, comforter, comfort, puff"},
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_multilabel_classification(self):
+        small_model = "hf-internal-testing/tiny-random-vit"
+
+        # Sigmoid is applied for multi-label classification
+        image_classifier = pipeline("image-classification", model=small_model)
+        image_classifier.model.config.problem_type = "multi_label_classification"
+
+        outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}],
+        )
+
+        outputs = image_classifier(
+            [
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+            ]
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}],
+                [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}],
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_function_to_apply(self):
+        small_model = "hf-internal-testing/tiny-random-vit"
+
+        # Sigmoid is applied for multi-label classification
+        image_classifier = pipeline("image-classification", model=small_model)
+
+        outputs = image_classifier(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            function_to_apply="sigmoid",
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}],
+        )
--- a/tests/pipelines/test_pipelines_image_feature_extraction.py
+++ b/tests/pipelines/test_pipelines_image_feature_extraction.py
@@ -0,0 +1,139 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import pytest
+
+from transformers import (
+    MODEL_MAPPING,
+    TOKENIZER_MAPPING,
+    ImageFeatureExtractionPipeline,
+    is_torch_available,
+    is_vision_available,
+    pipeline,
+)
+from transformers.testing_utils import is_pipeline_test, nested_simplify, require_torch
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@is_pipeline_test
+class ImageFeatureExtractionPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_MAPPING
+
+    @require_torch
+    def test_small_model_pt(self):
+        # Pin the simplified values found at `outputs[0][0]` when the tiny random ViT
+        # checkpoint runs on the sample image.
+        feature_extractor = pipeline(task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit")
+        img = prepare_img()
+        outputs = feature_extractor(img)
+        self.assertEqual(
+            nested_simplify(outputs[0][0]),
+            [-1.417, -0.392, -1.264, -1.196, 1.648, 0.885, 0.56, -0.606, -1.175, 0.823, 1.912, 0.081, -0.053, 1.119, -0.062, -1.757, -0.571, 0.075, 0.959, 0.118, 1.201, -0.672, -0.498, 0.364, 0.937, -1.623, 0.228, 0.19, 1.697, -1.115, 0.583, -0.981])  # fmt: skip
+
+    @require_torch
+    def test_small_model_w_pooler_pt(self):
+        # Same check as for the plain tiny ViT, but on the "-w-pooler" checkpoint with
+        # `pool=True`; the pinned values are read from `outputs[0]`.
+        feature_extractor = pipeline(
+            task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit-w-pooler"
+        )
+        img = prepare_img()
+        outputs = feature_extractor(img, pool=True)
+        self.assertEqual(
+            nested_simplify(outputs[0]),
+            [-0.056,  0.083,  0.021,  0.038,  0.242, -0.279, -0.033, -0.003, 0.200, -0.192,  0.045, -0.095, -0.077,  0.017, -0.058, -0.063, -0.029, -0.204,  0.014,  0.042,  0.305, -0.205, -0.099,  0.146, -0.287,  0.020,  0.168, -0.052,  0.046,  0.048, -0.156,  0.093])  # fmt: skip
+
+    @require_torch
+    def test_image_processing_small_model_pt(self):
+        feature_extractor = pipeline(task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit")
+
+        # test with image processor parameters
+        image_processor_kwargs = {"size": {"height": 300, "width": 300}}
+        img = prepare_img()
+        with pytest.raises(ValueError):
+            # Image doesn't match model input size
+            feature_extractor(img, image_processor_kwargs=image_processor_kwargs)
+
+        image_processor_kwargs = {"image_mean": [0, 0, 0], "image_std": [1, 1, 1]}
+        img = prepare_img()
+        outputs = feature_extractor(img, image_processor_kwargs=image_processor_kwargs)
+        self.assertEqual(np.squeeze(outputs).shape, (226, 32))
+
+        # Test pooling option
+        outputs = feature_extractor(img, pool=True)
+        self.assertEqual(np.squeeze(outputs).shape, (32,))
+
+    @require_torch
+    def test_return_tensors_pt(self):
+        feature_extractor = pipeline(task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit")
+        img = prepare_img()
+        outputs = feature_extractor(img, return_tensors=True)
+        self.assertTrue(torch.is_tensor(outputs))
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        # Build the pipeline under test plus two references to the sample image.
+        # The test is skipped when no image processor is given, when the model's config type
+        # is present in TOKENIZER_MAPPING, or when the model is an encoder-decoder.
+        if image_processor is None:
+            self.skipTest(reason="No image processor")
+
+        elif type(model.config) in TOKENIZER_MAPPING:
+            self.skipTest(
+                reason="This is a bimodal model, we need to find a more consistent way to switch on those models."
+            )
+
+        elif model.config.is_encoder_decoder:
+            self.skipTest(
+                """encoder_decoder models are trickier for this pipeline.
+                Do we want encoder + decoder inputs to get some features?
+                Do we want encoder only features ?
+                For now ignore those.
+                """
+            )
+
+        feature_extractor_pipeline = ImageFeatureExtractionPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        img = prepare_img()
+        # The same image object is returned twice so the batched call gets two inputs.
+        return feature_extractor_pipeline, [img, img]
+
+    def run_pipeline_test(self, feature_extractor, examples):
+        imgs = examples
+        outputs = feature_extractor(imgs[0])
+
+        self.assertEqual(len(outputs), 1)
+
+        outputs = feature_extractor(imgs)
+        self.assertEqual(len(outputs), 2)
--- a/tests/pipelines/test_pipelines_image_segmentation.py
+++ b/tests/pipelines/test_pipelines_image_segmentation.py
@@ -0,0 +1,763 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import tempfile
+import unittest
+
+import datasets
+import httpx
+import numpy as np
+from datasets import load_dataset
+from huggingface_hub import ImageSegmentationOutputElement
+from huggingface_hub.utils import insecure_hashlib
+
+from transformers import (
+    MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+    MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
+    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
+    AutoImageProcessor,
+    AutoModelForImageSegmentation,
+    AutoModelForInstanceSegmentation,
+    DetrForSegmentation,
+    ImageSegmentationPipeline,
+    MaskFormerForInstanceSegmentation,
+    is_vision_available,
+    pipeline,
+)
+from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
+    is_pipeline_test,
+    nested_simplify,
+    require_timm,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    # Stand-in that keeps the name `Image` defined when the vision dependencies are not
+    # available. Its `open` accepts any arguments and returns None.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+def hashimage(image: Image) -> str:
+    m = insecure_hashlib.md5(image.tobytes())
+    return m.hexdigest()[:10]
+
+
+def mask_to_test_readable(mask: Image) -> dict:
+    npimg = np.array(mask)
+    white_pixels = (npimg == 255).sum()
+    shape = npimg.shape
+    return {"hash": hashimage(mask), "white_pixels": white_pixels, "shape": shape}
+
+
+def mask_to_test_readable_only_shape(mask: Image) -> dict:
+    npimg = np.array(mask)
+    shape = npimg.shape
+    return {"shape": shape}
+
+
+@is_pipeline_test
+@require_vision
+@require_timm
+@require_torch
+class ImageSegmentationPipelineTests(unittest.TestCase):
+    model_mapping = dict(
+        (list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else [])
+        + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
+        + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else [])
+    )
+    _dataset = None
+
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            # we use revision="refs/pr/1" until the PR is merged
+            # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+            cls._dataset = datasets.load_dataset(
+                "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1"
+            )
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        image_segmenter = ImageSegmentationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        return image_segmenter, [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+
+    def run_pipeline_test(self, image_segmenter, examples):
+        # Check the output structure of the segmentation pipeline for a local image path, for
+        # three dataset images (RGBA, LA, L) and for a batch of five identical images.
+        # All thresholds are set to 0 in every call.
+        self._load_dataset()
+        outputs = image_segmenter(
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            threshold=0.0,
+            mask_threshold=0,
+            overlap_mask_area_threshold=0,
+        )
+        self.assertIsInstance(outputs, list)
+        # `n` is reused further down as the expected number of segments per image in the batch.
+        n = len(outputs)
+        if isinstance(image_segmenter.model, (MaskFormerForInstanceSegmentation, DetrForSegmentation)):
+            # Instance segmentation (maskformer, and detr) have a slot for null class
+            # and can output nothing even with a low threshold
+            self.assertGreaterEqual(n, 0)
+        else:
+            self.assertGreaterEqual(n, 1)
+        # XXX: PIL.Image implements __eq__ which bypasses ANY, so we inverse the comparison
+        # to make it work
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)
+
+        # RGBA
+        outputs = image_segmenter(
+            self._dataset[0]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0
+        )
+        # For the dataset images only the structure of each entry is checked, not the count.
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # LA
+        outputs = image_segmenter(
+            self._dataset[1]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0
+        )
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+        # L
+        outputs = image_segmenter(
+            self._dataset[2]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0
+        )
+        m = len(outputs)
+        self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
+
+        if isinstance(image_segmenter.model, DetrForSegmentation):
+            # We need to test batch_size with images with the same size.
+            # Detr doesn't normalize the size of the images, meaning we can have
+            # 800x800 or 800x1200, meaning we cannot batch simply.
+            # We simply bail on this
+            batch_size = 1
+        else:
+            batch_size = 2
+
+        # 5 times the same image so the output shape is predictable
+        batch = [
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        outputs = image_segmenter(
+            batch,
+            threshold=0.0,
+            mask_threshold=0,
+            overlap_mask_area_threshold=0,
+            batch_size=batch_size,
+        )
+        self.assertEqual(len(batch), len(outputs))
+        self.assertEqual(len(outputs[0]), n)
+        # Every image in the batch must yield the same `n` segments as the single-image call
+        # above; the third argument is the message shown when the comparison fails.
+        self.assertEqual(
+            [
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+                [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
+            ],
+            outputs,
+            f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
+        )
+
+        # Validate each returned segment against the Hub output spec.
+        for single_output in outputs:
+            for output_element in single_output:
+                compare_pipeline_output_to_hub_spec(output_element, ImageSegmentationOutputElement)
+
+    @require_torch
+    def test_small_model_pt_no_panoptic(self):
+        model_id = "hf-internal-testing/tiny-random-mobilevit"
+        # The default task is `image-classification` we need to override
+        pipe = pipeline(task="image-segmentation", model=model_id)
+
+        # This model does NOT support neither `instance` nor  `panoptic`
+        # We should error out
+        with self.assertRaises(ValueError) as e:
+            pipe("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="panoptic")
+        self.assertEqual(
+            str(e.exception),
+            "Subtask panoptic is not supported for model <class"
+            " 'transformers.models.mobilevit.modeling_mobilevit.MobileViTForSemanticSegmentation'>",
+        )
+        with self.assertRaises(ValueError) as e:
+            pipe("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="instance")
+        self.assertEqual(
+            str(e.exception),
+            "Subtask instance is not supported for model <class"
+            " 'transformers.models.mobilevit.modeling_mobilevit.MobileViTForSemanticSegmentation'>",
+        )
+
+    @require_torch
+    def test_small_model_pt(self):
+        # Run the tiny DETR panoptic checkpoint with all thresholds at 0 and pin its output for
+        # the default (panoptic) subtask, for a batch of two images, and for the `instance` and
+        # `semantic` subtasks.
+        model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic"
+
+        model = AutoModelForImageSegmentation.from_pretrained(model_id)
+        image_processor = AutoImageProcessor.from_pretrained(model_id)
+        image_segmenter = ImageSegmentationPipeline(
+            model=model,
+            image_processor=image_processor,
+            subtask="panoptic",
+            threshold=0.0,
+            mask_threshold=0.0,
+            overlap_mask_area_threshold=0.0,
+        )
+
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+        )
+
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        # This is extremely brittle, and those values are made specific for the CI.
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": 0.004,
+                    "label": "LABEL_215",
+                    "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200},
+                },
+            ],
+        )
+
+        # Same image twice: each batch entry must match the single-image result above.
+        outputs = image_segmenter(
+            [
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+            ],
+        )
+        for output in outputs:
+            for o in output:
+                o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [
+                    {
+                        "score": 0.004,
+                        "label": "LABEL_215",
+                        "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200},
+                    },
+                ],
+                [
+                    {
+                        "score": 0.004,
+                        "label": "LABEL_215",
+                        "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200},
+                    },
+                ],
+            ],
+        )
+
+        # The `instance` subtask is pinned to the same single segment as `panoptic`.
+        output = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="instance")
+        for o in output:
+            o["mask"] = mask_to_test_readable(o["mask"])
+        self.assertEqual(
+            nested_simplify(output, decimals=4),
+            [
+                {
+                    "score": 0.004,
+                    "label": "LABEL_215",
+                    "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200},
+                },
+            ],
+        )
+
+        # This must be surprising to the reader.
+        # The `panoptic` returns only LABEL_215, and this returns 3 labels.
+        #
+        output = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="semantic")
+
+        # Keep references to the PIL masks before they are replaced by summaries further down.
+        output_masks = [o["mask"] for o in output]
+
+        # page links (to visualize)
+        expected_masks = [
+            "https://huggingface.co/datasets/hf-internal-testing/mask-for-image-segmentation-tests/blob/main/mask_0.png",
+            "https://huggingface.co/datasets/hf-internal-testing/mask-for-image-segmentation-tests/blob/main/mask_1.png",
+            "https://huggingface.co/datasets/hf-internal-testing/mask-for-image-segmentation-tests/blob/main/mask_2.png",
+        ]
+        # actual links to get files
+        expected_masks = [x.replace("/blob/", "/resolve/") for x in expected_masks]
+        expected_masks = [
+            Image.open(io.BytesIO(httpx.get(image, follow_redirects=True).content)) for image in expected_masks
+        ]
+
+        # Convert masks to numpy array
+        output_masks = [np.array(x) for x in output_masks]
+        expected_masks = [np.array(x) for x in expected_masks]
+
+        self.assertEqual(output_masks[0].shape, expected_masks[0].shape)
+        self.assertEqual(output_masks[1].shape, expected_masks[1].shape)
+        self.assertEqual(output_masks[2].shape, expected_masks[2].shape)
+
+        # With un-trained tiny random models, the output `logits` tensor is very likely to contain many values
+        # close to each other, which cause `argmax` to give quite different results when running the test on 2
+        # environments. We use a lower threshold `0.9` here to avoid flakiness.
+        self.assertGreaterEqual(np.mean(output_masks[0] == expected_masks[0]), 0.9)
+        self.assertGreaterEqual(np.mean(output_masks[1] == expected_masks[1]), 0.9)
+        self.assertGreaterEqual(np.mean(output_masks[2] == expected_masks[2]), 0.9)
+
+        # Only the mask shapes are compared exactly for the semantic subtask.
+        for o in output:
+            o["mask"] = mask_to_test_readable_only_shape(o["mask"])
+        # Lift unittest's diff size limit so a failure prints the full comparison.
+        self.maxDiff = None
+        self.assertEqual(
+            nested_simplify(output, decimals=4),
+            [
+                {
+                    "label": "LABEL_88",
+                    "mask": {"shape": (480, 640)},
+                    "score": None,
+                },
+                {
+                    "label": "LABEL_101",
+                    "mask": {"shape": (480, 640)},
+                    "score": None,
+                },
+                {
+                    "label": "LABEL_215",
+                    "mask": {"shape": (480, 640)},
+                    "score": None,
+                },
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt_semantic(self):
+        model_id = "hf-internal-testing/tiny-random-beit-pipeline"
+        image_segmenter = pipeline(model=model_id)
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        for o in outputs:
+            # shortening by hashing
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": None,
+                    "label": "LABEL_0",
+                    "mask": {"hash": "42d0907228", "shape": (480, 640), "white_pixels": 10714},
+                },
+                {
+                    "score": None,
+                    "label": "LABEL_1",
+                    "mask": {"hash": "46b8cc3976", "shape": (480, 640), "white_pixels": 296486},
+                },
+            ],
+        )
+
+    @require_torch
+    @slow
+    def test_integration_torch_image_segmentation(self):
+        model_id = "facebook/detr-resnet-50-panoptic"
+        image_segmenter = pipeline(
+            "image-segmentation",
+            model=model_id,
+            threshold=0.0,
+            overlap_mask_area_threshold=0.0,
+        )
+
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+        )
+
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": 0.9094,
+                    "label": "blanket",
+                    "mask": {"hash": "dcff19a97a", "shape": (480, 640), "white_pixels": 16617},
+                },
+                {
+                    "score": 0.9941,
+                    "label": "cat",
+                    "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185},
+                },
+                {
+                    "score": 0.9987,
+                    "label": "remote",
+                    "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182},
+                },
+                {
+                    "score": 0.9995,
+                    "label": "remote",
+                    "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275},
+                },
+                {
+                    "score": 0.9722,
+                    "label": "couch",
+                    "mask": {"hash": "37b8446ac5", "shape": (480, 640), "white_pixels": 172380},
+                },
+                {
+                    "score": 0.9994,
+                    "label": "cat",
+                    "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561},
+                },
+            ],
+        )
+
+        outputs = image_segmenter(
+            [
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+            ],
+        )
+
+        # Shortening by hashing
+        for output in outputs:
+            for o in output:
+                o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [
+                    {
+                        "score": 0.9094,
+                        "label": "blanket",
+                        "mask": {"hash": "dcff19a97a", "shape": (480, 640), "white_pixels": 16617},
+                    },
+                    {
+                        "score": 0.9941,
+                        "label": "cat",
+                        "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185},
+                    },
+                    {
+                        "score": 0.9987,
+                        "label": "remote",
+                        "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182},
+                    },
+                    {
+                        "score": 0.9995,
+                        "label": "remote",
+                        "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275},
+                    },
+                    {
+                        "score": 0.9722,
+                        "label": "couch",
+                        "mask": {"hash": "37b8446ac5", "shape": (480, 640), "white_pixels": 172380},
+                    },
+                    {
+                        "score": 0.9994,
+                        "label": "cat",
+                        "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561},
+                    },
+                ],
+                [
+                    {
+                        "score": 0.9094,
+                        "label": "blanket",
+                        "mask": {"hash": "dcff19a97a", "shape": (480, 640), "white_pixels": 16617},
+                    },
+                    {
+                        "score": 0.9941,
+                        "label": "cat",
+                        "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185},
+                    },
+                    {
+                        "score": 0.9987,
+                        "label": "remote",
+                        "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182},
+                    },
+                    {
+                        "score": 0.9995,
+                        "label": "remote",
+                        "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275},
+                    },
+                    {
+                        "score": 0.9722,
+                        "label": "couch",
+                        "mask": {"hash": "37b8446ac5", "shape": (480, 640), "white_pixels": 172380},
+                    },
+                    {
+                        "score": 0.9994,
+                        "label": "cat",
+                        "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561},
+                    },
+                ],
+            ],
+        )
+
+    @require_torch
+    @slow
+    def test_threshold(self):
+        model_id = "facebook/detr-resnet-50-panoptic"
+        image_segmenter = pipeline("image-segmentation", model=model_id)
+
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.999)
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": 0.9995,
+                    "label": "remote",
+                    "mask": {"hash": "d02404f578", "shape": (480, 640), "white_pixels": 2789},
+                },
+                {
+                    "score": 0.9994,
+                    "label": "cat",
+                    "mask": {"hash": "eaa115b40c", "shape": (480, 640), "white_pixels": 304411},
+                },
+            ],
+        )
+
+        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.5)
+
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": 0.9941,
+                    "label": "cat",
+                    "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185},
+                },
+                {
+                    "score": 0.9987,
+                    "label": "remote",
+                    "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182},
+                },
+                {
+                    "score": 0.9995,
+                    "label": "remote",
+                    "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275},
+                },
+                {
+                    "score": 0.9722,
+                    "label": "couch",
+                    "mask": {"hash": "37b8446ac5", "shape": (480, 640), "white_pixels": 172380},
+                },
+                {
+                    "score": 0.9994,
+                    "label": "cat",
+                    "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561},
+                },
+            ],
+        )
+
+    @require_torch
+    @slow
+    def test_maskformer(self):
+        threshold = 0.8
+        model_id = "facebook/maskformer-swin-base-ade"
+
+        model = AutoModelForInstanceSegmentation.from_pretrained(model_id)
+        image_processor = AutoImageProcessor.from_pretrained(model_id)
+
+        image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
+
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = ds[0]["image"].convert("RGB")
+        outputs = image_segmenter(image, threshold=threshold)
+
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": 0.9974,
+                    "label": "wall",
+                    "mask": {"hash": "a547b7c062", "shape": (512, 683), "white_pixels": 14252},
+                },
+                {
+                    "score": 0.949,
+                    "label": "house",
+                    "mask": {"hash": "0da9b7b38f", "shape": (512, 683), "white_pixels": 132177},
+                },
+                {
+                    "score": 0.9995,
+                    "label": "grass",
+                    "mask": {"hash": "1d07ea0a26", "shape": (512, 683), "white_pixels": 53444},
+                },
+                {
+                    "score": 0.9976,
+                    "label": "tree",
+                    "mask": {"hash": "6cdc97c7da", "shape": (512, 683), "white_pixels": 7944},
+                },
+                {
+                    "score": 0.8239,
+                    "label": "plant",
+                    "mask": {"hash": "1ab4ce378f", "shape": (512, 683), "white_pixels": 4136},
+                },
+                {
+                    "score": 0.9942,
+                    "label": "road, route",
+                    "mask": {"hash": "39c5d17be5", "shape": (512, 683), "white_pixels": 1941},
+                },
+                {
+                    "score": 1.0,
+                    "label": "sky",
+                    "mask": {"hash": "a3756324a6", "shape": (512, 683), "white_pixels": 135802},
+                },
+            ],
+        )
+
+    @require_torch
+    @slow
+    def test_oneformer(self):
+        image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")
+
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = ds[0]["image"].convert("RGB")
+        outputs = image_segmenter(image, threshold=0.99)
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": 0.9981,
+                    "label": "grass",
+                    "mask": {"hash": "3a92904d4c", "white_pixels": 118131, "shape": (512, 683)},
+                },
+                {
+                    "score": 0.9992,
+                    "label": "sky",
+                    "mask": {"hash": "fa2300cc9a", "white_pixels": 231565, "shape": (512, 683)},
+                },
+            ],
+        )
+
+        # Different task
+        outputs = image_segmenter(image, threshold=0.99, subtask="instance")
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": 0.9991,
+                    "label": "sky",
+                    "mask": {"hash": "8b1ffad016", "white_pixels": 230566, "shape": (512, 683)},
+                },
+                {
+                    "score": 0.9981,
+                    "label": "grass",
+                    "mask": {"hash": "9bbdf83d3d", "white_pixels": 119130, "shape": (512, 683)},
+                },
+            ],
+        )
+
+        # Different task
+        outputs = image_segmenter(image, subtask="semantic")
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = mask_to_test_readable(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {
+                    "score": None,
+                    "label": "wall",
+                    "mask": {"hash": "897fb20b7f", "white_pixels": 14506, "shape": (512, 683)},
+                },
+                {
+                    "score": None,
+                    "label": "building",
+                    "mask": {"hash": "f2a68c63e4", "white_pixels": 125019, "shape": (512, 683)},
+                },
+                {
+                    "score": None,
+                    "label": "sky",
+                    "mask": {"hash": "e0ca3a548e", "white_pixels": 135330, "shape": (512, 683)},
+                },
+                {
+                    "score": None,
+                    "label": "tree",
+                    "mask": {"hash": "7c9544bcac", "white_pixels": 16263, "shape": (512, 683)},
+                },
+                {
+                    "score": None,
+                    "label": "road, route",
+                    "mask": {"hash": "2c7704e491", "white_pixels": 2143, "shape": (512, 683)},
+                },
+                {
+                    "score": None,
+                    "label": "grass",
+                    "mask": {"hash": "bf6c2867e0", "white_pixels": 53040, "shape": (512, 683)},
+                },
+                {
+                    "score": None,
+                    "label": "plant",
+                    "mask": {"hash": "93c4b7199e", "white_pixels": 3335, "shape": (512, 683)},
+                },
+                {
+                    "score": None,
+                    "label": "house",
+                    "mask": {"hash": "93ec419ad5", "white_pixels": 60, "shape": (512, 683)},
+                },
+            ],
+        )
+
+    def test_save_load(self):
+        model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic"
+
+        model = AutoModelForImageSegmentation.from_pretrained(model_id)
+        image_processor = AutoImageProcessor.from_pretrained(model_id)
+        image_segmenter = pipeline(
+            task="image-segmentation",
+            model=model,
+            image_processor=image_processor,
+        )
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            image_segmenter.save_pretrained(tmpdirname)
+            pipeline(task="image-segmentation", model=tmpdirname)
--- a/tests/pipelines/test_pipelines_image_text_to_text.py
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -0,0 +1,506 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import unittest
+
+from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available
+from transformers.pipelines import ImageTextToTextPipeline, pipeline
+from transformers.testing_utils import (
+    Expectations,
+    is_pipeline_test,
+    require_deterministic_for_xpu,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+# Bind the module-level name `Image` in both cases: to `PIL.Image` when vision support is available,
+# otherwise to a minimal stand-in exposing the same `open` entry point.
+if is_vision_available():
+    from PIL import Image
+else:
+
+    class Image:
+        """Stand-in for `PIL.Image` that is defined only when `is_vision_available()` returns false."""
+
+        @staticmethod
+        def open(*args, **kwargs):
+            # Accepts any arguments, reads nothing and returns None.
+            pass
+
+
+@is_pipeline_test
+@require_vision
+class ImageTextToTextPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+    def get_test_pipeline(self, model, tokenizer, processor, image_processor, dtype="float32"):
+        pipe = ImageTextToTextPipeline(model=model, processor=processor, dtype=dtype, max_new_tokens=10)
+        image_token = getattr(processor.tokenizer, "image_token", "")
+        examples = [
+            {
+                "images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "text": f"{image_token}This is a ",
+            },
+            {
+                "images": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+                "text": f"{image_token}Here I see a ",
+            },
+        ]
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        outputs = pipe(examples[0].get("images"), text=examples[0].get("text"))
+        self.assertEqual(
+            outputs,
+            [
+                {"input_text": ANY(str), "generated_text": ANY(str)},
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt_token_text_only(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        text = "What is the capital of France? Assistant:"
+
+        outputs = pipe(text=text)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "What is the capital of France? Assistant:",
+                    "generated_text": "What is the capital of France? Assistant: The capital of France is Paris.",
+                }
+            ],
+        )
+
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Write a poem on Hugging Face, the company"},
+                    ],
+                },
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is the capital of France?"},
+                    ],
+                },
+            ],
+        ]
+        outputs = pipe(text=messages)
+        EXPECTED_CONTENT = Expectations(
+            {
+                (
+                    "cuda",
+                    8,
+                ): "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and",
+                (
+                    "rocm",
+                    (9, 4),
+                ): "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they do it all\n\nThey help us to create and share\nContent that is both true and true\nAnd make the world a better place\nWith their tools and services, we can do it all\n\nFrom image and video to text and speech\nThey make it all possible\nWith their tools and services, we can do it all\nAnd make the world a better place\n\nSo let us embrace and use\nHugging Face's tools and services\nTo create and share\nContent that is true and true\nAnd make the world a better place.",
+                (
+                    "xpu",
+                    3,
+                ): "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they do it all\n\nThey help us to create and share\nContent that's both engaging and informative\nWith their tools, we can write\nAnd create stories that are both true and true\n\nThey help us to analyze\nAnd make sense of data that's hard to see\nWith their tools, we can see\nAnd make sense of the world around us\n\nThey help us to create\nAnd share content that's both true and true\nWith their tools, we can see\nAnd make sense of the world around us\n\nSo here's to Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they do it all\n\nThank you, Hugging Face, for all you do\nWith tools and services that make our lives easier\nSo here's to you, and all the great things you do",
+            }
+        ).get_expectation()
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {
+                        "input_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            }
+                        ],
+                        "generated_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            },
+                            {
+                                "role": "assistant",
+                                "content": EXPECTED_CONTENT,
+                            },
+                        ],
+                    }
+                ],
+                [
+                    {
+                        "input_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "What is the capital of France?"}],
+                            }
+                        ],
+                        "generated_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "What is the capital of France?"}],
+                            },
+                            {"role": "assistant", "content": "Paris"},
+                        ],
+                    }
+                ],
+            ],
+        )
+
+    @require_torch
+    @require_deterministic_for_xpu
+    def test_small_model_pt_token(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        text = "<image> What this is? Assistant: This is"
+
+        outputs = pipe(image, text=text)
+        EXPECTED_CONTENT = Expectations(
+            {
+                (
+                    "cuda",
+                    8,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable. The photo captures a moment of tranquility and companionship between the two feline friends.",
+                (
+                    "rocm",
+                    (9, 4),
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the cats are positioned in such a way that they are facing the camera. The image captures a peaceful moment between the two cats, and it's a great way to showcase their cuteness and relaxed demeanor.",
+                (
+                    "xpu",
+                    3,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a surface that looks like a couch or a chair, and it is covered with a soft fabric. The cats' fur is a mix of black, white, and brown, and they have a variety of patterns on their bodies. The image captures a moment of tranquility and companionship between the cats.",
+            }
+        ).get_expectation()
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": EXPECTED_CONTENT,
+                }
+            ],
+        )
+
+        outputs = pipe([image, image], text=[text, text])
+        EXPECTED_CONTENT = Expectations(
+            {
+                (
+                    "cuda",
+                    8,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the cats are positioned in such a way that they are facing the camera. The image captures a peaceful moment between the two cats, and it's a great way to showcase their cuteness and relaxed demeanor.",
+                (
+                    "rocm",
+                    (9, 4),
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the overall setting is cozy and comfortable.",
+                (
+                    "xpu",
+                    3,
+                ): "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a surface that looks like a couch or a chair, and it is covered with a soft fabric. The cats' fur is a mix of black, white, and brown, and they have a variety of patterns on their bodies. The image captures a moment of tranquility and companionship between the cats.",
+            }
+        ).get_expectation()
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": EXPECTED_CONTENT,
+                },
+                {
+                    "input_text": "<image> What this is? Assistant: This is",
+                    "generated_text": EXPECTED_CONTENT,
+                },
+            ],
+        )
+
+    @require_torch
+    def test_consistent_batching_behaviour(self):
+        pipe = pipeline("image-text-to-text", model="microsoft/kosmos-2-patch14-224")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        prompt = "a photo of"
+
+        outputs = pipe([image, image], text=[prompt, prompt], max_new_tokens=10)
+        outputs_batched = pipe([image, image], text=[prompt, prompt], batch_size=2, max_new_tokens=10)
+        self.assertEqual(outputs, outputs_batched)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_with_response_parsing(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's the difference between these two images?"},
+                    {
+                        "type": "image",
+                        "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                    },
+                    {
+                        "type": "image",
+                        "url": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                    },
+                ],
+            }
+        ]
+        pipe.tokenizer.response_schema = {
+            # A real response schema should probably have things like "role" and "content"
+            # and "reasoning_content" but it's unlikely we'd get a tiny model to reliably
+            # output anything like that, so let's keep it simple.
+            "type": "object",
+            "properties": {
+                "first_word": {"type": "string", "x-regex": r"^\s*([a-zA-Z]+)"},
+                "last_word": {"type": "string", "x-regex": r"([a-zA-Z]+)\s*$"},
+            },
+        }
+        outputs = pipe(text=messages, do_sample=False, max_new_tokens=10)
+        parsed_message = outputs[0]["generated_text"][-1]
+        # The parsed message should be a dict with the schema keys, not {"role": "assistant", "content": ...}
+        self.assertIn("first_word", parsed_message)
+        self.assertIn("last_word", parsed_message)
+        self.assertNotIn("role", parsed_message)
+        self.assertIsInstance(parsed_message["first_word"], str)
+        self.assertIsInstance(parsed_message["last_word"], str)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image_ny = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        image_chicago = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between these two images?"},
+                    {"type": "image"},
+                    {"type": "image"},
+                ],
+            }
+        ]
+        # Deprecated behavior should raise an error after v5
+        with self.assertRaises(ValueError):
+            outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=True, max_new_tokens=10)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between these two images?"},
+                    {"type": "image", "url": image_ny},
+                    {"type": "image", "url": image_chicago},
+                ],
+            }
+        ]
+        outputs = pipe(text=messages, return_full_text=True, max_new_tokens=10)
+        EXPECTED_CONTENT = Expectations(
+            {
+                ("rocm", (9, 4)): "The first image shows a statue of the Statue of",
+                ("cuda", 8): "The first image shows a statue of Liberty in the",
+                ("xpu", 3): "The first image shows a statue of Liberty in the",
+            }
+        ).get_expectation()
+
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
+                            ],
+                        }
+                    ],
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "url": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": EXPECTED_CONTENT,
+                        },
+                    ],
+                }
+            ],
+        )
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_continue_final_message(self):
+        # The chat ends on a partial assistant turn; the expected output keeps that prefix and extends it
+        # inside the same assistant message instead of adding a new one.
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        assistant_prefix = "There is a dog and"
+
+        def user_turn():
+            # A fresh dict on every call, so the expected value never aliases the pipeline input.
+            return {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image_url},
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+
+        def assistant_turn(text):
+            return {"role": "assistant", "content": [{"type": "text", "text": text}]}
+
+        generator = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        result = generator(text=[user_turn(), assistant_turn(assistant_prefix)], max_new_tokens=10)
+
+        expected = [
+            {
+                "input_text": [user_turn(), assistant_turn(assistant_prefix)],
+                "generated_text": [
+                    user_turn(),
+                    assistant_turn("There is a dog and a person in the image. The dog is sitting"),
+                ],
+            }
+        ]
+        self.assertEqual(result, expected)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_new_text(self):
+        # With `return_full_text=False` the expected `generated_text` is a plain string, while `input_text`
+        # still echoes the chat that was passed in.
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+
+        def conversation():
+            # Rebuilt on every call so the input and the expected value are independent objects.
+            return [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image_url},
+                        {"type": "text", "text": "Describe this image."},
+                    ],
+                }
+            ]
+
+        generator = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        result = generator(text=conversation(), return_full_text=False, max_new_tokens=10)
+
+        expected_entry = {
+            "input_text": conversation(),
+            "generated_text": "In the image, a woman is sitting on the",
+        }
+        self.assertEqual(result, [expected_entry])
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_image_url(self):
+        # The image is supplied as an `image_url` content entry, with the address nested under `image_url.url`.
+        generator = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+
+        statue_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        image_part = {"type": "image_url", "image_url": {"url": statue_url}}
+        text_part = {"type": "text", "text": "Describe this image in one sentence."}
+        chat = [{"role": "user", "content": [image_part, text_part]}]
+
+        first_result = generator(text=chat, return_full_text=False, max_new_tokens=10)[0]
+        self.assertEqual(first_result["generated_text"], "A statue of liberty in the foreground of a city")
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template_image_url_base64(self):
+        # The image is supplied inline: the fixture bytes are base64-encoded and embedded in a `data:` URI
+        # that is used as the `image_url.url` value.
+        fixture_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        with open(fixture_path, "rb") as fixture:
+            raw_bytes = fixture.read()
+        encoded = base64.b64encode(raw_bytes).decode("utf-8")
+        # NOTE(review): the fixture is a `.png` file while the URI declares `image/jpeg` -- presumably the
+        # declared MIME type is not checked when the image is decoded; confirm before changing either side.
+        data_uri = f"data:image/jpeg;base64,{encoded}"
+
+        generator = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+        chat = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": data_uri}},
+                    {"type": "text", "text": "Describe this image in one sentence."},
+                ],
+            }
+        ]
+        generated = generator(text=chat, return_full_text=False, max_new_tokens=10)[0]["generated_text"]
+        self.assertEqual(generated, "Two cats are sleeping on a pink blanket, with")
--- a/tests/pipelines/test_pipelines_keypoint_matching.py
+++ b/tests/pipelines/test_pipelines_keypoint_matching.py
@@ -0,0 +1,193 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import datasets
+
+from transformers.models.auto.modeling_auto import MODEL_FOR_KEYPOINT_MATCHING_MAPPING
+from transformers.pipelines import KeypointMatchingPipeline, pipeline
+from transformers.testing_utils import (
+    is_pipeline_test,
+    is_vision_available,
+    require_torch,
+    require_vision,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+@is_pipeline_test
+@require_torch
+@require_vision
+class KeypointMatchingPipelineTests(unittest.TestCase):
+    # Tests for `KeypointMatchingPipeline`: output-schema checks on whatever model the harness provides,
+    # plus reference matches for the "magic-leap-community/superglue_outdoor" checkpoint.
+    model_mapping = MODEL_FOR_KEYPOINT_MATCHING_MAPPING
+    _dataset = None
+
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls._dataset is None:
+            cls._dataset = datasets.load_dataset("hf-internal-testing/image-matching-dataset", split="train")
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        torch_dtype="float32",
+        dtype=None,
+    ):
+        # Build a `KeypointMatchingPipeline` from the given components and return it together with one
+        # example image pair (a local PIL image and a remote URL).
+        #
+        # `dtype` is the keyword the other pipeline tests use for this hook. `torch_dtype` is kept so
+        # existing callers keep working; it is only consulted when `dtype` is not given.
+        resolved_dtype = torch_dtype if dtype is None else dtype
+        image_matcher = KeypointMatchingPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=resolved_dtype,
+        )
+        examples = [
+            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+        ]
+        return image_matcher, examples
+
+    def run_pipeline_test(self, image_matcher, examples):
+        # Schema check only: every match must expose float keypoint coordinates for both images and a float
+        # score. The image pair is rebuilt inline here; `examples` is not read.
+        self._load_dataset()
+        outputs = image_matcher(
+            [
+                Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "http://images.cocodataset.org/val2017/000000039769.jpg",
+            ]
+        )
+
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "keypoint_image_0": {"x": ANY(float), "y": ANY(float)},
+                    "keypoint_image_1": {"x": ANY(float), "y": ANY(float)},
+                    "score": ANY(float),
+                }
+            ]
+            * 2,  # 2 matches per image pair
+        )
+
+        # Accepts URL + PIL.Image + lists
+        outputs = image_matcher(
+            [
+                [
+                    Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                    "http://images.cocodataset.org/val2017/000000039769.jpg",
+                ],
+                [self._dataset[0]["image"], self._dataset[1]["image"]],
+                [self._dataset[1]["image"], self._dataset[2]["image"]],
+                [self._dataset[2]["image"], self._dataset[0]["image"]],
+            ]
+        )
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {
+                        "keypoint_image_0": {"x": ANY(float), "y": ANY(float)},
+                        "keypoint_image_1": {"x": ANY(float), "y": ANY(float)},
+                        "score": ANY(float),
+                    }
+                ]
+                * 2  # 2 matches per image pair
+            ]
+            * 4,  # 4 image pairs
+        )
+
+    def _assert_match(self, match, keypoint_0, keypoint_1, score):
+        # Compare one match dict against reference values: `keypoint_0` / `keypoint_1` are (x, y) pairs
+        # checked to 1 decimal place, `score` is checked to 3 decimal places.
+        self.assertAlmostEqual(match["keypoint_image_0"]["x"], keypoint_0[0], places=1)
+        self.assertAlmostEqual(match["keypoint_image_0"]["y"], keypoint_0[1], places=1)
+        self.assertAlmostEqual(match["keypoint_image_1"]["x"], keypoint_1[0], places=1)
+        self.assertAlmostEqual(match["keypoint_image_1"]["y"], keypoint_1[1], places=1)
+        self.assertAlmostEqual(match["score"], score, places=3)
+
+    @require_torch
+    def test_single_image(self):
+        # A lone image, whether bare or wrapped in a one-element list, must be rejected with ValueError.
+        self._load_dataset()
+        small_model = "magic-leap-community/superglue_outdoor"
+        image_matcher = pipeline("keypoint-matching", model=small_model)
+
+        with self.assertRaises(ValueError):
+            image_matcher(
+                self._dataset[0]["image"],
+                threshold=0.0,
+            )
+        with self.assertRaises(ValueError):
+            image_matcher(
+                [self._dataset[0]["image"]],
+                threshold=0.0,
+            )
+
+    @require_torch
+    def test_single_pair(self):
+        # A single (image, image) tuple yields a flat list of matches; only the first match is checked.
+        self._load_dataset()
+        small_model = "magic-leap-community/superglue_outdoor"
+        image_matcher = pipeline("keypoint-matching", model=small_model)
+
+        image_0: Image.Image = self._dataset[0]["image"]
+        image_1: Image.Image = self._dataset[1]["image"]
+        outputs = image_matcher((image_0, image_1), threshold=0.0)
+
+        # first match from image pair
+        self._assert_match(outputs[0], (698, 469), (434, 440), 0.9905)
+
+    @require_torch
+    def test_multiple_pairs(self):
+        # A list of pairs yields one list of matches per pair; the first match of each pair is checked.
+        self._load_dataset()
+        small_model = "magic-leap-community/superglue_outdoor"
+        image_matcher = pipeline("keypoint-matching", model=small_model)
+
+        image_0: Image.Image = self._dataset[0]["image"]
+        image_1: Image.Image = self._dataset[1]["image"]
+        image_2: Image.Image = self._dataset[2]["image"]
+
+        outputs = image_matcher(
+            [
+                (image_0, image_1),
+                (image_1, image_2),
+                (image_2, image_0),
+            ],
+            threshold=1e-4,
+        )
+
+        # Test first pair (image_0, image_1)
+        self._assert_match(outputs[0][0], (698, 469), (434, 440), 0.9905)
+
+        # Test second pair (image_1, image_2)
+        self._assert_match(outputs[1][0], (272, 310), (228, 568), 0.9890)
+
+        # Test third pair (image_2, image_0)
+        self._assert_match(outputs[2][0], (385, 677), (689, 351), 0.9900)
--- a/tests/pipelines/test_pipelines_mask_generation.py
+++ b/tests/pipelines/test_pipelines_mask_generation.py
@@ -0,0 +1,186 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from huggingface_hub.utils import insecure_hashlib
+
+from transformers import (
+    MODEL_FOR_MASK_GENERATION_MAPPING,
+    is_torch_available,
+    is_vision_available,
+    pipeline,
+)
+from transformers.pipelines import MaskGenerationPipeline
+from transformers.testing_utils import (
+    Expectations,
+    is_pipeline_test,
+    nested_simplify,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+
+if is_torch_available():
+    from transformers import MODEL_FOR_MASK_GENERATION_MAPPING
+else:
+    MODEL_FOR_MASK_GENERATION_MAPPING = None
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    # Stand-in defined when `is_vision_available()` is false, so the module-level name `Image`
+    # (used below in annotations) still resolves.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            # Accepts any arguments and returns None.
+            pass
+
+
+def hashimage(image: Image) -> str:
+    # Short fingerprint of an image: the first 10 hex characters of the MD5 digest of `image.tobytes()`.
+    full_digest = insecure_hashlib.md5(image.tobytes()).hexdigest()
+    return full_digest[:10]
+
+
+def mask_to_test_readable(mask: Image) -> dict:
+    # Reduce a mask to something easy to compare in assertions: its short hash and the shape of its array form.
+    array_shape = np.array(mask).shape
+    return {"hash": hashimage(mask), "shape": array_shape}
+
+
+@is_pipeline_test
+@require_vision
+@require_torch
+class MaskGenerationPipelineTests(unittest.TestCase):
+    # Tests for the "mask-generation" pipeline. Reference masks are compared through a short hash and their
+    # array shape rather than raw pixels.
+    model_mapping = dict(MODEL_FOR_MASK_GENERATION_MAPPING.items()) if MODEL_FOR_MASK_GENERATION_MAPPING else {}
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        # Wrap the given components in a `MaskGenerationPipeline` and pair it with two copies of a local image path.
+        sample_image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        segmenter = MaskGenerationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        return segmenter, [sample_image, sample_image]
+
+    @unittest.skip(reason="TODO @Arthur: Implement me")
+    def run_pipeline_test(self, mask_generator, examples):
+        pass
+
+    def test_preprocess_is_last(self):
+        # Every batch yielded by `preprocess` except the final one must have a falsy `is_last`;
+        # the final one must have it truthy. Checked for two `points_per_batch` values.
+        mask_generator = pipeline("mask-generation", model="hf-internal-testing/tiny-random-SamModel")
+        mask_generator.image_processor.pad_size = {"height": 24, "width": 24}
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        for batch_size in (100, 64):
+            with self.subTest(points_per_batch=batch_size):
+                yielded = list(mask_generator.preprocess(image, points_per_batch=batch_size))
+                final_batch = yielded[-1]
+                earlier_batches = yielded[:-1]
+                self.assertTrue(final_batch["is_last"])
+                self.assertFalse(any(batch["is_last"] for batch in earlier_batches))
+
+    @slow
+    @require_torch
+    def test_small_model_pt(self):
+        # Run "facebook/sam-vit-huge" on a COCO image and compare every returned mask (hash + shape) and
+        # score, rounded to 4 decimals, with reference values. The last entry is looked up via `Expectations`.
+        mask_generator = pipeline("mask-generation", model="facebook/sam-vit-huge")
+
+        raw = mask_generator("http://images.cocodataset.org/val2017/000000039769.jpg", points_per_batch=256)
+
+        # Shortening by hashing
+        summarized = [
+            {"mask": mask_to_test_readable(mask), "scores": raw["scores"][index]}
+            for index, mask in enumerate(raw["masks"])
+        ]
+
+        # fmt: off
+        last_output = Expectations({
+            ("xpu", None): {'mask': {'hash': 'b5f47c9191', 'shape': (480, 640)}, 'scores': 0.8872},
+            ("cuda", None): {'mask': {'hash': 'b5f47c9191', 'shape': (480, 640)}, 'scores': 0.8871},
+            ("rocm", (9, 5)): {'mask': {'hash': 'b5f47c9191', 'shape': (480, 640)}, 'scores': 0.8872}
+        }).get_expectation()
+
+        simplified = nested_simplify(summarized, decimals=4)
+
+        # (hash, score) reference pairs; every mask has shape (480, 640).
+        reference_pairs = [
+            ("115ad19f5f", 1.0444),
+            ("6affa964c6", 1.021),
+            ("dfe28a0388", 1.0167),
+            ("c0a5f4a318", 1.0132),
+            ("fe8065c197", 1.0053),
+            ("e2d0b7a0b7", 0.9967),
+            ("453c7844bd", 0.993),
+            ("3d44f2926d", 0.9909),
+            ("64033ddc3f", 0.9879),
+            ("801064ff79", 0.9834),
+            ("6172f276ef", 0.9716),
+            ("b49e60e084", 0.9612),
+            ("a811e775fd", 0.9599),
+            ("a6a8ebcf4b", 0.9552),
+            ("9d8257e080", 0.9532),
+            ("32de6454a8", 0.9516),
+            ("af3d4af2c8", 0.9499),
+            ("3c6db475fb", 0.9483),
+            ("c290813fb9", 0.9464),
+            ("b6f0b8f606", 0.943),
+            ("92ce16bfdf", 0.943),
+            ("c749b25868", 0.9408),
+            ("efb6cab859", 0.9335),
+            ("1ff2eafb30", 0.9326),
+            ("788b798e24", 0.9262),
+            ("abea804f0e", 0.8999),
+            ("7b9e8ddb73", 0.8986),
+            ("cd24047c8a", 0.8984),
+            ("6943e6bcbd", 0.8873),
+        ]
+        expected = [
+            {"mask": {"hash": digest, "shape": (480, 640)}, "scores": score} for digest, score in reference_pairs
+        ]
+        expected.append(last_output)
+        # fmt: on
+
+        self.assertEqual(simplified, expected)
+
+    @require_torch
+    @slow
+    def test_threshold(self):
+        # Same image and checkpoint, called with `pred_iou_thresh=1`: the reference output has five masks.
+        model_id = "facebook/sam-vit-huge"
+        mask_generator = pipeline("mask-generation", model=model_id)
+
+        raw = mask_generator(
+            "http://images.cocodataset.org/val2017/000000039769.jpg", pred_iou_thresh=1, points_per_batch=256
+        )
+
+        # Shortening by hashing
+        summarized = [
+            {"mask": mask_to_test_readable(mask), "scores": raw["scores"][index]}
+            for index, mask in enumerate(raw["masks"])
+        ]
+        simplified = nested_simplify(summarized, decimals=4)
+
+        reference_pairs = [
+            ("115ad19f5f", 1.0444),
+            ("6affa964c6", 1.0210),
+            ("dfe28a0388", 1.0167),
+            ("c0a5f4a318", 1.0132),
+            ("fe8065c197", 1.0053),
+        ]
+        expected = [
+            {"mask": {"hash": digest, "shape": (480, 640)}, "scores": score} for digest, score in reference_pairs
+        ]
+        self.assertEqual(simplified, expected)
--- a/tests/pipelines/test_pipelines_object_detection.py
+++ b/tests/pipelines/test_pipelines_object_detection.py
@@ -0,0 +1,297 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import datasets
+from huggingface_hub import ObjectDetectionOutputElement
+
+from transformers import (
+    MODEL_FOR_OBJECT_DETECTION_MAPPING,
+    AutoImageProcessor,
+    AutoModelForObjectDetection,
+    ObjectDetectionPipeline,
+    is_vision_available,
+    pipeline,
+)
+from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
+    is_pipeline_test,
+    nested_simplify,
+    require_pytesseract,
+    require_timm,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    # Stand-in defined when `is_vision_available()` is false, so the module-level name `Image` still exists.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            # Accepts any arguments and returns None.
+            pass
+
+
+@is_pipeline_test
+@require_vision
+@require_timm
+@require_torch
+class ObjectDetectionPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
+    _dataset = None
+
+    @classmethod
+    def _load_dataset(cls):
+        # Load the image fixtures on first use and cache them on the class; later calls return immediately.
+        if cls._dataset is not None:
+            return
+        # we use revision="refs/pr/1" until the PR is merged
+        # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1
+        cls._dataset = datasets.load_dataset(
+            "hf-internal-testing/fixtures_image_utils",
+            split="test",
+            revision="refs/pr/1",
+        )
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        # Wrap the given components in an `ObjectDetectionPipeline` and pair it with one local sample image path.
+        sample_images = ["./tests/fixtures/tests_samples/COCO/000000039769.png"]
+        detector = ObjectDetectionPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        return detector, sample_images
+
+    def run_pipeline_test(self, object_detector, examples):
+        # Schema check: each detection, for a single image and for a mixed batch, must be a dict holding a
+        # float score, a str label and a box of four ints. Batch detections are additionally passed to
+        # `compare_pipeline_output_to_hub_spec`.
+        self._load_dataset()
+        fixture_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+
+        def detection_schema():
+            # Rebuilt for every comparison, mirroring a literal written at each assertion site.
+            return {
+                "score": ANY(float),
+                "label": ANY(str),
+                "box": {"xmin": ANY(int), "ymin": ANY(int), "xmax": ANY(int), "ymax": ANY(int)},
+            }
+
+        single_outputs = object_detector(fixture_path, threshold=0.0)
+
+        self.assertGreater(len(single_outputs), 0)
+        for detection in single_outputs:
+            self.assertEqual(detection, detection_schema())
+
+        batch = [
+            Image.open(fixture_path),
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            # RGBA
+            self._dataset[0]["image"],
+            # LA
+            self._dataset[1]["image"],
+            # L
+            self._dataset[2]["image"],
+        ]
+        batch_outputs = object_detector(batch, threshold=0.0)
+
+        self.assertEqual(len(batch), len(batch_outputs))
+        for image_outputs in batch_outputs:
+            self.assertGreater(len(image_outputs), 0)
+            for detection in image_outputs:
+                self.assertEqual(detection, detection_schema())
+                compare_pipeline_output_to_hub_spec(detection, ObjectDetectionOutputElement)
+
+    @require_torch
+    def test_small_model_pt(self):
+        # Tiny DETR checkpoint with threshold 0.0: the reference output is the same detection twice per image,
+        # for a single URL as well as for a batch of two URLs.
+        checkpoint = "hf-internal-testing/tiny-detr-mobilenetsv3"
+        image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
+        object_detector = ObjectDetectionPipeline(
+            model=AutoModelForObjectDetection.from_pretrained(checkpoint),
+            image_processor=AutoImageProcessor.from_pretrained(checkpoint),
+        )
+
+        detection = {
+            "score": 0.3376,
+            "label": "LABEL_0",
+            "box": {"xmin": 159, "ymin": 120, "xmax": 480, "ymax": 359},
+        }
+        expected_per_image = [detection, detection]
+
+        single = object_detector(image_url, threshold=0.0)
+        self.assertEqual(nested_simplify(single, decimals=4), expected_per_image)
+
+        batched = object_detector([image_url, image_url], threshold=0.0)
+        self.assertEqual(nested_simplify(batched, decimals=4), [expected_per_image, expected_per_image])
+
+    @require_torch
+    @slow
+    def test_large_model_pt(self):
+        # "facebook/detr-resnet-50" built from explicitly loaded model and image processor: the same five
+        # reference detections are expected for a single URL and for each image of a two-URL batch.
+        checkpoint = "facebook/detr-resnet-50"
+        image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
+        object_detector = ObjectDetectionPipeline(
+            model=AutoModelForObjectDetection.from_pretrained(checkpoint),
+            image_processor=AutoImageProcessor.from_pretrained(checkpoint),
+        )
+
+        expected_per_image = [
+            {"score": 0.9982, "label": "remote", "box": {"xmin": 40, "ymin": 70, "xmax": 175, "ymax": 117}},
+            {"score": 0.9960, "label": "remote", "box": {"xmin": 333, "ymin": 72, "xmax": 368, "ymax": 187}},
+            {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 639, "ymax": 473}},
+            {"score": 0.9988, "label": "cat", "box": {"xmin": 13, "ymin": 52, "xmax": 314, "ymax": 470}},
+            {"score": 0.9987, "label": "cat", "box": {"xmin": 345, "ymin": 23, "xmax": 640, "ymax": 368}},
+        ]
+
+        single = object_detector(image_url)
+        self.assertEqual(nested_simplify(single, decimals=4), expected_per_image)
+
+        batched = object_detector([image_url, image_url])
+        self.assertEqual(nested_simplify(batched, decimals=4), [expected_per_image, expected_per_image])
+
+    @require_torch
+    @slow
+    def test_integration_torch_object_detection(self):
+        """Run `facebook/detr-resnet-50` on one image URL, then on a batch holding that URL twice."""
+        image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        # Simplified detections expected for the image; list order is part of the comparison.
+        expected_detections = [
+            {"score": 0.9982, "label": "remote", "box": {"xmin": 40, "ymin": 70, "xmax": 175, "ymax": 117}},
+            {"score": 0.9960, "label": "remote", "box": {"xmin": 333, "ymin": 72, "xmax": 368, "ymax": 187}},
+            {"score": 0.9955, "label": "couch", "box": {"xmin": 0, "ymin": 1, "xmax": 639, "ymax": 473}},
+            {"score": 0.9988, "label": "cat", "box": {"xmin": 13, "ymin": 52, "xmax": 314, "ymax": 470}},
+            {"score": 0.9987, "label": "cat", "box": {"xmin": 345, "ymin": 23, "xmax": 640, "ymax": 368}},
+        ]
+
+        object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
+
+        single_output = object_detector(image_url)
+        self.assertEqual(nested_simplify(single_output, decimals=4), expected_detections)
+
+        # Each entry of the batched output must match the single-image result.
+        batched_output = object_detector([image_url, image_url])
+        self.assertEqual(
+            nested_simplify(batched_output, decimals=4),
+            [expected_detections, expected_detections],
+        )
+
+    @require_torch
+    @slow
+    def test_threshold(self):
+        """Pass `threshold` at call time and expect only the two highest-scoring detections back."""
+        object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
+        # Both remaining detections score above the 0.9985 threshold used below.
+        expected_detections = [
+            {"score": 0.9988, "label": "cat", "box": {"xmin": 13, "ymin": 52, "xmax": 314, "ymax": 470}},
+            {"score": 0.9987, "label": "cat", "box": {"xmin": 345, "ymin": 23, "xmax": 640, "ymax": 368}},
+        ]
+
+        outputs = object_detector(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            threshold=0.9985,
+        )
+        self.assertEqual(nested_simplify(outputs, decimals=4), expected_detections)
+
+    @require_torch
+    @require_pytesseract
+    @slow
+    def test_layoutlm(self):
+        """Check `Narsil/layoutlmv3-finetuned-funsd` detections with `threshold` set at pipeline creation."""
+        invoice_url = (
+            "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png"
+        )
+        answer_box = {
+            "score": 0.9993,
+            "label": "I-ANSWER",
+            "box": {"xmin": 294, "ymin": 254, "xmax": 343, "ymax": 264},
+        }
+
+        object_detector = pipeline("object-detection", model="Narsil/layoutlmv3-finetuned-funsd", threshold=0.9993)
+
+        outputs = object_detector(invoice_url)
+        # The same detection is expected twice in the output.
+        self.assertEqual(nested_simplify(outputs, decimals=4), [answer_box, answer_box])
--- a/tests/pipelines/test_pipelines_question_answering.py
+++ b/tests/pipelines/test_pipelines_question_answering.py
--- a/tests/pipelines/test_pipelines_table_question_answering.py
+++ b/tests/pipelines/test_pipelines_table_question_answering.py
@@ -0,0 +1,373 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import (
+    MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
+    AutoModelForTableQuestionAnswering,
+    AutoTokenizer,
+    TableQuestionAnsweringPipeline,
+    pipeline,
+)
+from transformers.testing_utils import (
+    is_pipeline_test,
+    require_torch,
+    slow,
+)
+
+
+@is_pipeline_test
+class TQAPipelineTests(unittest.TestCase):
+    # Declared for consistency with the other pipeline tests; TQA models have no fast tokenizer,
+    # which the automatically generated tests need.
+    model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+
+    @staticmethod
+    def _lowercase_actors_table():
+        """Build a fresh lower-cased actors table (used with the tiny checkpoints)."""
+        return {
+            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+            "age": ["56", "45", "59"],
+            "number of movies": ["87", "53", "69"],
+            "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+        }
+
+    @staticmethod
+    def _cased_actors_table():
+        """Build a fresh actors table with capitalized headers and names (used by the integration tests)."""
+        return {
+            "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+            "Age": ["56", "45", "59"],
+            "Number of movies": ["87", "53", "69"],
+            "Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+        }
+
+    @staticmethod
+    def _repositories_table():
+        """Build a fresh repositories table."""
+        return {
+            "Repository": ["Transformers", "Datasets", "Tokenizers"],
+            "Stars": ["36542", "4512", "3934"],
+            "Contributors": ["651", "77", "34"],
+            "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+        }
+
+    @staticmethod
+    def _actor_queries():
+        """Return the three lower-cased questions asked about the actors table."""
+        return ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"]
+
+    @staticmethod
+    def _repository_queries():
+        """Return the five questions asked about the repositories table."""
+        return [
+            "What repository has the largest number of stars?",
+            "Given that the numbers of stars defines if a repository is active, what repository is the most active?",
+            "What is the number of repositories?",
+            "What is the average number of stars?",
+            "What is the total amount of stars?",
+        ]
+
+    def _check_invalid_inputs_raise(self, table_querier):
+        """Assert that `table_querier` raises `ValueError` for an empty/missing table or query."""
+        question = "What does it do with empty context ?"
+        for empty_table in (None, "", {}):
+            with self.assertRaises(ValueError):
+                table_querier(query=question, table=empty_table)
+        # No `query` argument at all.
+        with self.assertRaises(ValueError):
+            table_querier(table=self._repositories_table())
+        for empty_query in ("", None):
+            with self.assertRaises(ValueError):
+                table_querier(query=empty_query, table=self._repositories_table())
+
+    @require_torch
+    def test_small_model_pt(self, dtype="float32"):
+        """Check outputs and argument validation with the `lysandre/tiny-tapas-random-wtq` checkpoint."""
+        checkpoint = "lysandre/tiny-tapas-random-wtq"
+        model = AutoModelForTableQuestionAnswering.from_pretrained(checkpoint, dtype=dtype)
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        self.assertIsInstance(model.config.aggregation_labels, dict)
+        self.assertIsInstance(model.config.no_aggregation_label_index, int)
+
+        table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
+        # Every query against this checkpoint is expected to produce this same answer.
+        expected_answer = {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}
+
+        # A single query yields a single dict ...
+        outputs = table_querier(table=self._lowercase_actors_table(), query=self._actor_queries()[0])
+        self.assertEqual(outputs, expected_answer)
+
+        # ... while a list of queries yields one dict per query.
+        outputs = table_querier(table=self._lowercase_actors_table(), query=self._actor_queries())
+        self.assertEqual(outputs, [expected_answer] * 3)
+
+        outputs = table_querier(table=self._repositories_table(), query=self._repository_queries())
+        self.assertEqual(outputs, [expected_answer] * 5)
+
+        self._check_invalid_inputs_raise(table_querier)
+
+    @require_torch
+    def test_small_model_pt_fp16(self):
+        """Repeat `test_small_model_pt` with `dtype="float16"`."""
+        self.test_small_model_pt(dtype="float16")
+
+    @require_torch
+    def test_slow_tokenizer_sqa_pt(self, dtype="float32"):
+        """Compare sequential and batched calls on `lysandre/tiny-tapas-random-sqa`, then check outputs."""
+        checkpoint = "lysandre/tiny-tapas-random-sqa"
+        model = AutoModelForTableQuestionAnswering.from_pretrained(checkpoint, dtype=dtype)
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
+
+        # The very same inputs are sent twice, once with `sequential=True` and once with `sequential=False`.
+        conversation = {"table": self._lowercase_actors_table(), "query": self._actor_queries()}
+        sequential_outputs = table_querier(**conversation, sequential=True)
+        batch_outputs = table_querier(**conversation, sequential=False)
+
+        self.assertEqual(len(sequential_outputs), 3)
+        self.assertEqual(len(batch_outputs), 3)
+        self.assertEqual(sequential_outputs[0], batch_outputs[0])
+        self.assertNotEqual(sequential_outputs[1], batch_outputs[1])
+        # The equivalent check on the third answer is disabled:
+        # self.assertNotEqual(sequential_outputs[2], batch_outputs[2])
+
+        table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer, max_new_tokens=20)
+        expected_actor_answer = {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}
+        expected_repository_answer = {
+            "answer": "Python, Python",
+            "coordinates": [(0, 3), (1, 3)],
+            "cells": ["Python", "Python"],
+        }
+
+        outputs = table_querier(table=self._lowercase_actors_table(), query=self._actor_queries()[0])
+        self.assertEqual(outputs, expected_actor_answer)
+
+        outputs = table_querier(table=self._lowercase_actors_table(), query=self._actor_queries())
+        self.assertEqual(outputs, [expected_actor_answer] * 3)
+
+        outputs = table_querier(table=self._repositories_table(), query=self._repository_queries())
+        self.assertEqual(outputs, [expected_repository_answer] * 5)
+
+        self._check_invalid_inputs_raise(table_querier)
+
+    @require_torch
+    def test_slow_tokenizer_sqa_pt_fp16(self):
+        """Repeat `test_slow_tokenizer_sqa_pt` with `dtype="float16"`."""
+        self.test_slow_tokenizer_sqa_pt(dtype="float16")
+
+    @slow
+    @require_torch
+    def test_integration_wtq_pt(self, dtype="float32"):
+        """Check the answers of the default `table-question-answering` pipeline on the repositories table."""
+        table_querier = pipeline("table-question-answering", dtype=dtype)
+
+        results = table_querier(self._repositories_table(), self._repository_queries())
+
+        top_repository = {
+            "answer": "Transformers",
+            "coordinates": [(0, 0)],
+            "cells": ["Transformers"],
+            "aggregator": "NONE",
+        }
+        # The last two answers aggregate over the same cells of the "Stars" column.
+        star_coordinates = [(0, 1), (1, 1), (2, 1)]
+        star_cells = ["36542", "4512", "3934"]
+        expected_results = [
+            top_repository,
+            top_repository,
+            {
+                "answer": "COUNT > Transformers, Datasets, Tokenizers",
+                "coordinates": [(0, 0), (1, 0), (2, 0)],
+                "cells": ["Transformers", "Datasets", "Tokenizers"],
+                "aggregator": "COUNT",
+            },
+            {
+                "answer": "AVERAGE > 36542, 4512, 3934",
+                "coordinates": star_coordinates,
+                "cells": star_cells,
+                "aggregator": "AVERAGE",
+            },
+            {
+                "answer": "SUM > 36542, 4512, 3934",
+                "coordinates": star_coordinates,
+                "cells": star_cells,
+                "aggregator": "SUM",
+            },
+        ]
+        self.assertListEqual(results, expected_results)
+
+    @slow
+    @require_torch
+    def test_integration_wtq_pt_fp16(self):
+        """Repeat `test_integration_wtq_pt` with `dtype="float16"`."""
+        self.test_integration_wtq_pt(dtype="float16")
+
+    @slow
+    @require_torch
+    def test_integration_sqa_pt(self, dtype="float32"):
+        """Check sequential answers of `google/tapas-base-finetuned-sqa` on the actors table."""
+        checkpoint = "google/tapas-base-finetuned-sqa"
+        table_querier = pipeline(
+            "table-question-answering",
+            model=checkpoint,
+            tokenizer=checkpoint,
+            dtype=dtype,
+        )
+        questions = ["How many movies has George Clooney played in?", "How old is he?", "What's his date of birth?"]
+        results = table_querier(self._cased_actors_table(), questions, sequential=True)
+
+        expected_results = [
+            {"answer": "69", "coordinates": [(2, 2)], "cells": ["69"]},
+            {"answer": "59", "coordinates": [(2, 1)], "cells": ["59"]},
+            {"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]},
+        ]
+        self.assertListEqual(results, expected_results)
+
+    @slow
+    @require_torch
+    def test_integration_sqa_pt_fp16(self):
+        """Repeat `test_integration_sqa_pt` with `dtype="float16"`."""
+        self.test_integration_sqa_pt(dtype="float16")
+
+    @slow
+    @require_torch
+    def test_large_model_pt_tapex(self, dtype="float32"):
+        """Check sequential answers of `microsoft/tapex-large-finetuned-wtq` on the actors table."""
+        table_querier = pipeline(
+            "table-question-answering",
+            model="microsoft/tapex-large-finetuned-wtq",
+            dtype=dtype,
+        )
+        questions = [
+            "How many movies has George Clooney played in?",
+            "How old is Mr Clooney ?",
+            "What's the date of birth of Leonardo ?",
+        ]
+        results = table_querier(self._cased_actors_table(), questions, sequential=True)
+
+        # Expected answers start with a space and carry only an "answer" key.
+        expected_results = [{"answer": answer} for answer in (" 69", " 59", " 10 june 1996")]
+        self.assertListEqual(results, expected_results)
--- a/tests/pipelines/test_pipelines_text_classification.py
+++ b/tests/pipelines/test_pipelines_text_classification.py
@@ -0,0 +1,197 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import (
+    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+    TextClassificationPipeline,
+    pipeline,
+)
+from transformers.testing_utils import (
+    is_pipeline_test,
+    is_torch_available,
+    nested_simplify,
+    require_torch,
+    require_torch_bf16,
+    require_torch_fp16,
+    slow,
+    torch_device,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_torch_available():
+    import torch
+
+
+# Config class names filtered out of `model_mapping` in `TextClassificationPipelineTests`:
+# these 2 model types require different inputs than those of the usual text models.
+_TO_SKIP = {"LayoutLMv2Config", "LayoutLMv3Config"}
+
+
+@is_pipeline_test
+class TextClassificationPipelineTests(unittest.TestCase):
+    """Tests for the `text-classification` pipeline."""
+
+    model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
+
+    # Remove the config classes named in `_TO_SKIP`, unless the mapping exposes an `is_dummy` attribute.
+    if not hasattr(model_mapping, "is_dummy"):
+        model_mapping = {config: model for config, model in model_mapping.items() if config.__name__ not in _TO_SKIP}
+
+    @require_torch
+    def test_small_model_pt(self):
+        """Check `top_k` handling and the regression problem type on `tiny-random-distilbert`."""
+        text_classifier = pipeline(task="text-classification", model="hf-internal-testing/tiny-random-distilbert")
+
+        outputs = text_classifier("This is great !")
+        self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
+
+        outputs = text_classifier("This is great !", top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}]
+        )
+
+        outputs = text_classifier(["This is great !", "This is bad"], top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs),
+            [
+                [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}],
+                [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}],
+            ],
+        )
+
+        outputs = text_classifier("This is great !", top_k=1)
+        self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
+
+        # Do not apply any function to output for regression tasks
+        # hack: changing problem_type artificially (so keep this check last in the test)
+        text_classifier.model.config.problem_type = "regression"
+        outputs = text_classifier("This is great !")
+        self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.01}])
+
+    @require_torch
+    def test_accepts_torch_device(self):
+        """The pipeline can be created with `device=torch_device`."""
+        text_classifier = pipeline(
+            task="text-classification",
+            model="hf-internal-testing/tiny-random-distilbert",
+            device=torch_device,
+        )
+
+        outputs = text_classifier("This is great !")
+        self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
+
+    @require_torch_fp16
+    def test_accepts_torch_fp16(self):
+        """The pipeline can be created with `dtype=torch.float16`."""
+        text_classifier = pipeline(
+            task="text-classification",
+            model="hf-internal-testing/tiny-random-distilbert",
+            device=torch_device,
+            dtype=torch.float16,
+        )
+
+        outputs = text_classifier("This is great !")
+        self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
+
+    @require_torch_bf16
+    def test_accepts_torch_bf16(self):
+        """The pipeline can be created with `dtype=torch.bfloat16`."""
+        text_classifier = pipeline(
+            task="text-classification",
+            model="hf-internal-testing/tiny-random-distilbert",
+            device=torch_device,
+            dtype=torch.bfloat16,
+        )
+
+        outputs = text_classifier("This is great !")
+        self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}])
+
+    @slow
+    @require_torch
+    def test_pt_bert(self):
+        """Check labels and scores of the default `text-classification` pipeline on three sentences."""
+        text_classifier = pipeline("text-classification")
+
+        outputs = text_classifier("This is great !")
+        self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 1.0}])
+        outputs = text_classifier("This is bad !")
+        self.assertEqual(nested_simplify(outputs), [{"label": "NEGATIVE", "score": 1.0}])
+        outputs = text_classifier("Birds are a type of animal")
+        self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        """Build a `TextClassificationPipeline` from the given components and return it with two example texts."""
+        text_classifier = TextClassificationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+        )
+        return text_classifier, ["HuggingFace is in", "This is another test"]
+
+    def run_pipeline_test(self, text_classifier, _):
+        """Check output structure for each supported input form, and that a one-level nested pair is rejected.
+
+        The second argument (example inputs) is ignored; the inputs used are defined in the body.
+        """
+        model = text_classifier.model
+        known_labels = model.config.id2label.values()
+        # Small inputs because BartTokenizer tiny has maximum position embeddings = 22
+        valid_inputs = "HuggingFace is in"
+        outputs = text_classifier(valid_inputs)
+
+        self.assertEqual(nested_simplify(outputs), [{"label": ANY(str), "score": ANY(float)}])
+        # `assertIn` reports the offending label on failure, unlike `assertTrue(x in y)`.
+        self.assertIn(outputs[0]["label"], known_labels)
+
+        valid_inputs = ["HuggingFace is in ", "Paris is in France"]
+        outputs = text_classifier(valid_inputs)
+        self.assertEqual(
+            nested_simplify(outputs),
+            [{"label": ANY(str), "score": ANY(float)}, {"label": ANY(str), "score": ANY(float)}],
+        )
+        self.assertIn(outputs[0]["label"], known_labels)
+        self.assertIn(outputs[1]["label"], known_labels)
+
+        # Forcing to get all results with `top_k=None`
+        # This is NOT the legacy format
+        outputs = text_classifier(valid_inputs, top_k=None)
+        N = len(known_labels)
+        self.assertEqual(
+            nested_simplify(outputs),
+            [[{"label": ANY(str), "score": ANY(float)}] * N, [{"label": ANY(str), "score": ANY(float)}] * N],
+        )
+
+        # A `text`/`text_pair` dict yields a single dict rather than a list.
+        valid_inputs = {"text": "HuggingFace is in ", "text_pair": "Paris is in France"}
+        outputs = text_classifier(valid_inputs)
+        self.assertEqual(
+            nested_simplify(outputs),
+            {"label": ANY(str), "score": ANY(float)},
+        )
+        self.assertIn(outputs["label"], known_labels)
+
+        # This might be used a text pair, but tokenizer + pipe interaction
+        # makes it hard to understand that it's not using the pair properly
+        # https://github.com/huggingface/transformers/issues/17305
+        # We disabled this usage instead as it was outputting wrong outputs.
+        invalid_input = [["HuggingFace is in ", "Paris is in France"]]
+        with self.assertRaises(ValueError):
+            text_classifier(invalid_input)
+
+        # This used to be valid for doing text pairs
+        # We're keeping it working because of backward compatibility
+        outputs = text_classifier([[["HuggingFace is in ", "Paris is in France"]]])
+        self.assertEqual(
+            nested_simplify(outputs),
+            [{"label": ANY(str), "score": ANY(float)}],
+        )
+        self.assertIn(outputs[0]["label"], known_labels)
--- a/tests/pipelines/test_pipelines_text_generation.py
+++ b/tests/pipelines/test_pipelines_text_generation.py
@@ -0,0 +1,617 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest.mock import patch
+
+from transformers import (
+    MODEL_FOR_CAUSAL_LM_MAPPING,
+    TextGenerationPipeline,
+    logging,
+    pipeline,
+)
+from transformers.testing_utils import (
+    CaptureLogger,
+    is_pipeline_test,
+    require_accelerate,
+    require_torch,
+    require_torch_accelerator,
+    torch_device,
+)
+
+from .test_pipelines_common import ANY
+
+
+@is_pipeline_test
+@require_torch
+class TextGenerationPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING
+
+    @require_torch
+    def test_small_model_pt(self):
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-random-LlamaForCausalLM",
+            max_new_tokens=10,
+        )
+        # Using `do_sample=False` to force deterministic output
+        outputs = text_generator("This is a test", do_sample=False)
+        self.assertEqual(outputs, [{"generated_text": "This is a testкт MéxicoWSAnimImportдели pip letscosatur"}])
+
+        outputs = text_generator(["This is a test", "This is a second test"], do_sample=False)
+        self.assertEqual(
+            outputs,
+            [
+                [{"generated_text": "This is a testкт MéxicoWSAnimImportдели pip letscosatur"}],
+                [{"generated_text": "This is a second testкт MéxicoWSAnimImportдели Düsseld bootstrap learn user"}],
+            ],
+        )
+
+        outputs = text_generator("This is a test", do_sample=True, num_return_sequences=2, return_tensors=True)
+        self.assertEqual(
+            outputs,
+            [
+                {"generated_token_ids": ANY(list)},
+                {"generated_token_ids": ANY(list)},
+            ],
+        )
+
+    @require_torch
+    def test_small_chat_model_pt(self):
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+        # Using `do_sample=False` to force deterministic output
+        chat1 = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+        ]
+        chat2 = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a second test"},
+        ]
+        outputs = text_generator(chat1, do_sample=False, max_new_tokens=10)
+        expected_chat1 = chat1 + [
+            {
+                "role": "assistant",
+                "content": " factors factors factors factors factors factors factors factors factors factors",
+            }
+        ]
+        self.assertEqual(
+            outputs,
+            [
+                {"generated_text": expected_chat1},
+            ],
+        )
+
+        outputs = text_generator([chat1, chat2], do_sample=False, max_new_tokens=10)
+        expected_chat2 = chat2 + [
+            {
+                "role": "assistant",
+                "content": " stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs",
+            }
+        ]
+
+        self.assertEqual(
+            outputs,
+            [
+                [{"generated_text": expected_chat1}],
+                [{"generated_text": expected_chat2}],
+            ],
+        )
+
+    @require_torch
+    def test_small_chat_model_continue_final_message(self):
+        # Here we check that passing a chat that ends in an assistant message is handled correctly
+        # by continuing the final message rather than starting a new one
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+        # Using `do_sample=False` to force deterministic output
+        chat1 = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+            {"role": "assistant", "content": "This is"},
+        ]
+        outputs = text_generator(chat1, do_sample=False, max_new_tokens=10)
+
+        # Assert that we continued the last message and there isn't a sneaky <|im_end|>
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "generated_text": [
+                        {"role": "system", "content": "This is a system message."},
+                        {"role": "user", "content": "This is a test"},
+                        {
+                            "role": "assistant",
+                            "content": "This is stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs",
+                        },
+                    ]
+                }
+            ],
+        )
+
+    @require_torch
+    def test_small_chat_model_continue_final_message_override(self):
+        # Here we check that `continue_final_message=True` is honoured for a chat that ends in a user message:
+        # the final (user) message must be continued rather than a new assistant message being started
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+        # Using `do_sample=False` to force deterministic output
+        chat1 = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+        ]
+        outputs = text_generator(chat1, do_sample=False, max_new_tokens=10, continue_final_message=True)
+
+        # Assert that we continued the final user message and there isn't a sneaky <|im_end|>
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "generated_text": [
+                        {"role": "system", "content": "This is a system message."},
+                        {
+                            "role": "user",
+                            "content": "This is a test stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs",
+                        },
+                    ]
+                }
+            ],
+        )
+
+    @require_torch
+    def test_small_chat_model_with_dataset_pt(self):
+        from torch.utils.data import Dataset
+
+        from transformers.pipelines.pt_utils import KeyDataset
+
+        class MyDataset(Dataset):
+            data = [
+                [
+                    {"role": "system", "content": "This is a system message."},
+                    {"role": "user", "content": "This is a test"},
+                ],
+            ]
+
+            def __len__(self):
+                return 1
+
+            def __getitem__(self, i):
+                return {"text": self.data[i]}
+
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+
+        dataset = MyDataset()
+        key_dataset = KeyDataset(dataset, "text")
+
+        for outputs in text_generator(key_dataset, do_sample=False, max_new_tokens=10):
+            expected_chat = dataset.data[0] + [
+                {
+                    "role": "assistant",
+                    "content": " factors factors factors factors factors factors factors factors factors factors",
+                }
+            ]
+            self.assertEqual(
+                outputs,
+                [
+                    {"generated_text": expected_chat},
+                ],
+            )
+
+    @require_torch
+    def test_small_chat_model_with_iterator_pt(self):
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+
+        # Using `do_sample=False` to force deterministic output
+        chat1 = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+        ]
+        chat2 = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a second test"},
+        ]
+        expected_chat1 = chat1 + [
+            {
+                "role": "assistant",
+                "content": " factors factors factors factors factors factors factors factors factors factors",
+            }
+        ]
+        expected_chat2 = chat2 + [
+            {
+                "role": "assistant",
+                "content": " stairs stairs stairs stairs stairs stairs stairs stairs stairs stairs",
+            }
+        ]
+
+        def data():
+            yield from [chat1, chat2]
+
+        outputs = text_generator(data(), do_sample=False, max_new_tokens=10)
+        outputs = list(outputs)
+        self.assertEqual(
+            outputs,
+            [
+                [{"generated_text": expected_chat1}],
+                [{"generated_text": expected_chat2}],
+            ],
+        )
+
+    @require_torch
+    def test_small_chat_model_with_response_parsing(self):
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+        # Using `do_sample=False` to force deterministic output
+        chat = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+        ]
+        text_generator.tokenizer.response_schema = {
+            # A real response schema should probably have things like "role" and "content"
+            # and "reasoning_content" but it's unlikely we'd get a tiny model to reliably
+            # output anything like that, so let's keep it simple.
+            "type": "object",
+            "properties": {
+                "first_word": {"type": "string", "x-regex": r"^\s*([a-zA-Z]+)"},
+                "last_word": {"type": "string", "x-regex": r"([a-zA-Z]+)\s*$"},
+            },
+        }
+        outputs = text_generator(chat, do_sample=False, max_new_tokens=10)
+        parsed_message = outputs[0]["generated_text"][-1]
+        self.assertEqual(parsed_message, {"first_word": "factors", "last_word": "factors"})
+
+    @require_torch
+    def test_return_full_text_false_with_chat_template(self):
+        """Regression test for #45854: return_full_text=False must not include prompt when using chat template."""
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+        chat = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+        ]
+        outputs = text_generator(chat, do_sample=False, max_new_tokens=10, return_full_text=False)
+        generated = outputs[0]["generated_text"]
+
+        # Must return plain string, not a list of message dicts
+        self.assertIsInstance(generated, str)
+        # Must not contain the prompt content
+        self.assertNotIn("This is a test", generated)
+        self.assertNotIn("This is a system message.", generated)
+
+    @require_torch
+    def test_return_full_text_true_with_chat_template(self):
+        """return_full_text=True (default) must still return full chat list with chat template."""
+        text_generator = pipeline(
+            task="text-generation",
+            model="hf-internal-testing/tiny-gpt2-with-chatml-template",
+        )
+        chat = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+        ]
+        outputs = text_generator(chat, do_sample=False, max_new_tokens=10, return_full_text=True)
+        generated = outputs[0]["generated_text"]
+
+        # Must return list of message dicts including original messages
+        self.assertIsInstance(generated, list)
+        roles = [m["role"] for m in generated]
+        self.assertIn("user", roles)
+        self.assertIn("assistant", roles)
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        text_generator = TextGenerationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            dtype=dtype,
+            max_new_tokens=5,
+        )
+        return text_generator, ["This is a test", "Another test"]
+
+    def test_stop_sequence_stopping_criteria(self):
+        prompt = """Hello I believe in"""
+        text_generator = pipeline(
+            "text-generation", model="hf-internal-testing/tiny-random-gpt2", max_new_tokens=5, do_sample=False
+        )
+        output = text_generator(prompt)
+        self.assertEqual(
+            output,
+            [{"generated_text": "Hello I believe in fe fe fe fe fe"}],
+        )
+
+        output = text_generator(prompt, stop_sequence=" fe")
+        self.assertEqual(output, [{"generated_text": "Hello I believe in fe"}])
+
+    def run_pipeline_test(self, text_generator, _):
+        model = text_generator.model
+        tokenizer = text_generator.tokenizer
+
+        outputs = text_generator("This is a test")
+        self.assertEqual(outputs, [{"generated_text": ANY(str)}])
+        self.assertTrue(outputs[0]["generated_text"].startswith("This is a test"))
+
+        outputs = text_generator("This is a test", return_full_text=False)
+        self.assertEqual(outputs, [{"generated_text": ANY(str)}])
+        self.assertNotIn("This is a test", outputs[0]["generated_text"])
+
+        text_generator = pipeline(
+            task="text-generation", model=model, tokenizer=tokenizer, return_full_text=False, max_new_tokens=5
+        )
+        outputs = text_generator("This is a test")
+        self.assertEqual(outputs, [{"generated_text": ANY(str)}])
+        self.assertNotIn("This is a test", outputs[0]["generated_text"])
+
+        outputs = text_generator("This is a test", return_full_text=True)
+        self.assertEqual(outputs, [{"generated_text": ANY(str)}])
+        self.assertTrue(outputs[0]["generated_text"].startswith("This is a test"))
+
+        outputs = text_generator(["This is great !", "Something else"], num_return_sequences=2, do_sample=True)
+        self.assertEqual(
+            outputs,
+            [
+                [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}],
+                [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}],
+            ],
+        )
+
+        if text_generator.tokenizer.pad_token is not None:
+            outputs = text_generator(
+                ["This is great !", "Something else"], num_return_sequences=2, batch_size=2, do_sample=True
+            )
+            self.assertEqual(
+                outputs,
+                [
+                    [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}],
+                    [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}],
+                ],
+            )
+
+        with self.assertRaises(ValueError):
+            outputs = text_generator("test", return_full_text=True, return_text=True)
+        with self.assertRaises(ValueError):
+            outputs = text_generator("test", return_full_text=True, return_tensors=True)
+        with self.assertRaises(ValueError):
+            outputs = text_generator("test", return_text=True, return_tensors=True)
+
+        # Empty prompt is slightly special: it requires a BOS token to exist.
+        # Special case for Pegasus which will always append EOS so will
+        # work even without BOS. `Git` models are also let through here
+        # (NOTE(review): the reason is not stated in this test - confirm).
+        if (
+            text_generator.tokenizer.bos_token_id is not None
+            or "Pegasus" in tokenizer.__class__.__name__
+            or "Git" in model.__class__.__name__
+        ):
+            outputs = text_generator("")
+            self.assertEqual(outputs, [{"generated_text": ANY(str)}])
+        else:
+            with self.assertRaises((ValueError, AssertionError)):
+                outputs = text_generator("", add_special_tokens=False)
+
+        # We don't care about infinite range models.
+        # They already work.
+        # Also skip the models listed below, e.g. XGLM's sinusoidal positional embeddings are resized on-the-fly.
+        EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = [
+            "RwkvForCausalLM",
+            "XGLMForCausalLM",
+            "GPTNeoXForCausalLM",
+            "GPTNeoXJapaneseForCausalLM",
+            "FuyuForCausalLM",
+            "LlamaForCausalLM",
+        ]
+        if (
+            tokenizer.model_max_length < 10000
+            and text_generator.model.__class__.__name__ not in EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS
+        ):
+            # Handling of large generations: on CPU an over-long prompt is expected to raise
+            if str(text_generator.device) == "cpu":
+                with self.assertRaises((RuntimeError, IndexError, ValueError, AssertionError)):
+                    text_generator("This is a test" * 500, max_new_tokens=5)
+
+            outputs = text_generator("This is a test" * 500, handle_long_generation="hole", max_new_tokens=5)
+            # Hole strategy cannot work when `max_new_tokens` alone exceeds `tokenizer.model_max_length`
+            if str(text_generator.device) == "cpu":
+                with self.assertRaises(ValueError):
+                    text_generator(
+                        "This is a test" * 500,
+                        handle_long_generation="hole",
+                        max_new_tokens=tokenizer.model_max_length + 10,
+                    )
+
+    @require_torch
+    @require_accelerate
+    @require_torch_accelerator
+    def test_small_model_pt_bloom_accelerate(self):
+        import torch
+
+        # Classic `model_kwargs`
+        pipe = pipeline(
+            model="hf-internal-testing/tiny-random-bloom",
+            model_kwargs={"device_map": "auto", "dtype": torch.bfloat16},
+            max_new_tokens=5,
+            do_sample=False,
+        )
+        self.assertEqual(pipe.model.lm_head.weight.dtype, torch.bfloat16)
+        out = pipe("This is a test")
+        self.assertEqual(
+            out,
+            [{"generated_text": ("This is a test test test test test test")}],
+        )
+
+        # Upgraded those two to real pipeline arguments (they just get sent to the model as they're unlikely to mean anything else.)
+        pipe = pipeline(
+            model="hf-internal-testing/tiny-random-bloom",
+            device_map="auto",
+            dtype=torch.bfloat16,
+            max_new_tokens=5,
+            do_sample=False,
+        )
+        self.assertEqual(pipe.model.lm_head.weight.dtype, torch.bfloat16)
+        out = pipe("This is a test")
+        self.assertEqual(
+            out,
+            [{"generated_text": ("This is a test test test test test test")}],
+        )
+
+        # dtype will be automatically set to torch.bfloat16 if not provided - check: https://github.com/huggingface/transformers/pull/38882
+        pipe = pipeline(
+            model="hf-internal-testing/tiny-random-bloom", device_map="auto", max_new_tokens=5, do_sample=False
+        )
+        self.assertEqual(pipe.model.lm_head.weight.dtype, torch.bfloat16)
+        out = pipe("This is a test")
+        self.assertEqual(
+            out,
+            [{"generated_text": ("This is a test test test test test test")}],
+        )
+
+    @require_torch
+    @require_torch_accelerator
+    def test_small_model_fp16(self):
+        import torch
+
+        pipe = pipeline(
+            model="hf-internal-testing/tiny-random-bloom",
+            device=torch_device,
+            dtype=torch.float16,
+            max_new_tokens=3,
+        )
+        pipe("This is a test")
+
+    @require_torch
+    @require_accelerate
+    @require_torch_accelerator
+    def test_pipeline_accelerate_top_p(self):
+        import torch
+
+        pipe = pipeline(
+            model="hf-internal-testing/tiny-random-bloom",
+            device_map=torch_device,
+            dtype=torch.float16,
+            max_new_tokens=3,
+        )
+        pipe("This is a test", do_sample=True, top_p=0.5)
+
+    def test_pipeline_length_setting_warning(self):
+        prompt = """Hello world"""
+        text_generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-gpt2", max_new_tokens=5)
+        logger = logging.get_logger("transformers.generation.utils")
+        logger_msg = "Both `max_new_tokens`"  # The beginning of the message to be checked in this test
+
+        # Both are set by the user -> log warning
+        with CaptureLogger(logger) as cl:
+            _ = text_generator(prompt, max_length=10, max_new_tokens=1)
+        self.assertIn(logger_msg, cl.out)
+
+        # The user only sets one -> no warning
+        with CaptureLogger(logger) as cl:
+            _ = text_generator(prompt, max_new_tokens=1)
+        self.assertNotIn(logger_msg, cl.out)
+
+        with CaptureLogger(logger) as cl:
+            _ = text_generator(prompt, max_length=10, max_new_tokens=None)  # explicit `None` -> no warning
+        self.assertNotIn(logger_msg, cl.out)
+
+    def test_return_dict_in_generate(self):
+        text_generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-gpt2", max_new_tokens=2)
+        out = text_generator(
+            ["This is great !", "Something else"], return_dict_in_generate=True, output_logits=True, output_scores=True
+        )
+        self.assertEqual(
+            out,
+            [
+                [
+                    {
+                        "generated_text": ANY(str),
+                        "logits": ANY(list),
+                        "scores": ANY(list),
+                    },
+                ],
+                [
+                    {
+                        "generated_text": ANY(str),
+                        "logits": ANY(list),
+                        "scores": ANY(list),
+                    },
+                ],
+            ],
+        )
+
+    @require_torch
+    def test_pipeline_assisted_generation(self):
+        """Tests that we can run assisted generation in the pipeline"""
+        model = "hf-internal-testing/tiny-random-MistralForCausalLM"
+        pipe = pipeline("text-generation", model=model, assistant_model=model, max_new_tokens=2)
+
+        # We can run the pipeline
+        prompt = "Hello world"
+        _ = pipe(prompt)
+
+        # It is running assisted generation under the hood (e.g. flags incompatible with assisted gen will crash)
+        with self.assertRaises(ValueError):
+            _ = pipe(prompt, generate_kwargs={"num_beams": 2})
+
+    @require_torch
+    def test_pipeline_skip_special_tokens(self):
+        """Tests that we can use `skip_special_tokens=False` to get the special tokens in the output"""
+        model_id = "google/gemma-3-270m-it"
+        chat = [{"role": "user", "content": "What's your name?"}]
+        generator = pipeline("text-generation", model=model_id)
+
+        # normal pipeline use
+        output = generator(chat, max_new_tokens=20, do_sample=False)
+        self.assertNotIn("<end_of_turn>", str(output[0]["generated_text"]))
+
+        # keep special tokens in the output (NOTE(review): large budget presumably lets <end_of_turn> appear)
+        output = generator(chat, max_new_tokens=1000, do_sample=False, skip_special_tokens=False)
+        self.assertIn("<end_of_turn>", str(output[0]["generated_text"]))
+
+    @require_torch
+    def test_forward_tokenizer_kwargs(self):
+        chat = [
+            {"role": "system", "content": "This is a system message."},
+            {"role": "user", "content": "This is a test"},
+        ]
+        model = "hf-internal-testing/tiny-gpt2-with-chatml-template"
+        text_generator = pipeline("text-generation", model, max_new_tokens=5)
+        tokenizer = text_generator.tokenizer
+
+        with patch.object(tokenizer, "apply_chat_template", wraps=tokenizer.apply_chat_template) as mock:
+            text_generator(chat, tokenizer_encode_kwargs={"enable_thinking": True})
+            self.assertGreater(mock.call_count, 0)
+            kw_call_args = mock.call_args[1]  # keyword arguments of the most recent call
+            self.assertIn("enable_thinking", kw_call_args)
+            self.assertEqual(kw_call_args["enable_thinking"], True)
--- a/tests/pipelines/test_pipelines_text_to_audio.py
+++ b/tests/pipelines/test_pipelines_text_to_audio.py
@@ -0,0 +1,382 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from transformers import (
+    MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING,
+    AutoProcessor,
+    TextToAudioPipeline,
+    pipeline,
+)
+from transformers.testing_utils import (
+    is_pipeline_test,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+from transformers.trainer_utils import set_seed
+
+from .test_pipelines_common import ANY
+
+
+@is_pipeline_test
+@require_torch
+class TextToAudioPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING
+    # for now only test text_to_waveform and not text_to_spectrogram
+
+    @require_torch
+    def test_small_speecht5_pt(self):
+        audio_generator = pipeline(task="text-to-audio", model="microsoft/speecht5_tts")
+        num_channels = 1  # model generates mono audio
+        forward_params = {
+            "do_sample": True,
+            "semantic_max_new_tokens": 5,  # NOTE(review): Bark-style key; confirm SpeechT5 uses it
+            "speaker_embeddings": torch.rand(1, 512) * 0.2 - 0.1,
+        }
+
+        outputs = audio_generator("This is a test", forward_params=forward_params)
+        self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 16000}, outputs)
+        self.assertEqual(len(outputs["audio"].shape), num_channels)  # i.e. the array is 1-D (no channel axis)
+
+        # test two examples side-by-side
+        outputs = audio_generator(["This is a test", "This is a second test"], forward_params=forward_params)
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+        # test batching, this time with greedy decoding and a freshly created pipeline
+        audio_generator = pipeline(task="text-to-audio", model="microsoft/speecht5_tts")
+        forward_params = {
+            "do_sample": False,
+            "max_new_tokens": 5,
+            "speaker_embeddings": torch.rand(1, 512) * 0.2 - 0.1,
+        }
+        outputs = audio_generator(
+            ["This is a test", "This is a second test"], forward_params=forward_params, batch_size=2
+        )
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+    @require_torch
+    def test_small_musicgen_pt(self):
+        music_generator = pipeline(
+            task="text-to-audio", model="facebook/musicgen-small", do_sample=False, max_new_tokens=5
+        )
+        num_channels = 1  # model generates mono audio
+
+        outputs = music_generator("This is a test")
+        self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 32000}, outputs)
+        self.assertEqual(len(outputs["audio"].shape), num_channels)
+
+        # test two examples side-by-side
+        outputs = music_generator(["This is a test", "This is a second test"])
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+        # test batching, this time with parameterization in the forward pass
+        music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small")
+        forward_params = {"do_sample": False, "max_new_tokens": 5}
+        outputs = music_generator(
+            ["This is a test", "This is a second test"], forward_params=forward_params, batch_size=2
+        )
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+    @slow
+    @require_torch
+    def test_medium_seamless_m4t_pt(self):
+        speech_generator = pipeline(task="text-to-audio", model="facebook/hf-seamless-m4t-medium", max_new_tokens=5)
+
+        for forward_params in [{"tgt_lang": "eng"}, {"return_intermediate_token_ids": True, "tgt_lang": "eng"}]:
+            outputs = speech_generator("This is a test", forward_params=forward_params)
+            self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 16000}, outputs)
+
+            # test two examples side-by-side
+            outputs = speech_generator(["This is a test", "This is a second test"], forward_params=forward_params)
+            audio = [output["audio"] for output in outputs]
+            self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+            # test batching
+            outputs = speech_generator(
+                ["This is a test", "This is a second test"], forward_params=forward_params, batch_size=2
+            )
+            audio = [output["audio"] for output in outputs]
+            self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+    @slow
+    @require_torch
+    def test_small_bark_pt(self):
+        speech_generator = pipeline(task="text-to-audio", model="suno/bark-small")
+        num_channels = 1  # model generates mono audio
+
+        forward_params = {
+            # Using `do_sample=False` to force deterministic output
+            "do_sample": False,
+            "semantic_max_new_tokens": 5,
+        }
+
+        outputs = speech_generator("This is a test", forward_params=forward_params)
+        self.assertEqual(
+            {"audio": ANY(np.ndarray), "sampling_rate": 24000},
+            outputs,
+        )
+        self.assertEqual(len(outputs["audio"].shape), num_channels)
+
+        # test two examples side-by-side
+        outputs = speech_generator(
+            ["This is a test", "This is a second test"],
+            forward_params=forward_params,
+        )
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+        # test other generation strategy (sampling, two semantic return sequences)
+        forward_params = {
+            "do_sample": True,
+            "semantic_max_new_tokens": 5,
+            "semantic_num_return_sequences": 2,
+        }
+
+        outputs = speech_generator("This is a test", forward_params=forward_params)
+        audio = outputs["audio"]
+        self.assertEqual(ANY(np.ndarray), audio)
+
+        # test using a voice preset, forwarded to the model as `history_prompt`
+        processor = AutoProcessor.from_pretrained("suno/bark-small")
+        temp_inp = processor("hey, how are you?", voice_preset="v2/en_speaker_5")
+        history_prompt = temp_inp["history_prompt"]
+        forward_params["history_prompt"] = history_prompt
+
+        outputs = speech_generator(
+            ["This is a test", "This is a second test"],
+            forward_params=forward_params,
+            batch_size=2,
+        )
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+
+    @slow
+    @require_torch_accelerator
+    def test_conversion_additional_tensor(self):
+        """Check that generation still succeeds on `torch_device` when `forward_params` carries a tensor."""
+        bark_pipe = pipeline(task="text-to-audio", model="suno/bark-small", device=torch_device)
+        bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
+
+        generation_params = {"do_sample": True, "semantic_max_new_tokens": 5}
+        # atm, must do to stay coherent with BarkProcessor
+        tokenizer_params = {
+            "max_length": 256,
+            "add_special_tokens": False,
+            "return_attention_mask": True,
+            "return_token_type_ids": False,
+        }
+
+        # first pass without a speaker embedding; it only has to run without raising
+        bark_pipe("This is a test", forward_params=generation_params, preprocess_params=tokenizer_params)
+
+        voice_inputs = bark_processor("hey, how are you?", voice_preset="v2/en_speaker_5")
+        generation_params["history_prompt"] = voice_inputs["history_prompt"]
+
+        # history_prompt is a torch.Tensor passed as a forward_param
+        # if generation is successful, it means that it was passed to the right device
+        result = bark_pipe("This is a test", forward_params=generation_params, preprocess_params=tokenizer_params)
+        self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 24000}, result)
+
+    @require_torch
+    def test_vits_model_pt(self):
+        """Smoke-test `facebook/mms-tts-eng` on a single prompt, a list of prompts and a batched call."""
+        vits_pipe = pipeline(task="text-to-audio", model="facebook/mms-tts-eng")
+        prompt_pair = ["This is a test", "This is a second test"]
+
+        result = vits_pipe("This is a test")
+        self.assertEqual(result["sampling_rate"], 16000)
+        self.assertEqual(ANY(np.ndarray), result["audio"])
+
+        # two prompts in a single call
+        results = vits_pipe(prompt_pair)
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], [item["audio"] for item in results])
+
+        # same prompts, this time batched
+        results = vits_pipe(prompt_pair, batch_size=2)
+        self.assertEqual(ANY(np.ndarray), results[0]["audio"])
+
+    @require_torch
+    def test_forward_model_kwargs(self):
+        """Check how kwargs are routed for a forward-only model.
+
+        `forward_params` must reach the model's forward pass, generation-only arguments must be rejected,
+        and passing `generate_kwargs` at all must raise.
+        """
+        # use vits - a forward model
+        speech_generator = pipeline(task="text-to-audio", model="kakao-enterprise/vits-vctk")
+        forward_params = {"speaker_id": 5}
+
+        # for reproducibility
+        set_seed(555)
+        audio = speech_generator("This is a test", forward_params=forward_params)["audio"]
+
+        with self.assertRaises(TypeError):
+            # assert error if generate parameter
+            speech_generator("This is a test", forward_params={"speaker_id": 5, "do_sample": True})
+
+        with self.assertRaises(ValueError):
+            # assert error if generate_kwargs with forward-only models
+            speech_generator("This is a test", forward_params=forward_params, generate_kwargs={"do_sample": True})
+
+        # Neither failing call returns anything, so a comparison against a value assigned inside the
+        # `assertRaises` blocks would only compare the first output with itself. Regenerate with the
+        # same seed so the check really verifies that the valid call is still reproducible.
+        set_seed(555)
+        outputs = speech_generator("This is a test", forward_params=forward_params)
+        self.assertTrue(np.abs(outputs["audio"] - audio).max() < 1e-5)
+
+    @require_torch
+    def test_generative_model_kwargs(self):
+        """Check that `generate_kwargs` take priority over clashing `forward_params` for a generative model."""
+        # use musicgen - a generative model
+        musicgen_pipe = pipeline(task="text-to-audio", model="facebook/musicgen-small")
+
+        # for reproducibility
+        set_seed(555)
+        sampled = musicgen_pipe("This is a test", forward_params={"do_sample": True, "max_new_tokens": 20})
+        reference_audio = sampled["audio"]
+        self.assertEqual(ANY(np.ndarray), reference_audio)
+
+        # make sure generate kwargs get priority over forward params:
+        # `do_sample=False` in forward_params is expected to be overridden by generate_kwargs
+        # for reproducibility
+        set_seed(555)
+        overridden = musicgen_pipe(
+            "This is a test",
+            forward_params={"do_sample": False, "max_new_tokens": 20},
+            generate_kwargs={"do_sample": True},
+        )
+        max_abs_diff = np.abs(overridden["audio"] - reference_audio).max()
+        self.assertTrue(max_abs_diff < 1e-5)
+
+    @slow
+    @require_torch
+    def test_csm_model_pt(self):
+        """Run `sesame/csm-1b` through the pipeline for single, list and batched inputs."""
+        csm_pipe = pipeline(task="text-to-audio", model="sesame/csm-1b", device=torch_device)
+        gen_kwargs = {"max_new_tokens": 10}
+        prompt_pair = ["This is a test", "This is a second test"]
+        expected_ndim = 1  # the test expects mono audio, i.e. a 1-D array
+
+        result = csm_pipe("This is a test", generate_kwargs=gen_kwargs)
+        self.assertEqual(result["sampling_rate"], 24000)
+        waveform = result["audio"]
+        self.assertEqual(ANY(np.ndarray), waveform)
+        # ensure audio and not discrete codes
+        self.assertEqual(waveform.ndim, expected_ndim)
+
+        # two prompts in a single call
+        results = csm_pipe(prompt_pair, generate_kwargs=gen_kwargs)
+        waveforms = [item["audio"] for item in results]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], waveforms)
+        self.assertEqual(waveforms[0].ndim, expected_ndim)
+
+        # same prompts, this time batched
+        batch_size = 2
+        results = csm_pipe(prompt_pair, generate_kwargs=gen_kwargs, batch_size=batch_size)
+        self.assertEqual(len(results), batch_size)
+        waveforms = [item["audio"] for item in results]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], waveforms)
+        self.assertEqual(waveforms[0].ndim, expected_ndim)
+
+    @slow
+    @require_torch
+    def test_dia_model(self):
+        """Run `nari-labs/Dia-1.6B-0626` through the pipeline for single, list and batched inputs."""
+        speech_generator = pipeline(task="text-to-audio", model="nari-labs/Dia-1.6B-0626", device=torch_device)
+        generate_kwargs = {"max_new_tokens": 20}
+        num_channels = 1  # model generates mono audio
+        texts = ["Dia is an open weights text to dialogue model.", "This is a second example."]
+
+        outputs = speech_generator(texts[0], generate_kwargs=generate_kwargs)
+        self.assertEqual(outputs["sampling_rate"], 44100)
+        audio = outputs["audio"]
+        self.assertEqual(ANY(np.ndarray), audio)
+        # ensure audio (with one channel) and not discrete codes
+        self.assertEqual(len(audio.shape), num_channels)
+
+        # test two examples side-by-side
+        outputs = speech_generator(texts, generate_kwargs=generate_kwargs)
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+        self.assertEqual(len(audio[0].shape), num_channels)
+
+        # test batching
+        batch_size = 2
+        # pass the variable (not a literal) so the call and the length check below cannot drift apart
+        outputs = speech_generator(texts, generate_kwargs=generate_kwargs, batch_size=batch_size)
+        self.assertEqual(len(outputs), batch_size)
+        audio = [output["audio"] for output in outputs]
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio)
+        self.assertEqual(len(outputs[0]["audio"].shape), num_channels)
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        """Build a `TextToAudioPipeline` around `model` and return it together with two sample prompts."""
+        # not all models in this pipeline can generate and, therefore, take `generate` kwargs
+        generation_kwargs = {"max_new_tokens": 5} if model.can_generate() else {}
+        model.config._attn_implementation = "eager"
+
+        sample_prompts = ["This is a test", "Another test"]
+        pipe = TextToAudioPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+            **generation_kwargs,
+        )
+        return pipe, sample_prompts
+
+    def run_pipeline_test(self, speech_generator, _):
+        """Call the pipeline on one prompt and on two prompts, expecting numpy audio each time."""
+        single = speech_generator("This is a test")
+        self.assertEqual(ANY(np.ndarray), single["audio"])
+
+        # sampling options are only passed to models that can generate
+        if speech_generator.model.can_generate():
+            forward_params = {"num_return_sequences": 2, "do_sample": True}
+        else:
+            forward_params = {}
+
+        results = speech_generator(["This is great !", "Something else"], forward_params=forward_params)
+        self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], [item["audio"] for item in results])
--- a/tests/pipelines/test_pipelines_token_classification.py
+++ b/tests/pipelines/test_pipelines_token_classification.py
--- a/tests/pipelines/test_pipelines_video_classification.py
+++ b/tests/pipelines/test_pipelines_video_classification.py
@@ -0,0 +1,124 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from huggingface_hub import VideoClassificationOutputElement, hf_hub_download
+
+from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEImageProcessor
+from transformers.pipelines import VideoClassificationPipeline, pipeline
+from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
+    is_pipeline_test,
+    nested_simplify,
+    require_av,
+    require_torch,
+    require_vision,
+)
+
+from .test_pipelines_common import ANY
+
+
+@is_pipeline_test
+@require_torch
+@require_vision
+@require_av
+class VideoClassificationPipelineTests(unittest.TestCase):
+    """Tests for the video-classification pipeline, run on a demo clip fetched from the Hub."""
+
+    model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING
+    # Local path of the demo clip; filled in lazily by `_load_dataset`.
+    example_video_filepath = None
+
+    @classmethod
+    def _load_dataset(cls):
+        """Download the demo video on first use and cache its local path on the class."""
+        # Lazy loading of the dataset. Because it is a class method, it will only be loaded once per pytest process.
+        if cls.example_video_filepath is None:
+            cls.example_video_filepath = hf_hub_download(
+                repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset"
+            )
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        """Build a `VideoClassificationPipeline` (with `top_k=2`) and return it with the example inputs."""
+        self._load_dataset()
+        video_classifier = VideoClassificationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+            top_k=2,
+        )
+        examples = [
+            self.example_video_filepath,
+            # TODO: re-enable this once we have a stable hub solution for CI
+            # "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4",
+        ]
+        return video_classifier, examples
+
+    def run_pipeline_test(self, video_classifier, examples):
+        """Check that every example yields two `{score, label}` entries matching the Hub output spec."""
+        # two entries are expected because the pipeline is built with `top_k=2`
+        expected_structure = [
+            {"score": ANY(float), "label": ANY(str)},
+            {"score": ANY(float), "label": ANY(str)},
+        ]
+        for example in examples:
+            outputs = video_classifier(example)
+
+            self.assertEqual(outputs, expected_structure)
+            for element in outputs:
+                compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
+
+    @require_torch
+    def test_small_model_pt(self):
+        """Pin the scores of the tiny VideoMAE checkpoint for a single clip and for a list of clips."""
+        small_model = "hf-internal-testing/tiny-random-VideoMAEForVideoClassification"
+        small_feature_extractor = VideoMAEImageProcessor(
+            size={"shortest_edge": 10}, crop_size={"height": 10, "width": 10}
+        )
+        video_classifier = pipeline(
+            "video-classification", model=small_model, feature_extractor=small_feature_extractor, frame_sampling_rate=4
+        )
+
+        # Reuse the class-level cached clip instead of fetching the same file a second time.
+        self._load_dataset()
+        video_file_path = self.example_video_filepath
+        expected = [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}]
+
+        output = video_classifier(video_file_path, top_k=2)
+        self.assertEqual(nested_simplify(output, decimals=4), expected)
+        for element in output:
+            compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
+
+        # a list of inputs returns one result list per input
+        outputs = video_classifier([video_file_path, video_file_path], top_k=2)
+        self.assertEqual(nested_simplify(outputs, decimals=4), [expected, expected])
+        for output in outputs:
+            for element in output:
+                compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
--- a/tests/pipelines/test_pipelines_zero_shot.py
+++ b/tests/pipelines/test_pipelines_zero_shot.py
@@ -0,0 +1,288 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import (
+    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+    Pipeline,
+    ZeroShotClassificationPipeline,
+    pipeline,
+)
+from transformers.testing_utils import (
+    is_pipeline_test,
+    is_torch_available,
+    nested_simplify,
+    require_torch,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_torch_available():
+    import torch
+
+
+# These 2 model types require different inputs than those of the usual text models.
+# Their config class names are filtered out of `model_mapping` in `ZeroShotClassificationPipelineTests`.
+_TO_SKIP = {"LayoutLMv2Config", "LayoutLMv3Config"}
+
+
+@is_pipeline_test
+class ZeroShotClassificationPipelineTests(unittest.TestCase):
+    """Tests for `ZeroShotClassificationPipeline`: call signatures, input validation and pinned scores."""
+
+    model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
+
+    # Filter out the config classes named in `_TO_SKIP`, unless the mapping exposes `is_dummy`.
+    if not hasattr(model_mapping, "is_dummy"):
+        model_mapping = {config: model for config, model in model_mapping.items() if config.__name__ not in _TO_SKIP}
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        """Build a `ZeroShotClassificationPipeline` around `model` and return it with two sample sentences."""
+        classifier = ZeroShotClassificationPipeline(
+            model=model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            image_processor=image_processor,
+            processor=processor,
+            dtype=dtype,
+            candidate_labels=["politics", "health"],
+        )
+        return classifier, ["Who are you voting for in 2020?", "My stomach hurts."]
+
+    def run_pipeline_test(self, classifier, _):
+        """Check the output structure for each accepted label format and the errors raised on invalid input."""
+        question = "Who are you voting for in 2020?"
+        one_label = {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]}
+        two_labels = {"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]}
+
+        outputs = classifier(question, candidate_labels="politics")
+        self.assertEqual(outputs, one_label)
+
+        # No kwarg
+        outputs = classifier(question, ["politics"])
+        self.assertEqual(outputs, one_label)
+
+        outputs = classifier(question, candidate_labels=["politics"])
+        self.assertEqual(outputs, one_label)
+
+        # comma-separated string of labels
+        outputs = classifier(question, candidate_labels="politics, public health")
+        self.assertEqual(outputs, two_labels)
+        self.assertAlmostEqual(sum(nested_simplify(outputs["scores"])), 1.0)
+
+        outputs = classifier(question, candidate_labels=["politics", "public health"])
+        self.assertEqual(outputs, two_labels)
+        self.assertAlmostEqual(sum(nested_simplify(outputs["scores"])), 1.0)
+
+        outputs = classifier(question, candidate_labels="politics", hypothesis_template="This text is about {}")
+        self.assertEqual(outputs, one_label)
+
+        # https://github.com/huggingface/transformers/issues/13846
+        outputs = classifier(["I am happy"], ["positive", "negative"])
+        self.assertEqual(outputs, [two_labels])
+        outputs = classifier(["I am happy", "I am sad"], ["positive", "negative"])
+        self.assertEqual(outputs, [two_labels, two_labels])
+
+        with self.assertRaises(ValueError):
+            classifier("", candidate_labels="politics")
+
+        with self.assertRaises(TypeError):
+            classifier(None, candidate_labels="politics")
+
+        with self.assertRaises(ValueError):
+            classifier(question, candidate_labels="")
+
+        with self.assertRaises(TypeError):
+            classifier(question, candidate_labels=None)
+
+        with self.assertRaises(ValueError):
+            classifier(
+                question,
+                candidate_labels="politics",
+                hypothesis_template="Not formatting template",
+            )
+
+        with self.assertRaises(AttributeError):
+            classifier(
+                question,
+                candidate_labels="politics",
+                hypothesis_template=None,
+            )
+
+        self.run_entailment_id(classifier)
+
+    def run_entailment_id(self, zero_shot_classifier: Pipeline):
+        """Check `entailment_id` against several `label2id` layouts, then restore the original mapping."""
+        config = zero_shot_classifier.model.config
+        original_label2id = config.label2id
+        original_entailment = zero_shot_classifier.entailment_id
+
+        # (label2id assigned to the config, entailment_id the pipeline is expected to report)
+        cases = [
+            ({"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}, -1),
+            ({"entailment": 0, "neutral": 1, "contradiction": 2}, 0),
+            ({"ENTAIL": 0, "NON-ENTAIL": 1}, 0),
+            ({"ENTAIL": 2, "NEUTRAL": 1, "CONTR": 0}, 2),
+        ]
+        try:
+            for label2id, expected_id in cases:
+                config.label2id = label2id
+                self.assertEqual(zero_shot_classifier.entailment_id, expected_id)
+        finally:
+            # Restore even when an assertion fails, so the mutated config cannot leak into later checks.
+            config.label2id = original_label2id
+        self.assertEqual(original_entailment, zero_shot_classifier.entailment_id)
+
+    @require_torch
+    def test_truncation(self):
+        """Run a very long input through the pipeline; the test passes if the call does not raise."""
+        zero_shot_classifier = pipeline(
+            "zero-shot-classification",
+            model="sshleifer/tiny-distilbert-base-cased-distilled-squad",
+        )
+        # There was a regression in 4.10 for this
+        # Adding a test so we don't make the mistake again.
+        # https://github.com/huggingface/transformers/issues/13381#issuecomment-912343499
+        zero_shot_classifier(
+            "Who are you voting for in 2020?" * 100, candidate_labels=["politics", "public health", "science"]
+        )
+
+    def _check_small_model(self, **pipeline_kwargs):
+        """Classify one sentence with the tiny DistilBERT checkpoint and expect three equal scores.
+
+        `pipeline_kwargs` are forwarded to `pipeline` (e.g. `dtype`).
+        """
+        zero_shot_classifier = pipeline(
+            "zero-shot-classification",
+            model="sshleifer/tiny-distilbert-base-cased-distilled-squad",
+            **pipeline_kwargs,
+        )
+        outputs = zero_shot_classifier(
+            "Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
+        )
+
+        self.assertEqual(
+            nested_simplify(outputs),
+            {
+                "sequence": "Who are you voting for in 2020?",
+                "labels": ["science", "public health", "politics"],
+                "scores": [0.333, 0.333, 0.333],
+            },
+        )
+
+    @require_torch
+    def test_small_model_pt(self):
+        """Pin the tiny-model output with the default dtype."""
+        self._check_small_model()
+
+    @require_torch
+    def test_small_model_pt_fp16(self):
+        """Pin the tiny-model output when the pipeline is created with `torch.float16`."""
+        self._check_small_model(dtype=torch.float16)
+
+    @require_torch
+    def test_small_model_pt_bf16(self):
+        """Pin the tiny-model output when the pipeline is created with `torch.bfloat16`."""
+        self._check_small_model(dtype=torch.bfloat16)
+
+    @slow
+    @require_torch
+    def test_large_model_pt(self):
+        """Pin `FacebookAI/roberta-large-mnli` scores for a short question and a long multi-label input."""
+        zero_shot_classifier = pipeline("zero-shot-classification", model="FacebookAI/roberta-large-mnli")
+        outputs = zero_shot_classifier(
+            "Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"]
+        )
+
+        self.assertEqual(
+            nested_simplify(outputs),
+            {
+                "sequence": "Who are you voting for in 2020?",
+                "labels": ["politics", "public health", "science"],
+                "scores": [0.976, 0.015, 0.009],
+            },
+        )
+
+        # Defined once: the pipeline is expected to echo the input back unchanged under "sequence".
+        abstract = (
+            "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks"
+            " in an encoder-decoder configuration. The best performing models also connect the encoder and decoder"
+            " through an attention mechanism. We propose a new simple network architecture, the Transformer, based"
+            " solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two"
+            " machine translation tasks show these models to be superior in quality while being more parallelizable"
+            " and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014"
+            " English-to-German translation task, improving over the existing best results, including ensembles by"
+            " over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new"
+            " single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small"
+            " fraction of the training costs of the best models from the literature. We show that the Transformer"
+            " generalizes well to other tasks by applying it successfully to English constituency parsing both with"
+            " large and limited training data."
+        )
+        outputs = zero_shot_classifier(
+            abstract,
+            candidate_labels=["machine learning", "statistics", "translation", "vision"],
+            multi_label=True,
+        )
+        self.assertEqual(
+            nested_simplify(outputs),
+            {
+                "sequence": abstract,
+                "labels": ["translation", "machine learning", "vision", "statistics"],
+                "scores": [0.817, 0.713, 0.018, 0.018],
+            },
+        )
--- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py
+++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py
@@ -0,0 +1,92 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from datasets import load_dataset
+
+from transformers.pipelines import pipeline
+from transformers.testing_utils import is_pipeline_test, nested_simplify, require_torch, slow
+
+
+@is_pipeline_test
+@require_torch
+class ZeroShotAudioClassificationPipelineTests(unittest.TestCase):
+    """Zero-shot audio classification pipeline tests run against CLAP checkpoints."""
+
+    # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping,
+    # and only CLAP would be there for now.
+    # model_mapping = {CLAPConfig: CLAPModel}
+
+    @require_torch
+    def test_small_model_pt(self, dtype="float32"):
+        """Pin the scores of the tiny CLAP checkpoint on the dog example; `dtype` is forwarded to `pipeline`."""
+        labels = ["Sound of a dog", "Sound of vaccum cleaner"]
+        clap_pipe = pipeline(
+            task="zero-shot-audio-classification",
+            model="hf-internal-testing/tiny-clap-htsat-unfused",
+            dtype=dtype,
+        )
+        samples = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+        waveform = samples["train"]["audio"][-1]["array"]
+
+        scores = clap_pipe(waveform, candidate_labels=labels)
+        expected = [{"score": 0.501, "label": "Sound of a dog"}, {"score": 0.499, "label": "Sound of vaccum cleaner"}]
+        self.assertEqual(nested_simplify(scores), expected)
+
+    @require_torch
+    def test_small_model_pt_fp16(self):
+        """Repeat the small-model check with `dtype="float16"`."""
+        self.test_small_model_pt(dtype="float16")
+
+    @slow
+    @require_torch
+    def test_large_model_pt(self):
+        """Pin the scores of `laion/clap-htsat-unfused` for one clip, five clips, and five clips batched."""
+        clap_pipe = pipeline(
+            task="zero-shot-audio-classification",
+            model="laion/clap-htsat-unfused",
+        )
+        # This is an audio of a dog
+        samples = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+        waveform = samples["train"]["audio"][-1]["array"]
+        labels = ["Sound of a dog", "Sound of vaccum cleaner"]
+        expected_single = [
+            {"score": 1.0, "label": "Sound of a dog"},
+            {"score": 0.0, "label": "Sound of vaccum cleaner"},
+        ]
+        expected_repeated = [expected_single] * 5
+
+        single_scores = clap_pipe(waveform, candidate_labels=labels)
+        self.assertEqual(nested_simplify(single_scores), expected_single)
+
+        # five copies of the same clip, one result list per clip
+        list_scores = clap_pipe([waveform] * 5, candidate_labels=labels)
+        self.assertEqual(nested_simplify(list_scores), expected_repeated)
+
+        # same five clips with an explicit batch size
+        batched_scores = clap_pipe([waveform] * 5, candidate_labels=labels, batch_size=5)
+        self.assertEqual(nested_simplify(batched_scores), expected_repeated)
--- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py
+++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py
@@ -0,0 +1,248 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from huggingface_hub import ZeroShotImageClassificationOutputElement
+
+from transformers import is_vision_available
+from transformers.pipelines import pipeline
+from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
+    is_pipeline_test,
+    nested_simplify,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    # Stand-in defined when `is_vision_available()` is false, so the `Image.open` name still
+    # exists in this module; it accepts any arguments and returns None.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_vision
+class ZeroShotImageClassificationPipelineTests(unittest.TestCase):
+    # Deactivating auto tests since we don't have a good MODEL_FOR_XX mapping,
+    # and only CLIP would be there for now.
+    # model_mapping = {CLIPConfig: CLIPModel}
+
+    # def get_test_pipeline(self, model, tokenizer, processor):
+    #     if tokenizer is None:
+    #         # Side effect of no Fast Tokenizer class for these model, so skipping
+    #         # But the slow tokenizer test should still run as they're quite small
+    #         self.skipTest(reason="No tokenizer available")
+    #         return
+    #         # return None, None
+
+    #     image_classifier = ZeroShotImageClassificationPipeline(
+    #         model=model, tokenizer=tokenizer, feature_extractor=processor
+    #     )
+
+    #     # test with two copies of the COCO sample image
+    #     image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    #     image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    #     return image_classifier, [image, image2]
+
+    # def run_pipeline_test(self, pipe, examples):
+    #     image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    #     outputs = pipe(image, candidate_labels=["A", "B"])
+    #     self.assertEqual(outputs, {"text": ANY(str)})
+
+    #     # Batching
+    #     outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"])
+
+    @require_torch
+    def test_small_model_pt(self, dtype="float32"):
+        # Smoke test on a tiny random CLIP checkpoint. `dtype` is forwarded to `pipeline`, which lets
+        # `test_small_model_pt_fp16` reuse this body with "float16".
+        def ranking(*labels):
+            # Expected result for one image: each label, in the given order, with score 0.333.
+            return [{"score": 0.333, "label": label} for label in labels]
+
+        classifier = pipeline(
+            model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", dtype=dtype
+        )
+        picture = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+
+        single = classifier(picture, candidate_labels=["a", "b", "c"])
+        # The floating scores are so close, we enter floating error approximation and the order is not guaranteed across
+        # python and torch versions.
+        accepted_orders = [
+            ranking("a", "b", "c"),
+            ranking("a", "c", "b"),
+            ranking("b", "a", "c"),
+        ]
+        self.assertIn(nested_simplify(single), accepted_orders)
+
+        batched = classifier([picture] * 5, candidate_labels=["A", "B", "C"], batch_size=2)
+        # Pipeline outputs are supposed to be deterministic, so in theory the labels below could be
+        # the real values "A", "B", "C" instead of ANY(str). In this particular case, however, the
+        # scores are so close that floating-point error decides the order, and that order is not
+        # guaranteed anymore with batching.
+        expected_batched = [ranking(ANY(str), ANY(str), ANY(str)) for _ in range(5)]
+        self.assertEqual(nested_simplify(batched), expected_batched)
+
+        # Each element is also checked against ZeroShotImageClassificationOutputElement.
+        for per_image in batched:
+            for entry in per_image:
+                compare_pipeline_output_to_hub_spec(entry, ZeroShotImageClassificationOutputElement)
+
+    @require_torch
+    def test_small_model_pt_fp16(self):
+        # Re-run the body of `test_small_model_pt` with dtype="float16" instead of its "float32" default.
+        self.test_small_model_pt(dtype="float16")
+
+    @slow
+    @require_torch
+    def test_large_model_pt(self):
+        # Slow check against "openai/clip-vit-base-patch32": the same picture is scored alone and
+        # as five copies with batch_size=2; both calls must give the same per-image result.
+        labels = ("cat", "plane", "remote")
+        expected_single = [
+            {"score": 0.511, "label": "remote"},
+            {"score": 0.485, "label": "cat"},
+            {"score": 0.004, "label": "plane"},
+        ]
+
+        classifier = pipeline(
+            task="zero-shot-image-classification",
+            model="openai/clip-vit-base-patch32",
+        )
+        # This is an image of 2 cats with remotes and no planes
+        picture = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+
+        # A new label list is built for every call, as separate literals would be.
+        single = classifier(picture, candidate_labels=list(labels))
+        self.assertEqual(nested_simplify(single), expected_single)
+
+        batched = classifier([picture] * 5, candidate_labels=list(labels), batch_size=2)
+        self.assertEqual(nested_simplify(batched), [expected_single] * 5)
+
+    @slow
+    @require_torch
+    def test_siglip_model_pt(self):
+        # Slow check against "google/siglip-base-patch16-224": the same picture is scored alone
+        # and as five copies with batch_size=2; both calls must give the same per-image result.
+        labels = ("2 cats", "a plane", "a remote")
+        expected_single = [
+            {"score": 0.198, "label": "2 cats"},
+            {"score": 0.0, "label": "a remote"},
+            {"score": 0.0, "label": "a plane"},
+        ]
+
+        classifier = pipeline(
+            task="zero-shot-image-classification",
+            model="google/siglip-base-patch16-224",
+        )
+        # This is an image of 2 cats with remotes and no planes
+        picture = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+
+        # A new label list is built for every call, as separate literals would be.
+        single = classifier(picture, candidate_labels=list(labels))
+        self.assertEqual(nested_simplify(single), expected_single)
+
+        batched = classifier([picture] * 5, candidate_labels=list(labels), batch_size=2)
+        self.assertEqual(nested_simplify(batched), [expected_single] * 5)
+
+    @slow
+    @require_torch
+    def test_blip2_model_pt(self):
+        # Slow check against "Salesforce/blip2-itm-vit-g": the same picture is scored alone and as
+        # five copies with batch_size=2; both calls must give the same per-image result. Every
+        # call passes tokenizer_kwargs={"return_token_type_ids": False}.
+        labels = ("2 cats", "a plane", "a remote")
+        expected_single = [
+            {"score": 0.369, "label": "2 cats"},
+            {"score": 0.333, "label": "a remote"},
+            {"score": 0.297, "label": "a plane"},
+        ]
+
+        classifier = pipeline(
+            task="zero-shot-image-classification",
+            model="Salesforce/blip2-itm-vit-g",
+        )
+        # This is an image of 2 cats with remotes and no planes
+        picture = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+
+        # New label list and tokenizer_kwargs dict for every call, as separate literals would be.
+        single = classifier(
+            picture,
+            candidate_labels=list(labels),
+            tokenizer_kwargs={"return_token_type_ids": False},
+        )
+        self.assertEqual(nested_simplify(single), expected_single)
+
+        batched = classifier(
+            [picture] * 5,
+            candidate_labels=list(labels),
+            batch_size=2,
+            tokenizer_kwargs={"return_token_type_ids": False},
+        )
+        self.assertEqual(nested_simplify(batched), [expected_single] * 5)
--- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py
+++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py
@@ -0,0 +1,235 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import (
+    MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
+    ZeroShotObjectDetectionPipeline,
+    is_vision_available,
+    pipeline,
+)
+from transformers.testing_utils import (
+    is_pipeline_test,
+    nested_simplify,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    # Stand-in defined when `is_vision_available()` is false, so the `Image.open` name still
+    # exists in this module; it accepts any arguments and returns None.
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_vision
+@require_torch
+class ZeroShotObjectDetectionPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING
+
+    def get_test_pipeline(
+        self,
+        model,
+        tokenizer=None,
+        image_processor=None,
+        feature_extractor=None,
+        processor=None,
+        dtype="float32",
+    ):
+        # Returns the pipeline under test together with a one-element list of sample inputs.
+        # `feature_extractor` is accepted but not forwarded to the pipeline.
+        pipeline_kwargs = {
+            "model": model,
+            "processor": processor,
+            "tokenizer": tokenizer,
+            "image_processor": image_processor,
+            "dtype": dtype,
+        }
+        sample = {
+            "image": "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            "candidate_labels": ["cat", "remote", "couch"],
+        }
+        return ZeroShotObjectDetectionPipeline(**pipeline_kwargs), [sample]
+
+    def run_pipeline_test(self, object_detector, examples):
+        # Runs the detector on the first example with threshold=0.0 and checks only the shape of
+        # the result: at least one detection, each with a float score, a str label and a box of
+        # integer xmin/ymin/xmax/ymax values.
+        sample = examples[0]
+        detections = object_detector(sample.get("image"), sample.get("candidate_labels"), threshold=0.0)
+
+        count = len(detections)
+        self.assertGreater(count, 0)
+
+        expected = [
+            {
+                "score": ANY(float),
+                "label": ANY(str),
+                "box": {"xmin": ANY(int), "ymin": ANY(int), "xmax": ANY(int), "ymax": ANY(int)},
+            }
+            for _ in range(count)
+        ]
+        self.assertEqual(detections, expected)
+
+    @require_torch
+    def test_small_model_pt(self):
+        # Runs the tiny random OWL-ViT checkpoint on the COCO sample with threshold=0.64, once
+        # with the image and candidate_labels passed directly and once as a one-element list of
+        # dicts. The list call is expected to return the same detections wrapped in an outer list.
+        image_path = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        expected = [
+            {"score": 0.7235, "label": "cat", "box": {"xmin": 204, "ymin": 167, "xmax": 232, "ymax": 190}},
+            {"score": 0.7218, "label": "remote", "box": {"xmin": 204, "ymin": 167, "xmax": 232, "ymax": 190}},
+            {"score": 0.7184, "label": "couch", "box": {"xmin": 204, "ymin": 167, "xmax": 232, "ymax": 190}},
+            {"score": 0.6748, "label": "remote", "box": {"xmin": 571, "ymin": 83, "xmax": 598, "ymax": 103}},
+            {"score": 0.6656, "label": "cat", "box": {"xmin": 571, "ymin": 83, "xmax": 598, "ymax": 103}},
+            {"score": 0.6614, "label": "couch", "box": {"xmin": 571, "ymin": 83, "xmax": 598, "ymax": 103}},
+            {"score": 0.6456, "label": "remote", "box": {"xmin": 494, "ymin": 105, "xmax": 521, "ymax": 127}},
+            {"score": 0.642, "label": "remote", "box": {"xmin": 67, "ymin": 274, "xmax": 93, "ymax": 297}},
+            {"score": 0.6419, "label": "cat", "box": {"xmin": 494, "ymin": 105, "xmax": 521, "ymax": 127}},
+        ]
+
+        detector = pipeline(
+            "zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection"
+        )
+
+        direct = detector(
+            image_path,
+            candidate_labels=["cat", "remote", "couch"],
+            threshold=0.64,
+        )
+        self.assertEqual(nested_simplify(direct, decimals=4), expected)
+
+        wrapped = detector(
+            [{"image": image_path, "candidate_labels": ["cat", "remote", "couch"]}],
+            threshold=0.64,
+        )
+        self.assertEqual(nested_simplify(wrapped, decimals=4), [expected])
+
+    @require_torch
+    @slow
+    def test_large_model_pt(self):
+        # Slow check with no `model` argument, so `pipeline` picks the checkpoint for the task.
+        # The COCO image is fetched by URL and run once on its own and once as a list of two
+        # identical requests; each request is expected to yield the same five detections.
+        image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        expected = [
+            {"score": 0.2868, "label": "cat", "box": {"xmin": 324, "ymin": 20, "xmax": 640, "ymax": 373}},
+            {"score": 0.277, "label": "remote", "box": {"xmin": 40, "ymin": 72, "xmax": 177, "ymax": 115}},
+            {"score": 0.2537, "label": "cat", "box": {"xmin": 1, "ymin": 55, "xmax": 315, "ymax": 472}},
+            {"score": 0.1474, "label": "remote", "box": {"xmin": 335, "ymin": 74, "xmax": 371, "ymax": 187}},
+            {"score": 0.1208, "label": "couch", "box": {"xmin": 4, "ymin": 0, "xmax": 642, "ymax": 476}},
+        ]
+
+        detector = pipeline("zero-shot-object-detection")
+
+        direct = detector(image_url, candidate_labels=["cat", "remote", "couch"])
+        self.assertEqual(nested_simplify(direct, decimals=4), expected)
+
+        # Two separate request dicts, each with its own label list.
+        requests = [{"image": image_url, "candidate_labels": ["cat", "remote", "couch"]} for _ in range(2)]
+        listed = detector(requests)
+        self.assertEqual(nested_simplify(listed, decimals=4), [expected, expected])
+
+    @require_torch
+    @slow
+    def test_threshold(self):
+        # Slow check with no `model` argument, so `pipeline` picks the checkpoint for the task.
+        # With threshold=0.2 the three expected detections all score above 0.25.
+        expected = [
+            {"score": 0.2868, "label": "cat", "box": {"xmin": 324, "ymin": 20, "xmax": 640, "ymax": 373}},
+            {"score": 0.277, "label": "remote", "box": {"xmin": 40, "ymin": 72, "xmax": 177, "ymax": 115}},
+            {"score": 0.2537, "label": "cat", "box": {"xmin": 1, "ymin": 55, "xmax": 315, "ymax": 472}},
+        ]
+
+        detector = pipeline("zero-shot-object-detection")
+        detections = detector(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            candidate_labels=["cat", "remote", "couch"],
+            threshold=0.2,
+        )
+
+        self.assertEqual(nested_simplify(detections, decimals=4), expected)
+
+    @require_torch
+    @slow
+    def test_top_k(self):
+        # Slow check with no `model` argument, so `pipeline` picks the checkpoint for the task.
+        # With top_k=2 exactly two detections are expected.
+        expected = [
+            {"score": 0.2868, "label": "cat", "box": {"xmin": 324, "ymin": 20, "xmax": 640, "ymax": 373}},
+            {"score": 0.277, "label": "remote", "box": {"xmin": 40, "ymin": 72, "xmax": 177, "ymax": 115}},
+        ]
+
+        detector = pipeline("zero-shot-object-detection")
+        detections = detector(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            candidate_labels=["cat", "remote", "couch"],
+            top_k=2,
+        )
+
+        self.assertEqual(nested_simplify(detections, decimals=4), expected)