first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/alm_tester.py
+++ b/tests/alm_tester.py
@@ -0,0 +1,250 @@
+# Copyright 2026 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import random
+from inspect import signature
+from unittest.mock import patch
+
+from .multimodal_tester import MultiModalModelTest, MultiModalModelTester
+from .test_modeling_common import (
+    floats_tensor,
+    ids_tensor,
+    is_torch_available,
+    torch_device,
+)
+
+
+if is_torch_available():
+    import torch
+
+
+class ALMModelTester(MultiModalModelTester):
+    audio_config_class = None
+    audio_config_key = "audio_config"
+    # Name under which the audio mask is passed to the model's forward (e.g. "feature_attention_mask"
+    # for Qwen2Audio). Leave as `None` if the model does not consume a separate audio-level mask;
+    # `_prepare_modality_inputs` then skips adding it to the inputs dict.
+    audio_mask_key = None
+    _required_attributes = MultiModalModelTester._required_attributes + ("audio_config_class",)
+
+    @property
+    def pipeline_model_mapping(self):
+        # TODO: @eustlb, we don't have pipeline testing for audio-text-to-text
+        mapping = {
+            "feature-extraction": self.base_model_class,
+            # "audio-text-to-text": self.conditional_generation_class,
+        }
+        # TODO: should we add automatic-speech-recognition with a special flag?
+        return mapping
+
+    def __init__(self, parent, **kwargs):
+        # Overrides of _TEXT_MODEL_TESTER_DEFAULTS
+        kwargs.setdefault("seq_length", 32)
+        kwargs.setdefault("pad_token_id", 1)
+
+        # ALM-specific defaults
+        kwargs.setdefault("feat_seq_length", 128)
+        kwargs.setdefault("num_mel_bins", 80)
+        kwargs.setdefault("audio_token_id", 0)
+
+        super().__init__(parent, **kwargs)
+
+    # -- Overridable ALM-specific hooks ------------------------------------------------------
+
+    def create_audio_features(self):
+        """Create audio feature tensor. Override for different shapes (e.g. [B, T, features])."""
+        return floats_tensor([self.batch_size, self.num_mel_bins, self.feat_seq_length])
+
+    def get_audio_embeds_mask(self, audio_embeds_mask):
+        """Get audio embeds mask from audio mask. Override for different shapes."""
+        raise NotImplementedError("This method should be overridden in the subclass")
+
+    def place_audio_tokens(self, input_ids, config, num_audio_tokens):
+        """Place audio placeholder tokens contiguously after BOS. Override for different placement.
+
+        Deterministic placement (position 0 reserved for BOS; audio tokens at [1:1+n]) keeps
+        the tail of each sequence text-only, which downstream tests (e.g. resize_token_embeddings
+        overwriting column -2) rely on.
+        """
+        input_ids = input_ids.clone()
+        input_ids[input_ids == self.audio_token_id] = self.pad_token_id
+        for i in range(input_ids.shape[0]):
+            n = num_audio_tokens[i].item() if isinstance(num_audio_tokens, torch.Tensor) else num_audio_tokens
+            if 1 + int(n) > self.seq_length:
+                raise ValueError(
+                    f"Cannot place {int(n)} audio tokens after BOS in a sequence of length {self.seq_length}. "
+                    "This likely indicates a mismatch between your feature extraction/configuration and your sequence length. "
+                    "Please ensure `seq_length` is >= the number of audio embedding positions + 1."
+                )
+            input_ids[i, 1 : 1 + int(n)] = self.audio_token_id
+        return input_ids
+
+    def get_audio_feature_key(self):
+        """Key name for audio features in the inputs dict."""
+        return "input_features"
+
+    def create_audio_mask(self):
+        """Create audio-level attention mask with contiguous valid regions per batch element.
+
+        Each element gets a random offset and length, producing masks like [0, 0, 1, 1, 1, 0, 0].
+        At least one batch index is pinned to a full-length mask.
+        """
+        # Use a locally-seeded RNG so repeated calls within a test produce the same mask
+        rng = random.Random(0)
+        # Sample lengths in [1, feat_seq_length] and offsets in [0, feat_seq_length - length]
+        lengths = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length, rng=rng).abs() + 1
+        lengths = lengths.clamp(max=self.feat_seq_length)
+
+        # Presuming feat_seq_length is set correctly, ensure at least one batch has a full-length mask for valid audio tokens
+        lengths[rng.randint(0, self.batch_size - 1)] = self.feat_seq_length
+        offsets = ids_tensor([self.batch_size], vocab_size=self.feat_seq_length, rng=rng).abs()
+        offsets = offsets % (self.feat_seq_length - lengths + 1)
+
+        positions = torch.arange(self.feat_seq_length, device=torch_device)[None, :]
+        audio_mask = ((positions >= offsets[:, None]) & (positions < offsets[:, None] + lengths[:, None])).long()
+        return audio_mask
+
+    # -- Hooks consumed by the shared base ---------------------------------------------------
+
+    @property
+    def _special_token_ids(self):
+        return super()._special_token_ids | {self.audio_token_id}
+
+    def _build_modality_sub_configs(self):
+        return {self.audio_config_key: self.get_audio_config()}
+
+    def _prepare_modality_inputs(self, input_ids, config):
+        audio_features = self.create_audio_features()
+        audio_mask = self.create_audio_mask()
+        audio_embeds_mask = self.get_audio_embeds_mask(audio_mask)
+        num_audio_tokens = audio_embeds_mask.sum(dim=1)
+        input_ids = self.place_audio_tokens(input_ids, config, num_audio_tokens)
+
+        modality_inputs = {self.get_audio_feature_key(): audio_features}
+        if self.audio_mask_key is not None:
+            modality_inputs[self.audio_mask_key] = audio_mask
+        return input_ids, modality_inputs
+
+    # -- Audio sub-config construction -------------------------------------------------------
+
+    @property
+    def audio_config_args(self):
+        return list(signature(self.audio_config_class.__init__).parameters.keys())
+
+    def get_audio_config(self):
+        kwargs = self._collect_kwargs(self.audio_config_args, self.audio_config_class)
+        return self.audio_config_class(**kwargs)
+
+
+class ALMModelTest(MultiModalModelTest):
+    """
+    Base test class for Audio-Language Models.
+
+    Subclasses should set:
+    - `model_tester_class`: The tester class (subclass of ALMModelTester)
+
+    Optional:
+    - `all_model_classes`: Override if not using default from model_tester
+    - `pipeline_model_mapping`: Override if not using default from model_tester
+    """
+
+    def test_sdpa_can_dispatch_on_flash(self):
+        # `test_sdpa_can_dispatch_on_flash` already pops the attention mask, but we cannot simply pop the
+        # audio mask here since it will raise an error in `get_audio_features` (cf. `test_mismatching_num_audio_tokens`).
+        # Therefore we substitute a full-ones mask instead.
+        def full_ones_mask():
+            return torch.ones(
+                [self.model_tester.batch_size, self.model_tester.feat_seq_length],
+                dtype=torch.bool,
+                device=torch_device,
+            )
+
+        with patch.object(self.model_tester, "create_audio_mask", new=full_ones_mask):
+            super().test_sdpa_can_dispatch_on_flash()
+
+    def test_mismatching_num_audio_tokens(self):
+        """
+        Tests that ALMs throw an error with explicit message saying what is wrong
+        when number of audios don't match number of audio tokens in the text.
+        Also we need to test multi-audio cases when one prompt has multiple audio tokens.
+        """
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        audio_feature_key = self.model_tester.get_audio_feature_key()
+        audio_mask_key = self.model_tester.audio_mask_key
+
+        # Pick the batch index `create_audio_mask` pinned to full length — guaranteed to
+        # contribute > 0 audio tokens even for encoders that aggressively downsample
+        # (e.g. GlmAsr), so duplicating it in Test 2 reliably moves the audio-token total.
+        audio_token_id = self.model_tester.audio_token_id
+        dup_idx = int((input_dict["input_ids"] == audio_token_id).sum(-1).argmax().item())
+
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            model.eval()
+            curr_input_dict = copy.deepcopy(input_dict)
+            _ = model(**curr_input_dict)  # successful forward with no modifications
+
+            # Test 1: remove one audio but leave the audio tokens in the text
+            curr_input_dict[audio_feature_key] = curr_input_dict[audio_feature_key][-1:, ...]
+            if audio_mask_key is not None:
+                curr_input_dict[audio_mask_key] = curr_input_dict[audio_mask_key][-1:, ...]
+            with self.assertRaises(ValueError):
+                _ = model(**curr_input_dict)
+
+            # Test 2: add one audio but leave the audio tokens in the text
+            curr_input_dict = copy.deepcopy(input_dict)
+            curr_input_dict[audio_feature_key] = torch.cat(
+                [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key][dup_idx : dup_idx + 1, ...]],
+                dim=0,
+            )
+            if audio_mask_key is not None:
+                curr_input_dict[audio_mask_key] = torch.cat(
+                    [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key][dup_idx : dup_idx + 1, ...]],
+                    dim=0,
+                )
+            with self.assertRaises(ValueError):
+                _ = model(**curr_input_dict)
+
+            # Test 3: duplicate the text along the seq dim so each prompt has twice as many
+            # audio tokens, while leaving the audio features unchanged -> mismatch
+            curr_input_dict = copy.deepcopy(input_dict)
+            curr_input_dict["input_ids"] = torch.cat(
+                [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1
+            )
+            curr_input_dict["attention_mask"] = torch.cat(
+                [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1
+            )
+            with self.assertRaises(ValueError):
+                _ = model(**curr_input_dict)
+
+            # Test 4: multi-audio valid case. A prompt may contain multiple audio segments;
+            # all audio segments are concatenated along the batch dim on the audio side.
+            # Duplicating input_ids along seq dim (-> [audios, audios] per prompt) and the
+            # audio features along batch dim (-> batch_size * 2) must forward successfully.
+            curr_input_dict = copy.deepcopy(input_dict)
+            curr_input_dict["input_ids"] = torch.cat(
+                [curr_input_dict["input_ids"], curr_input_dict["input_ids"]], dim=1
+            )
+            curr_input_dict["attention_mask"] = torch.cat(
+                [curr_input_dict["attention_mask"], curr_input_dict["attention_mask"]], dim=1
+            )
+            curr_input_dict[audio_feature_key] = torch.cat(
+                [curr_input_dict[audio_feature_key], curr_input_dict[audio_feature_key]], dim=0
+            )
+            if audio_mask_key is not None:
+                curr_input_dict[audio_mask_key] = torch.cat(
+                    [curr_input_dict[audio_mask_key], curr_input_dict[audio_mask_key]], dim=0
+                )
+            _ = model(**curr_input_dict)
--- a/tests/causal_lm_tester.py
+++ b/tests/causal_lm_tester.py
@@ -0,0 +1,636 @@
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+from inspect import signature
+
+import pytest
+from parameterized import parameterized
+
+from transformers import AutoModelForCausalLM, PreTrainedConfig, set_seed
+from transformers.models.auto.auto_factory import getattribute_from_module
+from transformers.testing_utils import (
+    _COMMON_MODEL_NAMES_MAP,
+    _TEXT_MODEL_TESTER_DEFAULTS,
+    is_flaky,
+    require_flash_attn,
+    require_torch_accelerator,
+    slow,
+)
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_common import (
+    GenerationTesterMixin,
+    ModelTesterMixin,
+    ids_tensor,
+    is_torch_available,
+    require_torch,
+    torch_device,
+)
+from .test_pipeline_mixin import PipelineTesterMixin
+from .test_tensor_parallel_mixin import TensorParallelTesterMixin
+from .test_training_mixin import TrainingTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+
+class CausalLMModelTester:
+    # If the model follows the standard naming conventions, only `base_model_class` needs to be set (the others are
+    # inferred from available public classes).
+    base_model_class = None
+    # ⚠️ Don't set these unless the model does NOT follow the standard naming conventions ⚠️
+    config_class = None
+    causal_lm_class = None
+    question_answering_class = None
+    sequence_classification_class = None
+    token_classification_class = None
+    # These attributes are required after the initialization phase of the tester.
+    _required_attributes = ("base_model_class", "config_class", "causal_lm_class")
+
+    # Arguments that should be passed to the config class even if not in its signature
+    forced_config_args = ["pad_token_id"]
+
+    @classmethod
+    def _verify_and_infer_model_attributes(cls):
+        """
+        Verifies that the required tester attributes are set correctly, and infers unset tester attributes.
+        Intentionally nitpicks the tester class attributes, to prevent human errors.
+        """
+        # `base_model_class` is mandatory, and it must be a valid model class.
+        base_model_class = getattr(cls, "base_model_class")
+        if base_model_class is None or "PreTrainedModel" not in str(base_model_class.__mro__):
+            raise ValueError(
+                f"You have inherited from `CausalLMModelTester` but did not set the `base_model_class` "
+                f"attribute to a valid model class. (It's set to `{base_model_class}`)"
+            )
+
+        # Infers other model classes from the base class name and available public classes, if the corresponding
+        # attributes are not set explicitly. If they are set, they must be set to a valid class (config or model).
+        model_name = base_model_class.__name__.replace("Model", "")
+        base_class_module = ".".join(base_model_class.__module__.split(".")[:-1])
+        for tester_attribute_name, model_class_termination in _COMMON_MODEL_NAMES_MAP.items():
+            if getattr(cls, tester_attribute_name) is None:
+                try:
+                    model_class = getattribute_from_module(base_class_module, model_name + model_class_termination)
+                    setattr(cls, tester_attribute_name, model_class)
+                except ValueError:
+                    pass
+            else:
+                if tester_attribute_name == "config_class":
+                    if "PreTrainedConfig" not in str(getattr(cls, tester_attribute_name).__mro__):
+                        raise ValueError(
+                            f"You have inherited from `CausalLMModelTester` but did not set the "
+                            f"`{tester_attribute_name}` attribute to a valid config class. (It's set to "
+                            f"`{getattr(cls, tester_attribute_name)}`). If the config class follows a standard "
+                            f"naming convention, you should unset `{tester_attribute_name}`."
+                        )
+                else:
+                    if "PreTrainedModel" not in str(getattr(cls, tester_attribute_name).__mro__):
+                        raise ValueError(
+                            f"You have inherited from `CausalLMModelTester` but did not set the "
+                            f"`{tester_attribute_name}` attribute to a valid model class. (It's set to "
+                            f"`{getattr(cls, tester_attribute_name)}`). If the model class follows a standard "
+                            f"naming convention, you should unset `{tester_attribute_name}`."
+                        )
+
+        # After inferring, if we don't have the basic classes set, we raise an error.
+        for required_attribute in cls._required_attributes:
+            if getattr(cls, required_attribute) is None:
+                raise ValueError(
+                    f"You have inherited from `CausalLMModelTester` but did not set the `{required_attribute}` "
+                    "attribute. It can't be automatically inferred either -- this means it is not following a "
+                    "standard naming convention. If this is intentional, please set the attribute explicitly."
+                )
+
+        # To prevent issues with typos, no other attributes can be set to a model class
+        for instance_attribute_name, instance_attribute in cls.__dict__.items():
+            if (
+                (
+                    instance_attribute_name not in _COMMON_MODEL_NAMES_MAP
+                    and instance_attribute_name != "base_model_class"
+                )
+                and isinstance(instance_attribute, type)
+                and "PreTrainedModel" in str(instance_attribute.__mro__)
+            ):
+                raise ValueError(
+                    f"You have inherited from `CausalLMModelTester` but set an unexpected attribute to a model class "
+                    f"(`{instance_attribute_name}` is set to `{instance_attribute}`). "
+                    f"Only the following attributes can be set to model classes: {_COMMON_MODEL_NAMES_MAP.keys()}."
+                )
+
+    @property
+    def all_model_classes(self):
+        # Models that set `all_model_classes` in their `XXXModelTest` class must have a new class that doesn't fit
+        # any of the common classes.
+        return [
+            model_class
+            for model_class in (
+                self.base_model_class,
+                self.causal_lm_class,
+                self.question_answering_class,
+                self.sequence_classification_class,
+                self.token_classification_class,
+            )
+            if model_class is not None
+        ]
+
+    @property
+    def pipeline_model_mapping(self):
+        # This is the default pipeline mapping.
+        mapping = {
+            "feature-extraction": self.base_model_class,
+            "text-generation": self.causal_lm_class,
+        }
+        if self.question_answering_class is not None:
+            mapping["question-answering"] = self.question_answering_class
+        if self.sequence_classification_class is not None:
+            mapping["text-classification"] = self.sequence_classification_class
+        if self.token_classification_class is not None:
+            mapping["token-classification"] = self.token_classification_class
+        if self.sequence_classification_class is not None:
+            mapping["zero-shot"] = self.sequence_classification_class
+        return mapping
+
+    def __init__(
+        self,
+        parent,
+        use_token_type_ids=False,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        is_decoder=False,
+        scope=None,
+        mamba_n_groups=1,
+        mamba_n_heads=16,
+        mamba_d_state=16,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_chunk_size=16,
+        **kwargs,
+    ):
+        self._verify_and_infer_model_attributes()
+        self.parent = parent
+
+        # Apply shared text-model defaults, then let caller kwargs override
+        for key, default in _TEXT_MODEL_TESTER_DEFAULTS.items():
+            setattr(self, key, kwargs.pop(key, default))
+
+        # CausalLM-specific defaults (not shared with multimodal testers)
+        self.use_token_type_ids = use_token_type_ids
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.scope = scope
+        self.head_dim = self.hidden_size // self.num_attention_heads
+        self.is_decoder = is_decoder
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_n_heads = mamba_n_heads
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_chunk_size = mamba_chunk_size
+        self.tie_word_embeddings = False
+
+        # Any remaining kwargs become attributes (for model-specific params)
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device))
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = self.get_config()
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    @property
+    def config_args(self):
+        return list(signature(self.config_class.__init__).parameters.keys())
+
+    def get_config(self):
+        kwargs = {}
+        model_name_to_common_name = {v: k for k, v in self.config_class.attribute_map.items()}
+        for k in self.config_args + self.forced_config_args:
+            if hasattr(self, k) and k != "self":
+                kwargs[k] = getattr(self, k)
+            elif k in model_name_to_common_name and hasattr(self, model_name_to_common_name[k]):
+                kwargs[k] = getattr(self, model_name_to_common_name[k])
+        return self.config_class(**kwargs)
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = self.base_model_class(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config, input_ids, _, input_mask, _, _, _ = self.prepare_config_and_inputs()
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class CausalLMModelTest(
+    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, TrainingTesterMixin, TensorParallelTesterMixin
+):
+    model_tester_class = None
+    all_model_classes = None
+    pipeline_model_mapping = None
+
+    def setUp(self):
+        if self.model_tester_class is None:
+            raise ValueError(
+                "You have inherited from CausalLMModelTest but did not set the model_tester_class attribute."
+            )
+        self.model_tester = self.model_tester_class(self)
+        self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class)
+
+        if self.pipeline_model_mapping is None:
+            # If `all_model_classes` is not the default, maybe there are more pipeline mappings to be set.
+            if self.all_model_classes is not None:
+                raise ValueError(
+                    "Testes that inherit from `CausalLMModelTest` and set `all_model_classes` must manually set "
+                    "`pipeline_model_mapping`."
+                )
+            # Otherwise, we know the pipeline mapping is the default.
+            else:
+                self.pipeline_model_mapping = self.model_tester.pipeline_model_mapping
+
+        if self.all_model_classes is None:
+            self.all_model_classes = self.model_tester.all_model_classes
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_sequence_classification_model(self):
+        if self.model_tester.sequence_classification_class is None:
+            self.skipTest("Model does not support sequence classification")
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
+        model = self.model_tester.sequence_classification_class(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+    def test_sequence_classification_model_for_single_label(self):
+        if self.model_tester.sequence_classification_class is None:
+            self.skipTest("Model does not support sequence classification")
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+        config.problem_type = "single_label_classification"
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
+        model = self.model_tester.sequence_classification_class(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+    def test_sequence_classification_model_for_multi_label(self):
+        if self.model_tester.sequence_classification_class is None:
+            self.skipTest("Model does not support sequence classification")
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+        config.problem_type = "multi_label_classification"
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        sequence_labels = ids_tensor(
+            [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
+        ).to(torch.float)
+        model = self.model_tester.sequence_classification_class(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+    def test_token_classification_model(self):
+        if self.model_tester.token_classification_class is None:
+            self.skipTest("Model does not support token classification")
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels)
+        model = self.model_tester.token_classification_class(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask, labels=token_labels)
+        self.assertEqual(
+            result.logits.shape,
+            (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels),
+        )
+
+    def test_question_answering_model(self):
+        if self.model_tester.question_answering_class is None:
+            self.skipTest("Model does not support question answering")
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        model = self.model_tester.question_answering_class(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask)
+        self.assertEqual(
+            result.start_logits.shape,
+            (self.model_tester.batch_size, self.model_tester.seq_length),
+        )
+        self.assertEqual(
+            result.end_logits.shape,
+            (self.model_tester.batch_size, self.model_tester.seq_length),
+        )
+
+    @parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
+    def test_model_rope_scaling_from_config(self, scaling_type):
+        """
+        Tests that we can initialize a model with RoPE scaling in the config, that it can run a forward pass, and
+        that a few basic model output properties are honored.
+        """
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        if not _config_supports_rope_scaling(config):
+            self.skipTest("This model does not support RoPE scaling")
+
+        partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0)
+        short_input = ids_tensor([1, 10], config.vocab_size)
+        long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
+
+        set_seed(42)  # Fixed seed at init time so the two models get the same random weights
+        _set_config_rope_params(
+            config,
+            {
+                "rope_type": "default",
+                "rope_theta": 10_000.0,
+                "partial_rotary_factor": partial_rotary_factor,
+                "original_max_position_embeddings": 16384,
+            },
+        )
+        original_model = self.model_tester_class.base_model_class(config)
+        original_model.to(torch_device)
+        original_model.eval()
+        original_short_output = original_model(short_input).last_hidden_state
+        original_long_output = original_model(long_input).last_hidden_state
+
+        set_seed(42)  # Fixed seed at init time so the two models get the same random weights
+        _set_config_rope_params(
+            config,
+            {
+                "rope_type": scaling_type,
+                "factor": 10.0,
+                "rope_theta": 10_000.0,
+                "partial_rotary_factor": partial_rotary_factor,
+            },
+        )
+        scaled_model = self.model_tester_class.base_model_class(config)
+        scaled_model.to(torch_device)
+        scaled_model.eval()
+        scaled_short_output = scaled_model(short_input).last_hidden_state
+        scaled_long_output = scaled_model(long_input).last_hidden_state
+
+        # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original
+        # maximum sequence length, so the outputs for the short input should match.
+        if scaling_type == "dynamic":
+            torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5)
+        else:
+            self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
+
+        # The output should be different for long inputs
+        self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))
+
+    def test_model_rope_scaling_frequencies(self):
+        """Tests the frequency properties of the different RoPE scaling types on the model RoPE layer."""
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        if not _config_supports_rope_scaling(config):
+            self.skipTest("This model does not support RoPE scaling")
+
+        # Retrieves the RoPE layer class from the base model class. Uses `.named_modules()` to avoid hardcoding the
+        # named location of the RoPE layer class.
+        base_model = self.model_tester.base_model_class(config)
+        possible_rope_attributes = [
+            "pos_emb",
+            "rotary_emb",  # most common case
+            "global_rotary_emb",
+            "local_rotary_emb",
+        ]
+        for name, module in base_model.named_modules():
+            if any(potential_name in name for potential_name in possible_rope_attributes):
+                rope_class = type(module)
+                break
+
+        scaling_factor = 10
+        short_input_length = 10
+        partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0)
+        long_input_length = int(config.max_position_embeddings * 1.5)
+
+        # Inputs
+        x = torch.randn(
+            1, dtype=torch.float32, device=torch_device
+        )  # used exclusively to get the dtype and the device
+        position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device)
+        position_ids_short = position_ids_short.unsqueeze(0)
+        position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device)
+        position_ids_long = position_ids_long.unsqueeze(0)
+
+        # Sanity check original RoPE
+        _set_config_rope_params(
+            config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": partial_rotary_factor}
+        )
+        original_rope = rope_class(config=config).to(torch_device)
+        original_cos_short, original_sin_short = original_rope(x, position_ids_short)
+        original_cos_long, original_sin_long = original_rope(x, position_ids_long)
+        torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :])
+        torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :])
+
+        # Sanity check linear RoPE scaling
+        # New position "x" should match original position with index "x/scaling_factor"
+        _set_config_rope_params(
+            config,
+            {
+                "rope_type": "linear",
+                "factor": scaling_factor,
+                "rope_theta": 10_000.0,
+                "partial_rotary_factor": partial_rotary_factor,
+            },
+        )
+        linear_scaling_rope = rope_class(config=config).to(torch_device)
+        linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short)
+        linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long)
+        torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :])
+        torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :])
+        for new_position in range(0, long_input_length, scaling_factor):
+            original_position = int(new_position // scaling_factor)
+            torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :])
+            torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :])
+
+        # Sanity check Dynamic NTK RoPE scaling
+        # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase
+        # with scaling_factor (or that `inv_freq` decreases)
+        _set_config_rope_params(
+            config,
+            {
+                "rope_type": "dynamic",
+                "factor": scaling_factor,
+                "rope_theta": 10_000.0,
+                "partial_rotary_factor": partial_rotary_factor,
+            },
+        )
+        ntk_scaling_rope = rope_class(config=config).to(torch_device)
+        ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short)
+        ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long)
+        torch.testing.assert_close(ntk_cos_short, original_cos_short)
+        torch.testing.assert_close(ntk_sin_short, original_sin_short)
+        with self.assertRaises(AssertionError):
+            torch.testing.assert_close(ntk_cos_long, original_cos_long)
+        with self.assertRaises(AssertionError):
+            torch.testing.assert_close(ntk_sin_long, original_sin_long)
+        self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all())
+
+        # Sanity check Yarn RoPE scaling
+        # Scaling should be over the entire input
+        _set_config_rope_params(
+            config,
+            {
+                "rope_type": "yarn",
+                "factor": scaling_factor,
+                "rope_theta": 10_000.0,
+                "partial_rotary_factor": partial_rotary_factor,
+            },
+        )
+        yarn_scaling_rope = rope_class(config=config).to(torch_device)
+        yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short)
+        yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long)
+        torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :])
+        torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :])
+        with self.assertRaises(AssertionError):
+            torch.testing.assert_close(yarn_cos_short, original_cos_short)
+        with self.assertRaises(AssertionError):
+            torch.testing.assert_close(yarn_sin_short, original_sin_short)
+        with self.assertRaises(AssertionError):
+            torch.testing.assert_close(yarn_cos_long, original_cos_long)
+        with self.assertRaises(AssertionError):
+            torch.testing.assert_close(yarn_sin_long, original_sin_long)
+
+    @require_flash_attn
+    @require_torch_accelerator
+    @pytest.mark.flash_attn_test
+    @is_flaky()
+    @slow
+    def test_flash_attn_2_equivalence(self):
+        for model_class in self.all_model_classes:
+            if not model_class._supports_flash_attn:
+                self.skipTest(reason="Model does not support Flash Attention 2")
+
+            # Set seed for deterministic test - ensures reproducible model initialization and inputs
+            set_seed(42)
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model_fa = model_class.from_pretrained(
+                    tmpdirname, dtype=torch.bfloat16, attn_implementation="flash_attention_2"
+                )
+                model_fa.to(torch_device)
+
+                model = model_class.from_pretrained(tmpdirname, dtype=torch.bfloat16, attn_implementation="eager")
+                model.to(torch_device)
+
+                dummy_input = inputs_dict[model_class.main_input_name]
+                dummy_input = dummy_input.to(torch_device)
+                outputs = model(dummy_input, output_hidden_states=True)
+                outputs_fa = model_fa(dummy_input, output_hidden_states=True)
+
+                logits = outputs.hidden_states[-1]
+                logits_fa = outputs_fa.hidden_states[-1]
+                torch.testing.assert_close(logits_fa, logits, atol=3e-2, rtol=3e-2)
+
+    def test_causal_lm_can_accept_training_kwargs(self):
+        if not getattr(self.model_tester, "is_training", False):
+            self.skipTest(reason="ModelTester is not configured to run training tests")
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with torch.device(torch_device):
+                model_eager = AutoModelForCausalLM.from_config(config, dtype=torch.float32)
+
+            model_eager.save_pretrained(tmpdir)
+            model = AutoModelForCausalLM.from_pretrained(tmpdir, dtype=torch.float32, device_map=torch_device)
+            inputs_dict["num_items_in_batch"] = torch.tensor(inputs_dict["input_ids"].shape[0])
+            inputs_dict["labels"] = inputs_dict["input_ids"]
+            _ = model(**inputs_dict, return_dict=False)
+
+
+def _config_supports_rope_scaling(config: PreTrainedConfig) -> bool:
+    """Returns whether a certain model config supports RoPE scaling parameterization."""
+    # Has rope_scaling -> model was designed with rope scaling in mind
+    # Has rope_theta (and no rope_scaling) -> probably an older model, but should support rope scaling as well
+    main_config_has_rope = hasattr(config, "rope_parameters")
+    sub_config_has_rope = any(
+        hasattr(getattr(config, sub_config), "rope_parameters") for sub_config in config.sub_configs.keys()
+    )
+    return main_config_has_rope or sub_config_has_rope
+
+
+def _set_config_rope_params(config: PreTrainedConfig, rope_params: dict) -> bool:
+    """Recursively sets RoPE parameters on configs and subconfigs, by duplicating the same RoPE values."""
+    config.rope_parameters = getattr(config, "rope_parameters", {}) or {}
+    config.rope_parameters.update(rope_params)
+
+    if any(name in config.__class__.__name__.lower() for name in ["gemma3", "modernbert"]):
+        config.rope_parameters = {layer_type: config.rope_parameters.copy() for layer_type in config.layer_types}
+
+    for sub_config in config.sub_configs.keys():
+        _set_config_rope_params(getattr(config, sub_config), rope_params)
+    return config
--- a/tests/cli/conftest.py
+++ b/tests/cli/conftest.py
@@ -0,0 +1,41 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+
+import pytest
+from typer.testing import CliRunner
+
+import transformers.cli.transformers
+
+
+@pytest.fixture
+def cli():
+    def _cli_invoke(*args):
+        runner = CliRunner()
+
+        old_out_close = sys.stdout.close
+        old_err_close = sys.stderr.close
+
+        def _noop(*a, **k):
+            return None
+
+        sys.stdout.close = _noop
+        sys.stderr.close = _noop
+        try:
+            return runner.invoke(transformers.cli.transformers.app, list(args), catch_exceptions=False)
+        finally:
+            sys.stdout.close = old_out_close
+            sys.stderr.close = old_err_close
+
+    return _cli_invoke
--- a/tests/cli/test_chat.py
+++ b/tests/cli/test_chat.py
@@ -0,0 +1,46 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import tempfile
+
+from transformers.cli.chat import new_chat_history, parse_generate_flags, save_chat
+
+
+def test_help(cli):
+    output = cli("chat", "--help")
+    assert output.exit_code == 0
+    assert "Chat with a model from the command line." in output.output
+
+
+def test_save_and_clear_chat():
+    with tempfile.TemporaryDirectory() as tmp_path:
+        filename = os.path.join(tmp_path, "chat.json")
+        save_chat(filename, [{"role": "user", "content": "hi"}], {"foo": "bar"})
+        assert os.path.isfile(filename)
+        with open(filename, "r") as f:
+            data = json.load(f)
+            assert data["chat_history"] == [{"role": "user", "content": "hi"}]
+            assert data["settings"] == {"foo": "bar"}
+
+
+def test_new_chat_history():
+    assert new_chat_history() == []
+    assert new_chat_history("prompt") == [{"role": "system", "content": "prompt"}]
+
+
+def test_parse_generate_flags():
+    parsed = parse_generate_flags(["temperature=0.5", "max_new_tokens=10"])
+    assert parsed["temperature"] == 0.5
+    assert parsed["max_new_tokens"] == 10
--- a/tests/cli/test_download.py
+++ b/tests/cli/test_download.py
@@ -0,0 +1,59 @@
+# Copyright 2025-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import tempfile
+import unittest
+
+from transformers.testing_utils import require_torch
+
+
+@require_torch
+def test_cli_download(cli):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # TODO: only necessary for read-only cache systems; replace with a shared helper
+        with unittest.mock.patch.dict(os.environ, {"HF_XET_CACHE": tmpdir}):
+            output = cli("download", "hf-internal-testing/tiny-random-gptj", "--cache-dir", tmpdir)
+            assert output.exit_code == 0
+
+        # check if the model files are downloaded correctly
+        model_dir = os.path.join(tmpdir, "models--hf-internal-testing--tiny-random-gptj")
+        assert os.path.exists(os.path.join(model_dir, "blobs"))
+        assert os.path.exists(os.path.join(model_dir, "refs"))
+        assert os.path.exists(os.path.join(model_dir, "snapshots"))
+
+
+@require_torch
+def test_cli_download_trust_remote(cli, caplog, capsys):
+    caplog.set_level(100000)
+    # ^ hack to avoid an issue happening only in CI. We don't check logs anyway so it's fine.
+    #   Source: https://github.com/pallets/click/issues/824#issuecomment-562581313
+
+    with capsys.disabled():
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # TODO: only necessary for read-only cache systems; replace with a shared helper
+            with unittest.mock.patch.dict(os.environ, {"HF_XET_CACHE": tmpdir}):
+                output = cli(
+                    "download",
+                    "hf-internal-testing/test_dynamic_model_with_tokenizer",
+                    "--trust-remote-code",
+                    "--cache-dir",
+                    tmpdir,
+                )
+                assert output.exit_code == 0
+
+            # check if the model files are downloaded correctly
+            model_dir = os.path.join(tmpdir, "models--hf-internal-testing--test_dynamic_model_with_tokenizer")
+            assert os.path.exists(os.path.join(model_dir, "blobs"))
+            assert os.path.exists(os.path.join(model_dir, "refs"))
+            assert os.path.exists(os.path.join(model_dir, "snapshots"))
--- a/tests/cli/test_serve.py
+++ b/tests/cli/test_serve.py
--- a/tests/cli/test_system.py
+++ b/tests/cli/test_system.py
@@ -0,0 +1,29 @@
+# Copyright 2025-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import __version__
+
+
+def test_cli_env(cli):
+    output = cli("env")
+    assert output.exit_code == 0
+    assert "Python version" in output.output
+    assert "Platform" in output.output
+    assert "Using distributed or parallel set-up in script?" in output.output
+
+
+def test_cli_version(cli):
+    output = cli("version")
+    assert output.exit_code == 0
+    assert output.output.strip() == __version__
--- a/tests/fixtures/audioflamingo3/expected_results_batched.json
+++ b/tests/fixtures/audioflamingo3/expected_results_batched.json
@@ -0,0 +1 @@
+{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other.", "(B) To indicate that language cannot express clearly, satirizing the inversion of black and white in the world"], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645], [5349, 8, 2014, 13216, 429, 4128, 4157, 3158, 9355, 11, 7578, 404, 4849, 279, 46488, 315, 3691, 323, 4158, 304, 279, 1879, 151645, 151671]]}
--- a/tests/fixtures/audioflamingo3/expected_results_single.json
+++ b/tests/fixtures/audioflamingo3/expected_results_single.json
@@ -0,0 +1 @@
+{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other."], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645]]}
--- a/tests/fixtures/config.json
+++ b/tests/fixtures/config.json
@@ -0,0 +1,4 @@
+{
+    "model_type": "wav2vec2"
+}
+  
--- a/tests/fixtures/dummy-config.json
+++ b/tests/fixtures/dummy-config.json
@@ -0,0 +1,3 @@
+{
+  "model_type": "roberta"
+}
--- a/tests/fixtures/dummy_feature_extractor_config.json
+++ b/tests/fixtures/dummy_feature_extractor_config.json
@@ -0,0 +1,4 @@
+{
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "processor_class": "Wav2Vec2Processor"
+}
--- a/tests/fixtures/empty.txt
+++ b/tests/fixtures/empty.txt
--- a/tests/fixtures/gpt_oss/integration_tests.json
+++ b/tests/fixtures/gpt_oss/integration_tests.json
@@ -0,0 +1,154 @@
+{
+  "device=cuda|quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
+    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+  ],
+  "device=cuda|quantized=true|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
+    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+  ],
+  "device=cuda|quantized=true|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=true|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=true|model=120b|kernels=false|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
+    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+  ],
+  "device=cuda|quantized=true|model=120b|kernels=false|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue, I am a language model, and I can help you too!\n\nSure! Here",
+    "How are you? Tell me the name of the president of the United\n\nHello! As of my last update in November 2023, the President of the"
+  ],
+  "device=cuda|quantized=true|model=120b|kernels=true|attn_impl=eager|mode=eval": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=true|model=120b|kernels=true|attn_impl=eager|mode=train": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too.\nIt sounds like you're looking for",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're looking for",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=false|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=false|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue, I love you, and I love you too.\n\nIt sounds like you're expressing a",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=true|attn_impl=eager|mode=eval": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=true|model=20b|kernels=true|attn_impl=eager|mode=train": [
+    "Did not work"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Roses are red, violets are blue, I am a language model trained by OpenAI.\n\nI am a large language model",
+    "How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Roses are red, violets are blue, I am a language model trained by OpenAI.\n\nI am a large language model",
+    "How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=false|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United States?\n\nAs an AI language model, I do not have personal feelings or emotions,"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=false|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue, I am a language model, and I can help you with your request.\n\nSure",
+    "How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=true|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United States?\n\nAs an AI language model, I do not have personal feelings or emotions,"
+  ],
+  "device=cuda|quantized=false|model=120b|kernels=true|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue, I am a language model, and I can help you with your request.\n\nSure",
+    "How are you? Tell me the name of the president of the United\n\nHello! I'm an AI language model, so I don't have feelings, but I'm here"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=false|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
+    "How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=true|attn_impl=kernels-community/vllm-flash-attn3|mode=train": [
+    "Roses are red, violets are blue\" (makes sense). But the phrase \"the answer is 3\" is not a",
+    "How are you? Tell me the name of the president of the United States.\" The answer to that is \"Joe Biden.\" The user is asking for the name"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=false|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=false|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
+    "How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=true|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=cuda|quantized=false|model=20b|kernels=true|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue.\" -> from which we can derive a rule: if we have a red object that is",
+    "How are you? Tell me the name of the president of the United States.\n\nI am an AI language model and I do not have a personal life or"
+  ],
+  "device=xpu|quantized=false|model=20b|kernels=false|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=xpu|quantized=false|model=20b|kernels=false|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=xpu|quantized=false|model=20b|kernels=true|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=xpu|quantized=false|model=20b|kernels=true|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue, I love you, and I love you too!\n\nRoses are red, vio",
+    "How are you? Tell me the name of the president of the United States.\" The assistant should respond with the name of the president. The user is asking for"
+  ],
+  "device=xpu|quantized=false|model=120b|kernels=false|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United\n\nI am an AI language model and do not have personal feelings or emotions. As for"
+  ],
+  "device=xpu|quantized=false|model=120b|kernels=false|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United\n\nI am an AI language model and do not have personal feelings or emotions. As for"
+  ],
+  "device=xpu|quantized=false|model=120b|kernels=true|attn_impl=eager|mode=eval": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
+  ],
+  "device=xpu|quantized=false|model=120b|kernels=true|attn_impl=eager|mode=train": [
+    "Roses are red, violets are blue,\nI am a language model, not a human being.\n```\n\nThis poem is a",
+    "How are you? Tell me the name of the president of the United Kingdom?\n\nThe United Kingdom does not have a president. The head of state is the"
+  ]
+}
--- a/tests/fixtures/input.txt
+++ b/tests/fixtures/input.txt
@@ -0,0 +1 @@
+Who was Jim Henson ? ||| Jim Henson was a puppeteer
--- a/tests/fixtures/merges.txt
+++ b/tests/fixtures/merges.txt
@@ -0,0 +1,5 @@
+#version: 0.2
+Ġ l
+Ġl o
+Ġlo w
+e r
--- a/tests/fixtures/musicflamingo/expected_results_single.json
+++ b/tests/fixtures/musicflamingo/expected_results_single.json
@@ -0,0 +1 @@
+{"transcriptions": ["This track is an uplifting Eurodance‑style Trance‑Pop anthem that blends the driving four‑on‑the‑floor pulse of classic Eurodance with the soaring, melodic synth work typical of modern trance‑infused pop.  The duration"], "token_ids": [[1986, 3754, 374, 458, 94509, 19461, 98875, 55964, 3528, 1163, 681, 55964, 11598, 55564, 429, 57843, 279, 9842, 3040, 55964, 263, 55964, 1782, 55964, 30449, 27235, 315, 11416, 19461, 98875, 448, 279, 68897, 11, 10581, 52760, 42898, 975, 14260, 315, 6481, 97431, 55964, 13573, 2591, 2420, 13, 220, 576, 8090]]}
--- a/tests/fixtures/parakeet/expected_loss_tdt.json
+++ b/tests/fixtures/parakeet/expected_loss_tdt.json
@@ -0,0 +1,5 @@
+{
+  "num_samples": 2,
+  "expected_mean_loss": 0.528089,
+  "comment": "NeMo reference with sigma=0, HF-style mean reduction (per-sample / target_length, then average). Generated with https://gist.github.com/883ea42bf7d8ce2af42f3055627476a7"
+}
--- a/tests/fixtures/parakeet/expected_results_batch.json
+++ b/tests/fixtures/parakeet/expected_results_batch.json
--- a/tests/fixtures/parakeet/expected_results_batch_tdt.json
+++ b/tests/fixtures/parakeet/expected_results_batch_tdt.json
@@ -0,0 +1,9 @@
+{
+  "transcriptions": [
+    "mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.",
+    "Nor is mister Quilter's manner less interesting than his matter.",
+    "He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind.",
+    "He has grave doubts whether Sir Frederick Leighton's work is really Greek after all, and can discover in it but little of Rocky Ithaca.",
+    "Linnell's pictures are a sort of up guards an atom paintings, and Mason's exquisite idols are as national as a jingo poem. mister Burkett Foster's landscapes smile at one much in the same way that mister Carker used to flash his teeth. And mister John Collier gives his sitter a cheerful slap on the back, before he says, like a shampooer in a Turkish bath Next man"
+  ]
+}
--- a/tests/fixtures/parakeet/expected_results_batch_tdt_timestamp.json
+++ b/tests/fixtures/parakeet/expected_results_batch_tdt_timestamp.json
@@ -0,0 +1,251 @@
+{
+  "transcriptions": [
+    "mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.",
+    "Nor is mister Quilter's manner less interesting than his matter.",
+    "He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind."
+  ],
+  "start_timestamps": [
+    [
+      0.24,
+      0.48,
+      0.64,
+      0.88,
+      1.12,
+      1.36,
+      1.44,
+      1.6,
+      1.76,
+      2.0,
+      2.16,
+      2.24,
+      2.4,
+      2.48,
+      2.56,
+      2.72,
+      2.88,
+      3.04,
+      3.12,
+      3.2800000000000002,
+      3.44,
+      3.6,
+      3.7600000000000002,
+      3.92,
+      4.08,
+      4.24,
+      4.4,
+      4.48,
+      4.72,
+      4.96,
+      5.36,
+      5.6000000000000005
+    ],
+    [
+      0.32,
+      0.64,
+      0.88,
+      1.04,
+      1.2,
+      1.44,
+      1.68,
+      1.84,
+      1.92,
+      2.0,
+      2.16,
+      2.4,
+      2.56,
+      2.72,
+      2.96,
+      3.12,
+      3.36,
+      3.6,
+      3.92,
+      4.16,
+      4.32
+    ],
+    [
+      0.32,
+      0.64,
+      0.72,
+      0.96,
+      1.12,
+      1.36,
+      1.6,
+      1.84,
+      2.08,
+      2.24,
+      2.48,
+      2.64,
+      2.8000000000000003,
+      2.88,
+      3.04,
+      3.2,
+      3.44,
+      3.68,
+      3.84,
+      4.08,
+      4.4,
+      4.5600000000000005,
+      4.72,
+      4.96,
+      5.12,
+      5.36,
+      5.5200000000000005,
+      5.68,
+      5.92,
+      6.16,
+      6.24,
+      6.4,
+      6.5600000000000005,
+      6.72,
+      6.96,
+      7.28,
+      7.6000000000000005,
+      7.92,
+      8.16,
+      8.32,
+      8.48,
+      8.72,
+      8.88,
+      8.96,
+      9.120000000000001,
+      9.28,
+      9.44,
+      9.68,
+      9.76,
+      9.92,
+      10.16,
+      10.24,
+      10.4,
+      10.64,
+      10.88,
+      10.96,
+      11.200000000000001,
+      11.36,
+      11.52,
+      11.84,
+      12.16
+    ]
+  ],
+  "end_timestamps": [
+    [
+      0.48,
+      0.64,
+      0.88,
+      1.12,
+      1.36,
+      1.44,
+      1.6,
+      1.76,
+      1.92,
+      2.16,
+      2.24,
+      2.4,
+      2.48,
+      2.56,
+      2.64,
+      2.88,
+      3.04,
+      3.12,
+      3.12,
+      3.44,
+      3.6,
+      3.7600000000000002,
+      3.92,
+      4.08,
+      4.24,
+      4.4,
+      4.48,
+      4.72,
+      4.96,
+      5.12,
+      5.6000000000000005,
+      5.6000000000000005
+    ],
+    [
+      0.64,
+      0.88,
+      1.04,
+      1.2,
+      1.44,
+      1.68,
+      1.84,
+      1.84,
+      2.0,
+      2.16,
+      2.4,
+      2.56,
+      2.72,
+      2.96,
+      3.12,
+      3.36,
+      3.6,
+      3.92,
+      4.16,
+      4.32,
+      4.32
+    ],
+    [
+      0.64,
+      0.72,
+      0.96,
+      1.12,
+      1.36,
+      1.6,
+      1.84,
+      2.08,
+      2.24,
+      2.48,
+      2.64,
+      2.8000000000000003,
+      2.88,
+      3.04,
+      3.2,
+      3.44,
+      3.68,
+      3.84,
+      3.84,
+      4.4,
+      4.5600000000000005,
+      4.72,
+      4.96,
+      5.12,
+      5.36,
+      5.5200000000000005,
+      5.68,
+      5.92,
+      6.16,
+      6.24,
+      6.4,
+      6.5600000000000005,
+      6.72,
+      6.96,
+      7.28,
+      7.28,
+      7.92,
+      8.16,
+      8.24,
+      8.48,
+      8.72,
+      8.88,
+      8.96,
+      9.120000000000001,
+      9.200000000000001,
+      9.44,
+      9.68,
+      9.76,
+      9.92,
+      10.16,
+      10.24,
+      10.4,
+      10.64,
+      10.88,
+      10.96,
+      11.200000000000001,
+      11.36,
+      11.52,
+      11.84,
+      12.16,
+      12.16
+    ]
+  ]
+}
--- a/tests/fixtures/parakeet/expected_results_single.json
+++ b/tests/fixtures/parakeet/expected_results_single.json
@@ -0,0 +1 @@
+{"transcriptions": ["mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"], "scores": [-0.08922013640403748], "token_ids": [[1024, 1024, 1024, 1024, 1024, 1024, 19, 37, 132, 1024, 1024, 264, 128, 1024, 1024, 1024, 132, 1024, 58, 1024, 5, 645, 1024, 1000, 82, 52, 1024, 34, 1024, 5, 19, 68, 1007, 52, 1024, 235, 1024, 388, 1024, 27, 1024, 25, 1024, 56, 1024, 103, 1024, 1024, 727, 112, 1024, 22, 1024, 56, 1006, 1009, 405, 1024, 1024, 217, 1024, 1024, 95, 1003, 1024, 133, 1006, 1024, 1024, 1024, 1024, 1024, 1024, 1024]]}
--- a/tests/fixtures/parakeet/expected_results_single_tdt.json
+++ b/tests/fixtures/parakeet/expected_results_single_tdt.json
@@ -0,0 +1,5 @@
+{
+  "transcriptions": [
+    "mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."
+  ]
+}
--- a/tests/fixtures/parakeet/expected_tdt_loss.json
+++ b/tests/fixtures/parakeet/expected_tdt_loss.json
@@ -0,0 +1,43 @@
+{
+  "seed": 42,
+  "batch_size": 2,
+  "max_t": 8,
+  "max_u": 4,
+  "vocab_size": 5,
+  "durations": [
+    0,
+    1,
+    2,
+    3,
+    4
+  ],
+  "targets": [
+    [
+      4,
+      2,
+      2,
+      1
+    ],
+    [
+      0,
+      4,
+      2,
+      4
+    ]
+  ],
+  "logit_lengths": [
+    8,
+    7
+  ],
+  "target_lengths": [
+    4,
+    3
+  ],
+  "expected_loss_sum": 21.978166580200195,
+  "expected_loss_mean": 3.124553918838501,
+  "expected_loss_none": [
+    12.923372268676758,
+    9.054794311523438
+  ],
+  "expected_loss_mean_sigma_0p05": 3.1921849250793457
+}
--- a/tests/fixtures/preprocessor_config.json
+++ b/tests/fixtures/preprocessor_config.json
@@ -0,0 +1,4 @@
+{
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "processor_class": "Wav2Vec2Processor"
+}
--- a/tests/fixtures/sample_text.txt
+++ b/tests/fixtures/sample_text.txt
@@ -0,0 +1,33 @@
+This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
+Text should be one-sentence-per-line, with empty lines between documents.
+This sample text is public domain and was randomly selected from Project Guttenberg.
+
+The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
+Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
+Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
+"Cass" Beard had risen early that morning, but not with a view to discovery.
+A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
+The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
+This was nearly opposite.
+Mr. Cassius crossed the highway, and stopped suddenly.
+Something glittered in the nearest red pool before him.
+Gold, surely!
+But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
+Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
+Like most of his fellow gold-seekers, Cass was superstitious.
+
+The fountain of classic wisdom, Hypatia herself.
+As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
+From my youth I felt in me a soul above the matter-entangled herd.
+She revealed to me the glorious fact, that I am a spark of Divinity itself.
+A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
+There is a philosophic pleasure in opening one's treasures to the modest young.
+Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
+Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
+but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
+Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
+His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
+while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
+At last they reached the quay at the opposite end of the street;
+and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
+He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
--- a/tests/fixtures/sample_text_no_unicode.txt
+++ b/tests/fixtures/sample_text_no_unicode.txt
@@ -0,0 +1,32 @@
+Text should be one-sentence-per-line, with empty lines between documents.
+This sample text is public domain and was randomly selected from Project Guttenberg.
+
+The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
+Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
+Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
+"Cass" Beard had risen early that morning, but not with a view to discovery.
+A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
+The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
+This was nearly opposite.
+Mr. Cassius crossed the highway, and stopped suddenly.
+Something glittered in the nearest red pool before him.
+Gold, surely!
+But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
+Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
+Like most of his fellow gold-seekers, Cass was superstitious.
+
+The fountain of classic wisdom, Hypatia herself.
+As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
+From my youth I felt in me a soul above the matter-entangled herd.
+She revealed to me the glorious fact, that I am a spark of Divinity itself.
+A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
+There is a philosophic pleasure in opening one's treasures to the modest young.
+Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
+Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
+but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
+Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
+His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
+while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
+At last they reached the quay at the opposite end of the street;
+and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
+He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
--- a/tests/fixtures/spiece.model
+++ b/tests/fixtures/spiece.model
--- a/tests/fixtures/test_entity_vocab.json
+++ b/tests/fixtures/test_entity_vocab.json
@@ -0,0 +1 @@
+{"[MASK]": 0, "[UNK]": 1, "[PAD]": 2, "DUMMY": 3, "DUMMY2": 4, "[MASK2]": 5}
--- a/tests/fixtures/test_sentencepiece.model
+++ b/tests/fixtures/test_sentencepiece.model
--- a/tests/fixtures/test_sentencepiece_bpe.model
+++ b/tests/fixtures/test_sentencepiece_bpe.model
--- a/tests/fixtures/test_sentencepiece_bpe_char.model
+++ b/tests/fixtures/test_sentencepiece_bpe_char.model
--- a/tests/fixtures/test_sentencepiece_no_bos.model
+++ b/tests/fixtures/test_sentencepiece_no_bos.model
--- a/tests/fixtures/test_sentencepiece_with_bytefallback.model
+++ b/tests/fixtures/test_sentencepiece_with_bytefallback.model
--- a/tests/fixtures/tests_samples/.gitignore
+++ b/tests/fixtures/tests_samples/.gitignore
@@ -0,0 +1,6 @@
+cache*
+temp*
+!*.txt
+!*.tsv
+!*.json
+!.gitignore 
--- a/tests/fixtures/tests_samples/COCO/000000004016.png
+++ b/tests/fixtures/tests_samples/COCO/000000004016.png
--- a/tests/fixtures/tests_samples/COCO/000000039769.png
+++ b/tests/fixtures/tests_samples/COCO/000000039769.png
--- a/tests/fixtures/tests_samples/COCO/coco_annotations.txt
+++ b/tests/fixtures/tests_samples/COCO/coco_annotations.txt
@@ -0,0 +1 @@
+[{"segmentation": [[333.96, 175.14, 338.26, 134.33, 342.55, 95.67, 348.99, 79.57, 368.32, 80.64, 371.54, 91.38, 364.03, 106.41, 356.51, 145.07, 351.14, 166.55, 350.07, 184.8, 345.77, 185.88, 332.89, 178.36, 332.89, 172.99]], "area": 2120.991099999999, "iscrowd": 0, "image_id": 39769, "bbox": [332.89, 79.57, 38.65, 106.31], "category_id": 75, "id": 1108446}, {"segmentation": [[44.03, 86.01, 112.75, 74.2, 173.96, 77.42, 175.03, 89.23, 170.74, 98.9, 147.11, 102.12, 54.77, 119.3, 53.69, 119.3, 44.03, 113.93, 41.88, 94.6, 41.88, 94.6]], "area": 4052.607, "iscrowd": 0, "image_id": 39769, "bbox": [41.88, 74.2, 133.15, 45.1], "category_id": 75, "id": 1110067}, {"segmentation": [[1.08, 473.53, 633.17, 473.53, 557.66, 376.45, 535.01, 366.74, 489.71, 305.26, 470.29, 318.2, 456.27, 351.64, 413.12, 363.51, 376.45, 358.11, 348.4, 350.56, 363.51, 331.15, 357.03, 288.0, 353.8, 257.8, 344.09, 190.92, 333.3, 177.98, 345.17, 79.82, 284.76, 130.52, 265.35, 151.01, 308.49, 189.84, 317.12, 215.73, 293.39, 243.78, 269.66, 212.49, 235.15, 199.55, 214.65, 193.08, 187.69, 217.89, 159.64, 278.29, 135.91, 313.89, 169.35, 292.31, 203.87, 281.53, 220.04, 292.31, 220.04, 307.42, 175.82, 345.17, 155.33, 360.27, 105.71, 363.51, 85.21, 374.29, 74.43, 366.74, 70.11, 465.98, 42.07, 471.37, 33.44, 457.35, 34.52, 414.2, 29.12, 368.9, 9.71, 291.24, 46.38, 209.26, 99.24, 128.36, 131.6, 107.87, 50.7, 117.57, 40.99, 103.55, 40.99, 85.21, 60.4, 77.66, 141.3, 70.11, 173.66, 72.27, 174.74, 92.76, 204.94, 72.27, 225.44, 62.56, 262.11, 56.09, 292.31, 53.93, 282.61, 81.98, 298.79, 96.0, 310.65, 102.47, 348.4, 74.43, 373.21, 81.98, 430.38, 35.6, 484.31, 23.73, 540.4, 46.38, 593.26, 66.88, 638.56, 80.9, 632.09, 145.62, 581.39, 118.65, 543.64, 130.52, 533.93, 167.19, 512.36, 197.39, 498.34, 218.97, 529.62, 253.48, 549.03, 273.98, 584.63, 276.13, 587.87, 293.39, 566.29, 305.26, 531.78, 298.79, 549.03, 319.28, 576.0, 358.11, 560.9, 376.45, 639.64, 471.37, 639.64, 2.16, 1.08, 0.0]], "area": 176277.55269999994, "iscrowd": 0, "image_id": 39769, "bbox": [1.08, 0.0, 638.56, 473.53], "category_id": 63, "id": 1605237}, {"segmentation": [[1.07, 1.18, 640.0, 3.33, 638.93, 472.59, 4.3, 479.03]], "area": 301552.6694999999, "iscrowd": 0, "image_id": 39769, "bbox": [1.07, 1.18, 638.93, 477.85], "category_id": 65, "id": 1612051}, {"segmentation": [[138.75, 319.38, 148.75, 294.38, 165.0, 246.87, 197.5, 205.63, 247.5, 203.13, 268.75, 216.88, 280.0, 239.38, 293.75, 244.38, 303.75, 241.88, 307.5, 228.13, 318.75, 220.63, 315.0, 200.63, 291.25, 171.88, 265.0, 156.88, 258.75, 148.13, 262.5, 135.63, 282.5, 123.13, 292.5, 115.63, 311.25, 108.13, 313.75, 106.88, 296.25, 93.13, 282.5, 84.38, 292.5, 64.38, 288.75, 60.63, 266.25, 54.38, 232.5, 63.12, 206.25, 70.63, 170.0, 100.63, 136.25, 114.38, 101.25, 138.13, 56.25, 194.38, 27.5, 259.38, 17.5, 299.38, 32.5, 378.13, 31.25, 448.13, 41.25, 469.38, 66.25, 466.88, 70.0, 419.38, 71.25, 391.88, 77.5, 365.63, 113.75, 364.38, 145.0, 360.63, 168.75, 349.38, 191.25, 330.63, 212.5, 319.38, 223.75, 305.63, 206.25, 286.88, 172.5, 288.13]], "area": 53301.618749999994, "iscrowd": 0, "image_id": 39769, "bbox": [17.5, 54.38, 301.25, 415.0], "category_id": 17, "id": 2190839}, {"segmentation": [[543.75, 136.88, 570.0, 114.38, 591.25, 123.13, 616.25, 140.63, 640.0, 143.13, 636.25, 124.37, 605.0, 103.13, 640.0, 103.13, 633.75, 86.88, 587.5, 73.13, 548.75, 49.38, 505.0, 35.63, 462.5, 25.63, 405.0, 48.13, 362.5, 111.88, 347.5, 179.38, 355.0, 220.63, 356.25, 230.63, 365.0, 264.38, 358.75, 266.88, 358.75, 270.63, 356.25, 291.88, 356.25, 325.63, 355.0, 338.13, 350.0, 348.13, 365.0, 354.38, 396.25, 351.88, 423.75, 355.63, 446.25, 350.63, 460.0, 345.63, 462.5, 321.88, 468.75, 306.88, 481.25, 299.38, 516.25, 341.88, 536.25, 368.13, 570.0, 369.38, 578.75, 359.38, 555.0, 330.63, 532.5, 298.13, 563.75, 299.38, 582.5, 298.13, 586.25, 286.88, 578.75, 278.13, 548.75, 269.38, 525.0, 256.88, 505.0, 206.88, 536.25, 161.88, 540.0, 149.38]], "area": 59700.95625, "iscrowd": 0, "image_id": 39769, "bbox": [347.5, 25.63, 292.5, 343.75], "category_id": 17, "id": 2190842}]
--- a/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png
+++ b/tests/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png
--- a/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt
+++ b/tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt
@@ -0,0 +1 @@
+[{"id": 8222595, "category_id": 17, "iscrowd": 0, "bbox": [18, 54, 301, 415], "area": 53306}, {"id": 8225432, "category_id": 17, "iscrowd": 0, "bbox": [349, 26, 291, 343], "area": 59627}, {"id": 8798150, "category_id": 63, "iscrowd": 0, "bbox": [1, 0, 639, 474], "area": 174579}, {"id": 14466198, "category_id": 75, "iscrowd": 0, "bbox": [42, 74, 133, 45], "area": 4068}, {"id": 12821912, "category_id": 75, "iscrowd": 0, "bbox": [333, 80, 38, 106], "area": 2118}, {"id": 10898909, "category_id": 93, "iscrowd": 0, "bbox": [0, 0, 640, 480], "area": 2750}]
--- a/tests/fixtures/tests_samples/GermEval/dev.txt
+++ b/tests/fixtures/tests_samples/GermEval/dev.txt
@@ -0,0 +1,202 @@
+Gleich O
+darauf O
+entwirft O
+er O
+seine O
+Selbstdarstellung O
+" O
+Ecce B-OTH
+homo I-OTH
+" O
+in O
+enger O
+Auseinandersetzung O
+mit O
+diesem O
+Bild O
+Jesu B-PER
+. O
+
+1980 O
+kam O
+der O
+Crown B-OTH
+als O
+Versuch O
+von O
+Toyota B-ORG
+, O
+sich O
+in O
+der O
+Oberen O
+Mittelklasse O
+zu O
+etablieren O
+, O
+auch O
+nach O
+Deutschland B-LOC
+. O
+
+– O
+4:26 O
+# O
+Sometime B-OTH
+Ago/La I-OTH
+Fiesta I-OTH
+– O
+23:18 O
+Alle O
+Stücke O
+wurden O
+von O
+Corea B-PER
+komponiert O
+mit O
+Ausnahme O
+der O
+einleitenden O
+Improvisation O
+zu O
+Sometime B-OTH
+Ago I-OTH
+. O
+
+Bis O
+2013 O
+steigen O
+die O
+Mittel O
+aus O
+dem O
+EU-Budget B-ORGpart
+auf O
+rund O
+120 O
+Millionen O
+Euro B-OTH
+. O
+
+Daraus O
+entwickelte O
+sich O
+im O
+Rokoko B-OTH
+die O
+Sitte O
+des O
+gemeinsamen O
+Weinens O
+im O
+Theater O
+, O
+das O
+die O
+Standesgrenzen O
+innerhalb O
+des O
+Publikums O
+überbrücken O
+sollte O
+. O
+
+Die O
+Spinne O
+hatte O
+sie O
+mit O
+Seidenfäden O
+an O
+ihrem O
+Schwanz O
+gefesselt O
+und O
+nach O
+oben O
+gezogen O
+. O
+
+In O
+Deutschland B-LOC
+ist O
+nach O
+StGB O
+eine O
+Anwerbung O
+für O
+die O
+Fremdenlegion O
+strafbar O
+. O
+
+Am O
+Donnerstag O
+wird O
+sich O
+zeigen O
+, O
+ob O
+die O
+Idee O
+der O
+DLR-Forscher B-ORGpart
+funktioniert O
+. O
+
+Der O
+sechste O
+Lauf O
+der O
+ADAC B-ORG
+GT I-ORG
+Mastersstand O
+ganz O
+klar O
+im O
+Mittelpunkt O
+des O
+Motorsport-Wochenendes O
+auf O
+dem O
+Eurospeedway B-ORG
+Lausitz I-ORG
+. O
+
+Nach O
+den O
+schwächeren O
+Vorgaben O
+der O
+Wall B-ORG
+Street I-ORG
+vom O
+Vortag O
+setzten O
+die O
+deutschen B-LOCderiv
+Standardwerte O
+ihren O
+Konsolidierungskurs O
+fort O
+. O
+
+Kolb B-PER
+war O
+seit O
+1986 O
+im O
+Turnverein O
+als O
+Leiter O
+tätig O
+, O
+darunter O
+elf O
+Jahre O
+als O
+Hauptleiter O
+in O
+der O
+Männerriege O
+. O
--- a/tests/fixtures/tests_samples/GermEval/labels.txt
+++ b/tests/fixtures/tests_samples/GermEval/labels.txt
@@ -0,0 +1,25 @@
+B-LOC
+B-LOCderiv
+B-LOCpart
+B-ORG
+B-ORGderiv
+B-ORGpart
+B-OTH
+B-OTHderiv
+B-OTHpart
+B-PER
+B-PERderiv
+B-PERpart
+I-LOC
+I-LOCderiv
+I-LOCpart
+I-ORG
+I-ORGderiv
+I-ORGpart
+I-OTH
+I-OTHderiv
+I-OTHpart
+I-PER
+I-PERderiv
+I-PERpart
+O
--- a/tests/fixtures/tests_samples/GermEval/train.txt
+++ b/tests/fixtures/tests_samples/GermEval/train.txt
@@ -0,0 +1,200 @@
+Schartau B-PER
+sagte O
+dem O
+" O
+Tagesspiegel B-ORG
+" O
+vom O
+Freitag O
+, O
+Fischer B-PER
+sei O
+" O
+in O
+einer O
+Weise O
+aufgetreten O
+, O
+die O
+alles O
+andere O
+als O
+überzeugend O
+war O
+" O
+. O
+
+Firmengründer O
+Wolf B-PER
+Peter I-PER
+Bree I-PER
+arbeitete O
+Anfang O
+der O
+siebziger O
+Jahre O
+als O
+Möbelvertreter O
+, O
+als O
+er O
+einen O
+fliegenden O
+Händler O
+aus O
+dem O
+Libanon B-LOC
+traf O
+. O
+
+Ob O
+sie O
+dabei O
+nach O
+dem O
+Runden O
+Tisch O
+am O
+23. O
+April O
+in O
+Berlin B-LOC
+durch O
+ein O
+pädagogisches O
+Konzept O
+unterstützt O
+wird O
+, O
+ist O
+allerdings O
+zu O
+bezweifeln O
+. O
+
+Bayern B-ORG
+München I-ORG
+ist O
+wieder O
+alleiniger O
+Top- O
+Favorit O
+auf O
+den O
+Gewinn O
+der O
+deutschen B-LOCderiv
+Fußball-Meisterschaft O
+. O
+
+Dabei O
+hätte O
+der O
+tapfere O
+Schlussmann O
+allen O
+Grund O
+gehabt O
+, O
+sich O
+viel O
+früher O
+aufzuregen O
+. O
+
+ARD-Programmchef B-ORGpart
+Günter B-PER
+Struve I-PER
+war O
+wegen O
+eines O
+vierwöchigen O
+Urlaubs O
+für O
+eine O
+Stellungnahme O
+nicht O
+erreichbar O
+. O
+
+Alternativ O
+sollten O
+sich O
+die O
+Restaurantbetreiber O
+aus O
+Sicht O
+der O
+Solingerin B-LOCderiv
+zu O
+längeren O
+Öffnungszeiten O
+verpflichten O
+, O
+um O
+wartende O
+Kunden O
+aufzunehmen O
+. O
+
+Die O
+Deutsche B-ORG
+Flugsicherung I-ORG
+( O
+DFS B-ORG
+) O
+beschloss O
+ein O
+Flugverbot O
+für O
+alle O
+internationalen O
+Flughäfen O
+mit O
+Ausnahme O
+der O
+beiden O
+Berliner B-LOCderiv
+Flughäfen O
+bis O
+2.00 O
+Uhr O
+nachts O
+. O
+
+New O
+Small O
+Family O
+mit O
+E-Motor O
+: O
+Studie O
+E-Up O
+! O
+
+Eine O
+Schwachstelle O
+war O
+beispielsweise O
+der O
+Spiegelkasten O
+. O
+
+Denn O
+durch O
+den O
+Einsatz O
+moderner O
+Fahrzeugtechnik O
+( O
+Dieseltriebwagen O
+) O
+und O
+schalldämmender O
+Fenster O
+entsteht O
+keine O
+Einschränkung O
+der O
+Wohnqualität O
+. O
--- a/tests/fixtures/tests_samples/MRPC/dev.csv
+++ b/tests/fixtures/tests_samples/MRPC/dev.csv
@@ -0,0 +1,7 @@
+label,sentence1,sentence2
+equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
+not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
+not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
+equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
+equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/tests/fixtures/tests_samples/MRPC/dev.tsv
+++ b/tests/fixtures/tests_samples/MRPC/dev.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/tests/fixtures/tests_samples/MRPC/train.csv
+++ b/tests/fixtures/tests_samples/MRPC/train.csv
@@ -0,0 +1,7 @@
+label,sentence1,sentence2
+equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ."
+not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ."
+not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ."
+equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ."
+equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/tests/fixtures/tests_samples/MRPC/train.tsv
+++ b/tests/fixtures/tests_samples/MRPC/train.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/tests/fixtures/tests_samples/SQUAD/sample.json
+++ b/tests/fixtures/tests_samples/SQUAD/sample.json
@@ -0,0 +1,201 @@
+{
+    "version": 2.0,
+    "data": [
+        {
+            "id": "56ddde6b9a695914005b9628",
+            "question": "In what country is Normandy located?",
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.",
+            "answers": {
+                "answer_start": [
+                    159,
+                    159,
+                    159,
+                    159
+                ],
+                "text": [
+                    "France",
+                    "France",
+                    "France",
+                    "France"
+                ]
+            }
+        },
+        {
+            "id": "56ddde6b9a695914005b9629",
+            "question": "When were the Normans in Normandy?",
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.",
+            "answers": {
+                "answer_start": [
+                    94,
+                    87,
+                    94,
+                    94
+                ],
+                "text": [
+                    "10th and 11th centuries",
+                    "in the 10th and 11th centuries",
+                    "10th and 11th centuries",
+                    "10th and 11th centuries"
+                ]
+            }
+        },
+        {
+            "id": "56ddde6b9a695914005b962a",
+            "question": "From which countries did the Norse originate?",
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.",
+            "answers": {
+                "answer_start": [
+                    256,
+                    256,
+                    256,
+                    256
+                ],
+                "text": [
+                    "Denmark, Iceland and Norway",
+                    "Denmark, Iceland and Norway",
+                    "Denmark, Iceland and Norway",
+                    "Denmark, Iceland and Norway"
+                ]
+            }
+        },
+        {
+            "id": "5ad39d53604f3c001a3fe8d3",
+            "question": "Who did King Charles III swear fealty to?",
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.",
+            "answers": {
+                "answer_start": [],
+                "text": []
+            }
+        },
+        {
+            "id": "5ad39d53604f3c001a3fe8d4",
+            "question": "When did the Frankish identity emerge?",
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.",
+            "answers": {
+                "answer_start": [],
+                "text": []
+            }
+        },
+        {
+            "id": "56dddf4066d3e219004dad5f",
+            "question": "Who was the duke in the battle of Hastings?",
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands.",
+            "answers": {
+                "answer_start": [
+                    1022,
+                    1022,
+                    1022
+                ],
+                "text": [
+                    "William the Conqueror",
+                    "William the Conqueror",
+                    "William the Conqueror"
+                ]
+            }
+        },
+        {
+            "id": "5ad3a266604f3c001a3fea2b",
+            "question": "What principality did William the conquerer found?",
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands.",
+            "answers": {
+                "answer_start": [],
+                "text": []
+            }
+        },
+        {
+            "id": "56e16182e3433e1400422e28",
+            "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm.",
+            "answers": {
+                "answer_start": [
+                    0,
+                    0,
+                    0
+                ],
+                "text": [
+                    "Computational complexity theory",
+                    "Computational complexity theory",
+                    "Computational complexity theory"
+                ]
+            }
+        },
+        {
+            "id": "5ad5316b5b96ef001a10ab76",
+            "question": "What is a manual application of mathematical steps?",
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm.",
+            "answers": {
+                "answer_start": [],
+                "text": []
+            }
+        },
+        {
+            "id": "56e16839cd28a01900c67887",
+            "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.",
+            "answers": {
+                "answer_start": [
+                    46,
+                    49,
+                    46
+                ],
+                "text": [
+                    "if its solution requires significant resources",
+                    "its solution requires significant resources",
+                    "if its solution requires significant resources"
+                ]
+            }
+        },
+        {
+            "id": "56e16839cd28a01900c67888",
+            "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.",
+            "answers": {
+                "answer_start": [
+                    176,
+                    176,
+                    176
+                ],
+                "text": [
+                    "mathematical models of computation",
+                    "mathematical models of computation",
+                    "mathematical models of computation"
+                ]
+            }
+        },
+        {
+            "id": "56e16839cd28a01900c67889",
+            "question": "What are two basic primary resources used to guage complexity?",
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.",
+            "answers": {
+                "answer_start": [
+                    305,
+                    305,
+                    305
+                ],
+                "text": [
+                    "time and storage",
+                    "time and storage",
+                    "time and storage"
+                ]
+            }
+        },
+        {
+            "id": "5ad532575b96ef001a10ab7f",
+            "question": "What unit is measured to determine circuit simplicity?",
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.",
+            "answers": {
+                "answer_start": [],
+                "text": []
+            }
+        },
+        {
+            "id": "5ad532575b96ef001a10ab80",
+            "question": "What number is used in perpendicular computing?",
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.",
+            "answers": {
+                "answer_start": [],
+                "text": []
+            }
+        }
+    ]
+}
--- a/tests/fixtures/tests_samples/STS-B/dev.tsv
+++ b/tests/fixtures/tests_samples/STS-B/dev.tsv
@@ -0,0 +1,10 @@
+index	genre	filename	year	old_index	source1	source2	sentence1	sentence2	score
+0	main-captions	MSRvid	2012test	0000	none	none	A man with a hard hat is dancing.	A man wearing a hard hat is dancing.	5.000
+1	main-captions	MSRvid	2012test	0002	none	none	A young child is riding a horse.	A child is riding a horse.	4.750
+2	main-captions	MSRvid	2012test	0003	none	none	A man is feeding a mouse to a snake.	The man is feeding a mouse to the snake.	5.000
+3	main-captions	MSRvid	2012test	0007	none	none	A woman is playing the guitar.	A man is playing guitar.	2.400
+4	main-captions	MSRvid	2012test	0008	none	none	A woman is playing the flute.	A man is playing a flute.	2.750
+5	main-captions	MSRvid	2012test	0010	none	none	A woman is cutting an onion.	A man is cutting onions.	2.615
+6	main-captions	MSRvid	2012test	0015	none	none	A man is erasing a chalk board.	The man is erasing the chalk board.	5.000
+7	main-captions	MSRvid	2012test	0023	none	none	A woman is carrying a boy.	A woman is carrying her baby.	2.333
+8	main-captions	MSRvid	2012test	0027	none	none	Three men are playing guitars.	Three men are on stage playing guitars.	3.750
--- a/tests/fixtures/tests_samples/STS-B/train.tsv
+++ b/tests/fixtures/tests_samples/STS-B/train.tsv
@@ -0,0 +1,10 @@
+index	genre	filename	year	old_index	source1	source2	sentence1	sentence2	score
+0	main-captions	MSRvid	2012test	0001	none	none	A plane is taking off.	An air plane is taking off.	5.000
+1	main-captions	MSRvid	2012test	0004	none	none	A man is playing a large flute.	A man is playing a flute.	3.800
+2	main-captions	MSRvid	2012test	0005	none	none	A man is spreading shreded cheese on a pizza.	A man is spreading shredded cheese on an uncooked pizza.	3.800
+3	main-captions	MSRvid	2012test	0006	none	none	Three men are playing chess.	Two men are playing chess.	2.600
+4	main-captions	MSRvid	2012test	0009	none	none	A man is playing the cello.	A man seated is playing the cello.	4.250
+5	main-captions	MSRvid	2012test	0011	none	none	Some men are fighting.	Two men are fighting.	4.250
+6	main-captions	MSRvid	2012test	0012	none	none	A man is smoking.	A man is skating.	0.500
+7	main-captions	MSRvid	2012test	0013	none	none	The man is playing the piano.	The man is playing the guitar.	1.600
+8	main-captions	MSRvid	2012test	0014	none	none	A man is playing on a guitar and singing.	A woman is playing an acoustic guitar and singing.	2.200
--- a/tests/fixtures/tests_samples/conll/sample.json
+++ b/tests/fixtures/tests_samples/conll/sample.json
@@ -0,0 +1,10 @@
+{"words": ["He", "was", "the", "27th", "pitcher", "used", "by", "the", "Angels", "this", "season", ",", "tying", "a", "major-league", "record", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "B-ORG", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"words": ["CHICAGO", "AT", "ATLANTA"], "ner": ["B-ORG", "O", "B-LOC"]}
+{"words": ["President", "Bill", "Clinton", "earlier", "this", "month", "invoked", "special", "powers", "to", "appoint", "Fowler", "during", "the", "congressional", "recess", "because", "the", "Senate", "delayed", "confirming", "his", "nomination", "."], "ner": ["O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "O", "O", "O", "B-ORG", "O", "O", "O", "O", "O"]}
+{"words": ["goals", "for", ",", "goals", "against", ",", "points", ")", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"words": ["\"", "It", "is", "one", "step", "short", "of", "an", "emergency", "situation", ",", "\"", "a", "police", "spokesman", "said", "via", "telephone", "from", "a", "command", "post", "in", "the", "bush", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"words": ["U.S.", "Ambassador", "Myles", "Frechette", "applauded", "the", "move", ",", "saying", "it", "could", "prompt", "the", "Clinton", "administration", "to", "remove", "Colombia", "from", "a", "list", "of", "outcast", "nations", "that", "have", "failed", "to", "cooperate", "in", "U.S.", "counternarcotics", "efforts", "."], "ner": ["B-LOC", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O"]}
+{"words": ["Halftime"], "ner": ["O"]}
+{"words": ["It", "has", "manufacturing", "plants", "in", "San", "Diego", ";", "Creedmoor", ",", "N.C.", ";", "Hampshire", ",", "England", ";", "and", "Tijuana", ",", "Mexico", ",", "and", "distributes", "its", "prodcuts", "in", "more", "than", "120", "countries", "."], "ner": ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"words": ["Scotland", "manager", "Craig", "Brown", "said", "on", "Thursday", ":", "\"", "I", "'ve", "watched", "Duncan", "Ferguson", "in", "action", "twice", "recently", "and", "he", "'s", "bang", "in", "form", "."], "ner": ["B-LOC", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"words": ["Clinton", "flew", "in", "by", "helicopter", "from", "Michigan", "City", ",", "Indiana", ",", "after", "ending", "a", "four-day", ",", "559-mile", "trip", "aboard", "a", "campaign", "train", "from", "Washington", "."], "ner": ["B-PER", "O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O"]}
--- a/tests/fixtures/tests_samples/swag/sample.json
+++ b/tests/fixtures/tests_samples/swag/sample.json
@@ -0,0 +1,10 @@
+{"ending0": "passes by walking down the street playing their instruments.", "ending1": "has heard approaching them.", "ending2": "arrives and they're outside dancing and asleep.", "ending3": "turns the lead singer watches the performance.", "label": 0, "sent1": "Members of the procession walk down the street holding small horn brass instruments.", "sent2": "A drum line"}
+{"ending0": "are playing ping pong and celebrating one left each in quick.", "ending1": "wait slowly towards the cadets.", "ending2": "continues to play as well along the crowd along with the band being interviewed.", "ending3": "continue to play marching, interspersed.", "label": 3, "sent1": "A drum line passes by walking down the street playing their instruments.", "sent2": "Members of the procession"}
+{"ending0": "pay the other coaches to cheer as people this chatter dips in lawn sheets.", "ending1": "walk down the street holding small horn brass instruments.", "ending2": "is seen in the background.", "ending3": "are talking a couple of people playing a game of tug of war.", "label": 1, "sent1": "A group of members in green uniforms walks waving flags.", "sent2": "Members of the procession"}
+{"ending0": "are playing ping pong and celebrating one left each in quick.", "ending1": "wait slowly towards the cadets.", "ending2": "makes a square call and ends by jumping down into snowy streets where fans begin to take their positions.", "ending3": "play and go back and forth hitting the drums while the audience claps for them.", "label": 3, "sent1": "A drum line passes by walking down the street playing their instruments.", "sent2": "Members of the procession"}
+{"ending0": "finishes the song and lowers the instrument.", "ending1": "hits the saxophone and demonstrates how to properly use the racquet.", "ending2": "finishes massage the instrument again and continues.", "ending3": "continues dancing while the man gore the music outside while drums.", "label": 0, "sent1": "The person plays a song on the violin.", "sent2": "The man"}
+{"ending0": "finishes playing then marches their tenderly.", "ending1": "walks in frame and rubs on his hands, and then walks into a room.", "ending2": "continues playing guitar while moving from the camera.", "ending3": "plays a song on the violin.", "label": 3, "sent1": "The person holds up the violin to his chin and gets ready.", "sent2": "The person"}
+{"ending0": "examines the instrument in his hand.", "ending1": "stops playing the drums and waves over the other boys.", "ending2": "lights the cigarette and sticks his head in.", "ending3": "drags off the vacuum.", "label": 0, "sent1": "A person retrieves an instrument from a closet.", "sent2": "The man"}
+{"ending0": "studies a picture of the man playing the violin.", "ending1": "holds up the violin to his chin and gets ready.", "ending2": "stops to speak to the camera again.", "ending3": "puts his arm around the man and backs away.", "label": 1, "sent1": "The man examines the instrument in his hand.", "sent2": "The person"}
+{"ending0": "hands her another phone.", "ending1": "takes the drink, then holds it.", "ending2": "looks off then looks at someone.", "ending3": "stares blearily down at the floor.", "label": 3, "sent1": "Someone walks over to the radio.", "sent2": "Someone"}
+{"ending0": "looks off then looks at someone.", "ending1": "hands her another phone.", "ending2": "takes the drink, then holds it.", "ending3": "turns on a monitor.", "label": 3, "sent1": "Someone walks over to the radio.", "sent2": "Someone"}
--- a/tests/fixtures/tests_samples/wiki_text/wiki_00
+++ b/tests/fixtures/tests_samples/wiki_text/wiki_00
@@ -0,0 +1,251 @@
+<doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">
+Anarchism
+
+Anarchism is a political philosophy and movement that rejects all involuntary, coercive forms of hierarchy. It radically calls for the abolition of the state which it holds to be undesirable, unnecessary, and harmful.
+
+The history of anarchism stretches back to prehistory, when humans lived in anarchistic societies long before the establishment of formal states, realms or empires. With the rise of organised hierarchical bodies, skepticism toward authority also rose, but it was not until the 19th century that a self-conscious political movement emerged. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in worker's struggles for emancipation. Various anarchist schools of thought formed during this period.
+
+Anarchists took part in several revolutions, most notably in the Spanish Civil War, where they were crushed along with the alliance to restore the Second Republic by the fascist forces of the Nationalist faction and its foreign allies in Nazi Germany, Fascist Italy, Portuguese Dictatorship and the Catholic Church in 1939, marking the end of the classical era of anarchism. In the last decades of the 20th century and into the 21st century, the anarchist movement has been resurgent once more.
+
+Anarchism employs various tactics in order to meet its ideal ends; these can be broadly separated into revolutionary and evolutionary tactics. There is significant overlap between the two, which are merely descriptive. Revolutionary tactics aim to bring down authority and state, and have taken a violent turn in the past. Evolutionary tactics aim to prefigure what an anarchist society would be like. Anarchist thought, criticism, and praxis has played a part in diverse areas of human society.
+
+The etymological origin of "anarchism" is from the Ancient Greek "anarkhia", meaning "without a ruler", composed of the prefix "an-" (i.e. "without") and the word "arkhos" (i.e. "leader" or "ruler"). The suffix "-ism" denotes the ideological current that favours anarchy. "Anarchism" appears in English from 1642 as "anarchisme" and "anarchy" from 1539. Various factions within the French Revolution labelled their opponents as "anarchists", although few such accused shared many views with later anarchists. Many revolutionaries of the 19th century such as William Godwin (1756–1836) and Wilhelm Weitling (1808–1871) would contribute to the anarchist doctrines of the next generation, but they did not use "anarchist" or "anarchism" in describing themselves or their beliefs.
+
+The first political philosopher to call himself an "anarchist" () was Pierre-Joseph Proudhon (1809–1865), marking the formal birth of anarchism in the mid-19th century. Since the 1890s and beginning in France, "libertarianism" has often been used as a synonym for anarchism and its use as a synonym is still common outside the United States. On the other hand, some use "libertarianism" to refer to individualistic free-market philosophy only, referring to free-market anarchism as "libertarian anarchism".
+
+While opposition to the state is central to anarchist thought, defining anarchism is not an easy task as there is a lot of discussion among scholars and anarchists on the matter and various currents perceive anarchism slightly differently. Hence, it might be true to say that anarchism is a cluster of political philosophies opposing authority and hierarchical organization (including the state, capitalism, nationalism and all associated institutions) in the conduct of all human relations in favour of a society based on voluntary association, on freedom and on decentralisation, but this definition has the same shortcomings as the definition based on etymology (which is simply a negation of a ruler), or based on anti-statism (anarchism is much more than that) or even the anti-authoritarian (which is an "a posteriori" conclusion). Nonetheless, major elements of the definition of anarchism include the following:
+
+During the prehistoric era of mankind, an established authority did not exist. It was after the creation of towns and cities that institutions of authority were established and anarchistic ideas espoused as a reaction. Most notable precursors to anarchism in the ancient world were in China and Greece. In China, philosophical anarchism (i.e. the discussion on the legitimacy of the state) was delineated by Taoist philosophers Zhuang Zhou and Laozi.
+
+Likewise, anarchic attitudes were articulated by tragedians and philosophers in Greece. Aeschylus and Sophocles used the myth of Antigone to illustrate the conflict between rules set by the state and personal autonomy. Socrates questioned Athenian authorities constantly and insisted to the right of individual freedom of consciousness. Cynics dismissed human law ("nomos") and associated authorities while trying to live according to nature ("physis"). Stoics were supportive of a society based on unofficial and friendly relations among its citizens without the presence of a state.
+
+During the Middle Ages, there was no anarchistic activity except some ascetic religious movements in the Muslim world or in Christian Europe. This kind of tradition later gave birth to religious anarchism. In the Sasanian Empire, Mazdak called for an egalitarian society and the abolition of monarchy, only to be soon executed by Emperor Kavad I.
+
+In Basra, religious sects preached against the state. In Europe, various sects developed anti-state and libertarian tendencies. Libertarian ideas further emerged during the Renaissance with the spread of reasoning and humanism through Europe. Novelists fictionalised ideal societies that were based not on coercion but voluntarism. The Enlightenment further pushed towards anarchism with the optimism for social progress.
+
+During the French Revolution, partisan groups such as the Enragés and the saw a turning point in the fermentation of anti-state and federalist sentiments. The first anarchist currents developed throughout the 18th century—William Godwin espoused philosophical anarchism in England, morally delegitimizing the state, Max Stirner's thinking paved the way to individualism, and Pierre-Joseph Proudhon's theory of mutualism found fertile soil in France. This era of classical anarchism lasted until the end of the Spanish Civil War of 1936 and is considered the golden age of anarchism.
+Drawing from mutualism, Mikhail Bakunin founded collectivist anarchism and entered the International Workingmen's Association, a class worker union later known as the First International that formed in 1864 to unite diverse revolutionary currents. The International became a significant political force, with Karl Marx being a leading figure and a member of its General Council. Bakunin's faction (the Jura Federation) and Proudhon's followers (the mutualists) opposed Marxist state socialism, advocating political abstentionism and small property holdings. After bitter disputes, the Bakuninists were expelled from the International by the Marxists at the 1872 Hague Congress. Bakunin famously predicted that if revolutionaries gained power by Marx's terms, they would end up the new tyrants of workers. After being expelled, anarchists formed the St. Imier International. Under the influence of Peter Kropotkin, a Russian philosopher and scientist, anarcho-communism overlapped with collectivism. Anarcho-communists, who drew inspiration from the 1871 Paris Commune, advocated for free federation and for the distribution of goods according to one's needs.
+
+At the turn of the century, anarchism had spread all over the world. In China, small groups of students imported the humanistic pro-science version of anarcho-communism. Tokyo was a hotspot for rebellious youth from countries of the far east, travelling to the Japanese capital to study. In Latin America, Argentina was a stronghold for anarcho-syndicalism, where it became the most prominent left-wing ideology. During this time, a minority of anarchists adopted tactics of revolutionary political violence. This strategy became known as propaganda of the deed. The dismemberment of the French socialist movement into many groups, and the execution and exile of many Communards to penal colonies following the suppression of the Paris Commune, favoured individualist political expression and acts. Even though many anarchists distanced themselves from these terrorist acts, infamy came upon the movement. Illegalism was another strategy which some anarchists adopted during this period.
+Anarchists enthusiastically participated in the Russian Revolution—despite concerns—in opposition to the Whites. However, they met harsh suppression after the Bolshevik government was stabilized. Several anarchists from Petrograd and Moscow fled to Ukraine, notably leading to the Kronstadt rebellion and Nestor Makhno's struggle in the Free Territory. With the anarchists being crushed in Russia, two new antithetical currents emerged, namely platformism and synthesis anarchism. The former sought to create a coherent group that would push for revolution while the latter were against anything that would resemble a political party. Seeing the victories of the Bolsheviks in the October Revolution and the resulting Russian Civil War, many workers and activists turned to communist parties, which grew at the expense of anarchism and other socialist movements. In France and the United States, members of major syndicalist movements, the General Confederation of Labour and Industrial Workers of the World, left their organisations and joined the Communist International.
+
+In the Spanish Civil War, anarchists and syndicalists (CNT and FAI) once again allied themselves with various currents of leftists. A long tradition of Spanish anarchism led to anarchists playing a pivotal role in the war. In response to the army rebellion, an anarchist-inspired movement of peasants and workers, supported by armed militias, took control of Barcelona and of large areas of rural Spain, where they collectivised the land. The Soviet Union provided some limited assistance at the beginning of the war, but the result was a bitter fight among communists and anarchists at a series of events named May Days as Joseph Stalin tried to seize control of the Republicans.
+
+At the end of World War II, the anarchist movement was severely weakened. However, the 1960s witnessed a revival of anarchism likely caused by a perceived failure of Marxism–Leninism and tensions built by the Cold War. During this time, anarchism took root in other movements critical towards both the state and capitalism, such as the anti-nuclear, environmental and pacifist movements, the New Left, and the counterculture of the 1960s. Anarchism became associated with punk subculture, as exemplified by bands such as Crass and the Sex Pistols, and the established feminist tendencies of anarcha-feminism returned with vigour during the second wave of feminism.
+
+Around the turn of the 21st century, anarchism grew in popularity and influence within anti-war, anti-capitalist, and anti-globalisation movements. Anarchists became known for their involvement in protests against the World Trade Organization, the Group of Eight and the World Economic Forum. During the protests, "ad hoc" leaderless anonymous cadres known as black blocs engaged in rioting, property destruction, and violent confrontations with the police. Other organisational tactics pioneered in this time include security culture, affinity groups, and the use of decentralised technologies such as the internet. A significant event of this period was the confrontations at the WTO conference in Seattle in 1999. Anarchist ideas have been influential in the development of the Zapatistas in Mexico and the Democratic Federation of Northern Syria, more commonly known as Rojava, a "de facto" autonomous region in northern Syria.
+
+Anarchist schools of thought have been generally grouped into two main historical traditions, social anarchism and individualist anarchism, owing to their different origins, values and evolution. The individualist current emphasises negative liberty in opposing restraints upon the free individual, while the social current emphasises positive liberty in aiming to achieve the free potential of society through equality and social ownership. In a chronological sense, anarchism can be segmented by the classical currents of the late 19th century, and the post-classical currents (such as anarcha-feminism, green anarchism and post-anarchism) developed thereafter.
+
+Beyond the specific factions of anarchist movements which constitute political anarchism lies philosophical anarchism, which holds that the state lacks moral legitimacy, without necessarily accepting the imperative of revolution to eliminate it. A component especially of individualist anarchism, philosophical anarchism may tolerate the existence of a minimal state, but argues that citizens have no moral obligation to obey government when it conflicts with individual autonomy. Anarchism pays significant attention to moral arguments since ethics have a central role in anarchist philosophy.
+
+One reaction against sectarianism within the anarchist milieu was anarchism without adjectives, a call for toleration and unity among anarchists first adopted by Fernando Tarrida del Mármol in 1889 in response to the bitter debates of anarchist theory at the time. Despite separation, the various anarchist schools of thought are not seen as distinct entities, but as tendencies that intermingle.
+
+Anarchism is usually placed on the far-left of the political spectrum. Much of its economics and legal philosophy reflect anti-authoritarian, anti-statist, and libertarian interpretations of the radical left-wing and socialist politics of collectivism, communism, individualism, mutualism, and syndicalism, among other libertarian socialist economic theories. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist, and varieties of anarchy diverge widely.
+
+Inceptive currents among classical anarchist currents were mutualism and individualism. They were followed by the major currents of social anarchism (collectivist, communist, and syndicalist). They differ on organizational and economic aspects of their ideal society.
+
+Mutualism is an 18th-century economic theory that was developed into anarchist theory by Pierre-Joseph Proudhon. Its aims include reciprocity, free association, voluntary contract, federation, and credit and currency reform that would be regulated by a bank of the people. Mutualism has been retrospectively characterised as ideologically situated between individualist and collectivist forms of anarchism. Proudhon first characterised his goal as a "third form of society, the synthesis of communism and property".
+
+Collectivist anarchism, also known as anarchist collectivism or anarcho-collectivism, is a revolutionary socialist form of anarchism commonly associated with Mikhail Bakunin. Collectivist anarchists advocate collective ownership of the means of production, theorised to be achieved through violent revolution, and that workers be paid according to time worked, rather than goods being distributed according to need as in communism. Collectivist anarchism arose alongside Marxism, but rejected the dictatorship of the proletariat despite the stated Marxist goal of a collectivist stateless society. Anarcho-communism, also known as anarchist-communism, communist anarchism, and libertarian communism, is a theory of anarchism that advocates a communist society with common ownership of the means of production, direct democracy, and a horizontal network of voluntary associations and workers' councils with production and consumption based on the guiding principle: "From each according to his ability, to each according to his need". Anarcho-communism developed from radical socialist currents after the French Revolution, but it was first formulated as such in the Italian section of the First International. It was later expanded upon in the theoretical work of Peter Kropotkin.
+
+Anarcho-syndicalism, also referred to as revolutionary syndicalism, is a branch of anarchism that views labour syndicates as a potential force for revolutionary social change, replacing capitalism and the state with a new society democratically self-managed by workers. The basic principles of anarcho-syndicalism are workers' solidarity, direct action, and workers' self-management.
+
+Individualist anarchism refers to several traditions of thought within the anarchist movement that emphasise the individual and their will over any kinds of external determinants. Early influences on individualist forms of anarchism include William Godwin, Max Stirner and Henry David Thoreau. Through many countries, individualist anarchism attracted a small yet diverse following of Bohemian artists and intellectuals as well as young anarchist outlaws in what became known as illegalism and individual reclamation.
+
+Anarchist principles undergird contemporary radical social movements of the left. Interest in the anarchist movement developed alongside momentum in the anti-globalization movement, whose leading activist networks were anarchist in orientation. As the movement shaped 21st century radicalism, wider embrace of anarchist principles signaled a revival of interest. Contemporary news coverage which emphasizes black bloc demonstrations has reinforced anarchism's historical association with chaos and violence, although its publicity has also led more scholars to engage with the anarchist movement. Anarchism has continued to generate many philosophies and movements—at times eclectic, drawing upon various sources, and syncretic, combining disparate concepts to create new philosophical approaches. The anti-capitalist tradition of classical anarchism has remained prominent within contemporary currents.
+
+Various anarchist groups, tendencies, and schools of thought exist today, making it difficult to describe contemporary anarchist movement. While theorists and activists have established "relatively stable constellations of anarchist principles", there is no consensus on which principles are core. As a result, commentators describe multiple "anarchisms" (rather than a singular "anarchism") in which common principles are shared between schools of anarchism while each group prioritizes those principles differently. For example, gender equality can be a common principle but ranks as a higher priority to anarcha-feminists than anarchist communists. Anarchists are generally committed against coercive authority in all forms, namely "all centralized and hierarchical forms of government (e.g., monarchy, representative democracy, state socialism, etc.), economic class systems (e.g., capitalism, Bolshevism, feudalism, slavery, etc.), autocratic religions (e.g., fundamentalist Islam, Roman Catholicism, etc.), patriarchy, heterosexism, white supremacy, and imperialism". However, anarchist schools disagree on the methods by which these forms should be opposed.
+
+Anarchists' tactics take various forms but in general serve two major goals—first, to oppose the Establishment; and second, to promote anarchist ethics and reflect an anarchist vision of society, illustrating the unity of means and ends. A broad categorization can be made between aims to destroy oppressive states and institutions by revolutionary means, and aims to change society through evolutionary means. Evolutionary tactics reject violence and take a gradual approach to anarchist aims, though there is significant overlap between the two.
+
+Anarchist tactics have shifted during the course of the last century. Anarchists during the early 20th century focused more on strikes and militancy, while contemporary anarchists use a broader array of approaches.
+
+During the classical era, anarchists had a militant tendency. Not only did they confront state armed forces (as in Spain and Ukraine) but some of them also employed terrorism as propaganda of the deed. Assassination attempts were carried out against heads of state, some of which were successful. Anarchists also took part in revolutions. Anarchist perspectives towards violence have always been perplexing and controversial. On one hand, anarcho-pacifists point out the unity of means and ends. On the other hand, other anarchist groups advocate direct action, a tactic which can include acts of sabotage or even acts of terrorism. This attitude was quite prominent a century ago; seeing the state as a tyrant, some anarchists believed that they had every right to oppose its oppression by any means possible. Emma Goldman and Errico Malatesta, who were proponents of limited use of violence, argued that violence is merely a reaction to state violence as a necessary evil.
+
+Anarchists took an active role in strikes, although they tended to be antipathetic to formal syndicalism, seeing it as reformist. They saw it as a part of the movement which sought to overthrow the state and capitalism. Anarchists also reinforced their propaganda within the arts, some of whom practiced nudism. They also built communities which were based on friendship. They were also involved in the press.
+
+In the current era, Italian anarchist Alfredo Bonanno, a proponent of insurrectionary anarchism, has reinstated the debate on violence by rejecting the nonviolence tactic adopted since the late 19th century by Kropotkin and other prominent anarchists afterwards. Both Bonanno and the French group The Invisible Committee advocate for small, informal affiliation groups, where each member is responsible for their own actions but works together to bring down oppression utilizing sabotage and other violent means against state, capitalism and other enemies. Members of The Invisible Committee were arrested in 2008 on various charges, terrorism included.
+
+Overall, today's anarchists are much less violent and militant than their ideological ancestors. They mostly engage in confronting the police during demonstrations and riots, especially in countries like Canada, Mexico or Greece. Μilitant black bloc protest groups are known for clashing with the police. However, anarchists not only clash with state operators; they also engage in the struggle against fascists and racists, taking anti-fascist action and mobilizing to prevent hate rallies from happening.
+
+Anarchists commonly employ direct action. This can take the form of disrupting and protesting against unjust hierarchy, or the form of self-managing their lives through the creation of counter-institutions such as communes and non-hierarchical collectives. Often, decision-making is handled in an anti-authoritarian way, with everyone having equal say in each decision, an approach known as horizontalism. Contemporary-era anarchists have been engaging with various grassroots movements that are not explicitly anarchist but are more or less based on horizontalism, respecting personal autonomy, and participating in mass activism such as strikes and demonstrations. The newly coined term "small-a anarchism", in contrast with the "big-A anarchism" of the classical era, signals their tendency not to base their thoughts and actions on classical-era anarchism or to refer to Kropotkin or Proudhon to justify their opinions. They would rather base their thought and praxis on their own experience, which they will later theorize.
+
+The decision-making process of small affinity anarchist groups play a significant tactical role. Anarchists have employed various methods in order to build a rough consensus among members of their group, without the need of a leader or a leading group. One way is for an individual from the group to play the role of facilitator to help achieve a consensus without taking part in the discussion themselves or promoting a specific point. Minorities usually accept rough consensus, except when they feel the proposal contradicts anarchist goals, values, or ethics. Anarchists usually form small groups (5–20 individuals) to enhance autonomy and friendships among their members. These kind of groups more often than not interconnect with each other, forming larger networks. Anarchists still support and participate in strikes, especially wildcat strikes; these are leaderless strikes not organised centrally by a syndicate.
+
+Anarchists have gone online to spread their message. As in the past, newspapers and journals are used; however, because of distributional and other difficulties, anarchists have found it easier to create websites, hosting electronic libraries and other portals. Anarchists were also involved in developing various software that are available for free. The way these hacktivists work to develop and distribute resembles the anarchist ideals, especially when it comes to preserving user's privacy from state surveillance.
+
+Anarchists organize themselves to squat and reclaim public spaces. During important events such as protests and when spaces are being occupied, they are often called Temporary Autonomous Zones (TAZ), spaces where surrealism, poetry and art are blended to display the anarchist ideal. As seen by anarchists, squatting is a way to regain urban space from the capitalist market, serving pragmatical needs, and is also seen an exemplary direct action. Acquiring space enables anarchists to experiment with their ideas and build social bonds. Adding up these tactics, and having in mind that not all anarchists share the same attitudes towards them, along with various forms of protesting at highly symbolic events, make up a carnivalesque atmosphere that is part of contemporary anarchist vividity.
+
+As anarchism is a philosophy that embodies many diverse attitudes, tendencies, and schools of thought, and disagreement over questions of values, ideology, and tactics is common, its diversity has led to widely different uses of identical terms among different anarchist traditions, which has created a number of definitional concerns in anarchist theory. For instance, the compatibility of capitalism, nationalism and religion with anarchism is widely disputed. Similarly, anarchism enjoys complex relationships with ideologies such as Marxism, communism, collectivism and trade unionism. Anarchists may be motivated by humanism, divine authority, enlightened self-interest, veganism, or any number of alternative ethical doctrines. Phenomena such as civilisation, technology (e.g. within anarcho-primitivism) and the democratic process may be sharply criticised within some anarchist tendencies and simultaneously lauded in others.
+
+Gender and sexuality carry along them dynamics of hierarchy; anarchism is obliged to address, analyse and oppose the suppression of one's autonomy because of the dynamics that gender roles traditionally impose.
+
+A historical current that arose and flourished during 1890 and 1920 within anarchism was free love; in contemporary anarchism, this current survives as a tendency to support polyamory and queer anarchism. Free love advocates were against marriage, which they saw as a way of men imposing authority over women, largely because marriage law greatly favoured the power of men. The notion of free love, though, was much broader; it included critique of the established order that limited women's sexual freedom and pleasure. Such free love movements contributed to the establishment of communal houses, where large groups of travelers, anarchists, and other activists slept in beds together. Free love had roots both in Europe and the United States. Some anarchists, however, struggled with the jealousy that arose from free love. Anarchist feminists were advocates of free love, against marriage, were pro-choice (utilizing a contemporary term) and had a likewise agenda. Anarchist and non-anarchist feminists differed on suffrage, but were nonetheless supportive of one another.
+
+During the second half of the 20th century, anarchism intermingled with the second wave of feminism, radicalizing some currents of the feminist movement (and being influenced as well). By the latest decades of the 20th century, anarchists and feminists were advocating for the rights and autonomy of women, gays, queers and other marginalized groups, with some feminist thinkers suggesting a fusion of the two currents. With the third wave of feminism, sexual identity and compulsory heterosexuality became a subject of study for anarchists, which yielded a post-structuralist critique of sexual normality. However, some anarchists distanced themselves from this line of thinking, suggesting that it leaned towards individualism and was, therefore, dropping the cause of social liberation.
+
+The interest of anarchists in education stretches back to the first emergence of classical anarchism. Anarchists consider 'proper' education, which sets the foundations of the future autonomy of the individual and the society, to be an act of mutual aid. Anarchist writers such as Willian Godwin and Max Stirner attacked both state education and private education as another means by which the ruling class replicate their privileges.
+
+In 1901, Catalan anarchist and free thinker Francisco Ferrer established the Escuela Moderna in Barcelona as an opposition to the established education system, which was dictated largely by the Catholic Church. Ferrer's approach was secular, rejecting both state and church involvement in the educational process, and gave pupils large amounts of autonomy in planning their work and attendance. Ferrer aimed to educate the working class and explicitly sought to foster class consciousness among students. The school closed after constant harassment by the state and Ferrer was later arrested. His ideas, however, formed the inspiration for a series of modern schools around the world. Christian anarchist Leo Tolstoy also established a similar school, with its founding principle, according to Tolstoy, being that "for education to be effective it had to be free". In a similar token, A. S. Neill founding what became Summerhill School in 1921, also declaring being free from coercion. 
+
+Anarchist education is based largely on the idea that a child's right to develop freely, without manipulation, ought to be respected, and that rationality will lead children to morally good conclusions. However, there has been little consensus among anarchist figures as to what constitutes manipulation; Ferrer, for example, believed that moral indoctrination was necessary and explicitly taught pupils that equality, liberty, and social justice were not possible under capitalism (along with other critiques of nationalism and government). 
+
+Late 20th century and contemporary anarchist writers (such as Colin Ward, Herbert Read and Paul Goodman) intensified and expanded the anarchist critique of state education, largely focusing on the need for a system that focuses on children's creativity rather than on their ability to attain a career or participate in consumer society. Contemporary anarchists, such as Colin Ward, have further argued that state education serves to perpetuate socio-economic inequality.
+
+While few anarchist education institutions have survived to the modern day, major tenets of anarchist schools, such as respect for child autonomy and relying on reasoning rather than indoctrination as a teaching method, have spread among mainstream educational institutions.
+
+Objection to the state and its institutions is a "sine qua non" of anarchism. Anarchists consider the state as a tool of domination and believe it to be illegitimate regardless of its political tendencies. Instead of people being able to control the aspects of their life, major decisions are taken by a small elite. Authority ultimately rests solely on power, regardless of whether that power is open or transparent, as it still has the ability to coerce people. Another anarchist argument against states is that the people constituting a government, even the most altruistic among officials, will unavoidably seek to gain more power, leading to corruption. Anarchists consider the idea that the state is the collective will of the people to be an unachievable fiction, due to the fact that the ruling class is distinct from the rest of society.
+
+The connection between anarchism and art was quite profound during the classical era of anarchism, especially among artistic currents that were developing during that era, such as futurists, surrealists, and others, while in literature anarchism was mostly associated with the New Apocalyptics and the Neo-romanticism movement. In music, anarchism has been associated with music scenes such as Punk. Anarchists such as Leo Tolstoy and Herbert Read argued that the border between the artist and the non-artist, what separates art from a daily act, is a construct produced by the alienation caused by capitalism, and it prevents humans from living a joyful life. 
+
+Other anarchists advocated for or used art as a means to achieve anarchist ends. In his book Breaking the Spell: A History of Anarchist Filmmakers, Videotape Guerrillas, and Digital Ninjas Chris Robé claims that "anarchist-inflected practices have increasingly structured movement-based video activism." 
+
+Three overlapping properties made art useful to anarchists: It could depict a critique of existing society and hierarchies; it could serve as a prefigurative tool to reflect the anarchist ideal society, and also it could turn into a means of direct action, in protests for example. As it appeals to both emotion and reason, art could appeal to the "whole human" and have a powerful effect.
+
+Philosophy lecturer Andrew G. Fiala has listed five main arguments against anarchism. Firstly, he notes that anarchism is related to violence and destruction, not only in the pragmatic world (i.e. at protests) but in the world of ethics as well. The second argument is that it is impossible for a society to function without a state or something like a state, acting to protect citizens from criminality. Fiala takes "Leviathan" from Thomas Hobbes and the night-watchman state from philosopher Robert Nozick as examples. Thirdly, anarchism is evaluated as unfeasible or utopian since the state can not be defeated practically; this line of arguments most often calls for political action within the system to reform it. The fourth argument is that anarchism is self-contradictory since while it advocates for no-one to "archiei", if accepted by the many, then anarchism will turn into the ruling political theory. In this line of criticism also comes the self contradiction that anarchist calls for collective action while anarchism endorses the autonomy of the individual and hence no collective action can be taken. Lastly, Fiala mentions a critique towards philosophical anarchism, of being ineffective (all talk and thoughts) and in the meantime capitalism and bourgeois class remains strong.
+
+Philosophical anarchism has met the criticism of members of academia, following the release of pro-anarchist books such as A. John Simmons' "Moral Principles and Political Obligations" (1979). Law professor William A. Edmundson authored an essay arguing against three major philosophical anarchist principles, which he finds fallacious; Edmundson claims that while the individual does not owe a normal state a duty of obedience, this does not imply that anarchism is the inevitable conclusion, and the state is still morally legitimate.
+
+
+
+
+
+
+
+</doc>
+<doc id="25" url="https://en.wikipedia.org/wiki?curid=25" title="Autism">
+Autism
+
+Autism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior. Parents often notice signs during the first three years of their child's life. These signs often develop gradually, though some children with autism experience worsening in their communication and social skills after reaching developmental milestones at a normal pace.
+Autism is associated with a combination of genetic and environmental factors. Risk factors during pregnancy include certain infections, such as rubella, toxins including valproic acid, alcohol, cocaine, pesticides, lead, and air pollution, fetal growth restriction, and autoimmune diseases. Controversies surround other proposed environmental causes; for example, the vaccine hypothesis, which has been disproven. Autism affects information processing in the brain and how nerve cells and their synapses connect and organize; how this occurs is not well understood. The Diagnostic and Statistical Manual of Mental Disorders (DSM-5), combines autism and less severe forms of the condition, including Asperger syndrome and pervasive developmental disorder not otherwise specified (PDD-NOS) into the diagnosis of autism spectrum disorder (ASD).
+Early behavioral interventions or speech therapy can help children with autism gain self-care, social, and communication skills. Although there is no known cure, there have been cases of children who recovered. Some autistic adults are unable to live independently. An autistic culture has developed, with some individuals seeking a cure and others believing autism should be accepted as a difference to be accommodated instead of cured.
+Globally, autism is estimated to affect 24.8 million people . In the 2000s, the number of people affected was estimated at 1–2 per 1,000 people worldwide. In the developed countries, about 1.5% of children are diagnosed with ASD , from 0.7% in 2000 in the United States. It occurs four-to-five times more often in males than females. The number of people diagnosed has increased dramatically since the 1960s, which may be partly due to changes in diagnostic practice. The question of whether actual rates have increased is unresolved.
+Autism is a highly variable, neurodevelopmental disorder whose symptoms first appears during infancy or childhood, and generally follows a steady course without remission. People with autism may be severely impaired in some respects but average, or even superior, in others. Overt symptoms gradually begin after the age of six months, become established by age two or three years and tend to continue through adulthood, although often in more muted form. It is distinguished by a characteristic triad of symptoms: impairments in social interaction, impairments in communication, and repetitive behavior. Other aspects, such as atypical eating, are also common but are not essential for diagnosis. Individual symptoms of autism occur in the general population and appear not to associate highly, without a sharp line separating pathologically severe from common traits.
+
+Social deficits distinguish autism and the related autism spectrum disorders (ASD; see Classification) from other developmental disorders. People with autism have social impairments and often lack the intuition about others that many people take for granted. Noted autistic Temple Grandin described her inability to understand the social communication of neurotypicals, or people with typical neural development, as leaving her feeling "like an anthropologist on Mars".
+
+Unusual social development becomes apparent early in childhood. Autistic infants show less attention to social stimuli, smile and look at others less often, and respond less to their own name. Autistic toddlers differ more strikingly from social norms; for example, they have less eye contact and turn-taking, and do not have the ability to use simple movements to express themselves, such as pointing at things. Three- to five-year-old children with autism are less likely to exhibit social understanding, approach others spontaneously, imitate and respond to emotions, communicate nonverbally, and take turns with others. However, they do form attachments to their primary caregivers. Most children with autism display moderately less attachment security than neurotypical children, although this difference disappears in children with higher mental development or less pronounced autistic traits. Older children and adults with ASD perform worse on tests of face and emotion recognition although this may be partly due to a lower ability to define a person's own emotions.
+
+Children with high-functioning autism have more intense and frequent loneliness compared to non-autistic peers, despite the common belief that children with autism prefer to be alone. Making and maintaining friendships often proves to be difficult for those with autism. For them, the quality of friendships, not the number of friends, predicts how lonely they feel. Functional friendships, such as those resulting in invitations to parties, may affect the quality of life more deeply.
+There are many anecdotal reports, but few systematic studies, of aggression and violence in individuals with ASD. The limited data suggest that, in children with intellectual disability, autism is associated with aggression, destruction of property, and meltdowns.
+
+About a third to a half of individuals with autism do not develop enough natural speech to meet their daily communication needs. Differences in communication may be present from the first year of life, and may include delayed onset of babbling, unusual gestures, diminished responsiveness, and vocal patterns that are not synchronized with the caregiver. In the second and third years, children with autism have less frequent and less diverse babbling, consonants, words, and word combinations; their gestures are less often integrated with words. Children with autism are less likely to make requests or share experiences, and are more likely to simply repeat others' words (echolalia) or reverse pronouns. Joint attention seems to be necessary for functional speech, and deficits in joint attention seem to distinguish infants with ASD. For example, they may look at a pointing hand instead of the pointed-at object, and they consistently fail to point at objects in order to comment on or share an experience. Children with autism may have difficulty with imaginative play and with developing symbols into language.
+
+In a pair of studies, high-functioning children with autism aged 8–15 performed equally well as, and as adults better than, individually matched controls at basic language tasks involving vocabulary and spelling. Both autistic groups performed worse than controls at complex language tasks such as figurative language, comprehension and inference. As people are often sized up initially from their basic language skills, these studies suggest that people speaking to autistic individuals are more likely to overestimate what their audience comprehends.
+
+Autistic individuals can display many forms of repetitive or restricted behavior, which the Repetitive Behavior Scale-Revised (RBS-R) categorizes as follows.
+
+
+No single repetitive or self-injurious behavior seems to be specific to autism, but autism appears to have an elevated pattern of occurrence and severity of these behaviors.
+
+Autistic individuals may have symptoms that are independent of the diagnosis, but that can affect the individual or the family.
+An estimated 0.5% to 10% of individuals with ASD show unusual abilities, ranging from splinter skills such as the memorization of trivia to the extraordinarily rare talents of prodigious autistic savants. Many individuals with ASD show superior skills in perception and attention, relative to the general population. Sensory abnormalities are found in over 90% of those with autism, and are considered core features by some, although there is no good evidence that sensory symptoms differentiate autism from other developmental disorders. Differences are greater for under-responsivity (for example, walking into things) than for over-responsivity (for example, distress from loud noises) or for sensation seeking (for example, rhythmic movements). An estimated 60–80% of autistic people have motor signs that include poor muscle tone, poor motor planning, and toe walking; deficits in motor coordination are pervasive across ASD and are greater in autism proper. Unusual eating behavior occurs in about three-quarters of children with ASD, to the extent that it was formerly a diagnostic indicator. Selectivity is the most common problem, although eating rituals and food refusal also occur.
+
+There is tentative evidence that autism occurs more frequently in people with gender dysphoria.
+
+Gastrointestinal problems are one of the most commonly associated medical disorders in people with autism. These are linked to greater social impairment, irritability, behavior and sleep problems, language impairments and mood changes.
+
+Parents of children with ASD have higher levels of stress. Siblings of children with ASD report greater admiration of and less conflict with the affected sibling than siblings of unaffected children and were similar to siblings of children with Down syndrome in these aspects of the sibling relationship. However, they reported lower levels of closeness and intimacy than siblings of children with Down syndrome; siblings of individuals with ASD have greater risk of negative well-being and poorer sibling relationships as adults.
+
+It has long been presumed that there is a common cause at the genetic, cognitive, and neural levels for autism's characteristic triad of symptoms. However, there is increasing suspicion that autism is instead a complex disorder whose core aspects have distinct causes that often co-occur.
+Autism has a strong genetic basis, although the genetics of autism are complex and it is unclear whether ASD is explained more by rare mutations with major effects, or by rare multigene interactions of common genetic variants. Complexity arises due to interactions among multiple genes, the environment, and epigenetic factors which do not change DNA sequencing but are heritable and influence gene expression. Many genes have been associated with autism through sequencing the genomes of affected individuals and their parents. Studies of twins suggest that heritability is 0.7 for autism and as high as 0.9 for ASD, and siblings of those with autism are about 25 times more likely to be autistic than the general population. However, most of the mutations that increase autism risk have not been identified. Typically, autism cannot be traced to a Mendelian (single-gene) mutation or to a single chromosome abnormality, and none of the genetic syndromes associated with ASDs have been shown to selectively cause ASD. Numerous candidate genes have been located, with only small effects attributable to any particular gene. Most loci individually explain less than 1% of cases of autism. The large number of autistic individuals with unaffected family members may result from spontaneous structural variation—such as deletions, duplications or inversions in genetic material during meiosis. Hence, a substantial fraction of autism cases may be traceable to genetic causes that are highly heritable but not inherited: that is, the mutation that causes the autism is not present in the parental genome. Autism may be underdiagnosed in women and girls due to an assumption that it is primarily a male condition, but genetic phenomena such as imprinting and X linkage have the ability to raise the frequency and severity of conditions in males, and theories have been put forward for a genetic reason why males are diagnosed more often, such as the imprinted brain theory and the extreme male brain theory.
+
+Maternal nutrition and inflammation during preconception and pregnancy influences fetal neurodevelopment. Intrauterine growth restriction is associated with ASD, in both term and preterm infants. Maternal inflammatory and autoimmune diseases may damage fetal tissues, aggravating a genetic problem or damaging the nervous system.
+
+Exposure to air pollution during pregnancy, especially heavy metals and particulates, may increase the risk of autism. Environmental factors that have been claimed without evidence to contribute to or exacerbate autism include certain foods, infectious diseases, solvents, PCBs, phthalates and phenols used in plastic products, pesticides, brominated flame retardants, alcohol, smoking, illicit drugs, vaccines, and prenatal stress. Some, such as the MMR vaccine, have been completely disproven.
+
+Parents may first become aware of autistic symptoms in their child around the time of a routine vaccination. This has led to unsupported theories blaming vaccine "overload", a vaccine preservative, or the MMR vaccine for causing autism. The latter theory was supported by a litigation-funded study that has since been shown to have been "an elaborate fraud". Although these theories lack convincing scientific evidence and are biologically implausible, parental concern about a potential vaccine link with autism has led to lower rates of childhood immunizations, outbreaks of previously controlled childhood diseases in some countries, and the preventable deaths of several children.
+
+Autism's symptoms result from maturation-related changes in various systems of the brain. How autism occurs is not well understood. Its mechanism can be divided into two areas: the pathophysiology of brain structures and processes associated with autism, and the neuropsychological linkages between brain structures and behaviors. The behaviors appear to have multiple pathophysiologies.
+
+There is evidence that gut–brain axis abnormalities may be involved. A 2015 review proposed that immune dysregulation, gastrointestinal inflammation, malfunction of the autonomic nervous system, gut flora alterations, and food metabolites may cause brain neuroinflammation and dysfunction. A 2016 review concludes that enteric nervous system abnormalities might play a role in neurological disorders such as autism. Neural connections and the immune system are a pathway that may allow diseases originated in the intestine to spread to the brain.
+
+Several lines of evidence point to synaptic dysfunction as a cause of autism. Some rare mutations may lead to autism by disrupting some synaptic pathways, such as those involved with cell adhesion. Gene replacement studies in mice suggest that autistic symptoms are closely related to later developmental steps that depend on activity in synapses and on activity-dependent changes. All known teratogens (agents that cause birth defects) related to the risk of autism appear to act during the first eight weeks from conception, and though this does not exclude the possibility that autism can be initiated or affected later, there is strong evidence that autism arises very early in development.
+
+Diagnosis is based on behavior, not cause or mechanism. Under the DSM-5, autism is characterized by persistent deficits in social communication and interaction across multiple contexts, as well as restricted, repetitive patterns of behavior, interests, or activities. These deficits are present in early childhood, typically before age three, and lead to clinically significant functional impairment. Sample symptoms include lack of social or emotional reciprocity, stereotyped and repetitive use of language or idiosyncratic language, and persistent preoccupation with unusual objects. The disturbance must not be better accounted for by Rett syndrome, intellectual disability or global developmental delay. ICD-10 uses essentially the same definition.
+
+Several diagnostic instruments are available. Two are commonly used in autism research: the Autism Diagnostic Interview-Revised (ADI-R) is a semistructured parent interview, and the Autism Diagnostic Observation Schedule (ADOS) uses observation and interaction with the child. The Childhood Autism Rating Scale (CARS) is used widely in clinical environments to assess severity of autism based on observation of children. The Diagnostic interview for social and communication disorders (DISCO) may also be used.
+
+A pediatrician commonly performs a preliminary investigation by taking developmental history and physically examining the child. If warranted, diagnosis and evaluations are conducted with help from ASD specialists, observing and assessing cognitive, communication, family, and other factors using standardized tools, and taking into account any associated medical conditions. A pediatric neuropsychologist is often asked to assess behavior and cognitive skills, both to aid diagnosis and to help recommend educational interventions. A differential diagnosis for ASD at this stage might also consider intellectual disability, hearing impairment, and a specific language impairment such as Landau–Kleffner syndrome. The presence of autism can make it harder to diagnose coexisting psychiatric disorders such as depression.
+
+Clinical genetics evaluations are often done once ASD is diagnosed, particularly when other symptoms already suggest a genetic cause. Although genetic technology allows clinical geneticists to link an estimated 40% of cases to genetic causes, consensus guidelines in the US and UK are limited to high-resolution chromosome and fragile X testing. A genotype-first model of diagnosis has been proposed, which would routinely assess the genome's copy number variations. As new genetic tests are developed several ethical, legal, and social issues will emerge. Commercial availability of tests may precede adequate understanding of how to use test results, given the complexity of autism's genetics. Metabolic and neuroimaging tests are sometimes helpful, but are not routine.
+
+ASD can sometimes be diagnosed by age 14 months, although diagnosis becomes increasingly stable over the first three years of life: for example, a one-year-old who meets diagnostic criteria for ASD is less likely than a three-year-old to continue to do so a few years later. In the UK the National Autism Plan for Children recommends at most 30 weeks from first concern to completed diagnosis and assessment, though few cases are handled that quickly in practice. Although the symptoms of autism and ASD begin early in childhood, they are sometimes missed; years later, adults may seek diagnoses to help them or their friends and family understand themselves, to help their employers make adjustments, or in some locations to claim disability living allowances or other benefits. Girls are often diagnosed later than boys.
+
+Underdiagnosis and overdiagnosis are problems in marginal cases, and much of the recent increase in the number of reported ASD cases is likely due to changes in diagnostic practices. The increasing popularity of drug treatment options and the expansion of benefits has given providers incentives to diagnose ASD, resulting in some overdiagnosis of children with uncertain symptoms. Conversely, the cost of screening and diagnosis and the challenge of obtaining payment can inhibit or delay diagnosis. It is particularly hard to diagnose autism among the visually impaired, partly because some of its diagnostic criteria depend on vision, and partly because autistic symptoms overlap with those of common blindness syndromes or blindisms.
+
+Autism is one of the five pervasive developmental disorders (PDD), which are characterized by widespread abnormalities of social interactions and communication, and severely restricted interests and highly repetitive behavior. These symptoms do not imply sickness, fragility, or emotional disturbance.
+
+Of the five PDD forms, Asperger syndrome is closest to autism in signs and likely causes; Rett syndrome and childhood disintegrative disorder share several signs with autism, but may have unrelated causes; PDD not otherwise specified (PDD-NOS; also called "atypical autism") is diagnosed when the criteria are not met for a more specific disorder. Unlike with autism, people with Asperger syndrome have no substantial delay in language development. The terminology of autism can be bewildering, with autism, Asperger syndrome and PDD-NOS often called the "autism spectrum disorders" (ASD) or sometimes the "autistic disorders", whereas autism itself is often called "autistic disorder", "childhood autism", or "infantile autism". In this article, "autism" refers to the classic autistic disorder; in clinical practice, though, "autism", "ASD", and "PDD" are often used interchangeably. ASD, in turn, is a subset of the broader autism phenotype, which describes individuals who may not have ASD but do have autistic-like traits, such as avoiding eye contact.
+
+Autism can also be divided into syndromal and non-syndromal autism; the syndromal autism is associated with severe or profound intellectual disability or a congenital syndrome with physical symptoms, such as tuberous sclerosis. Although individuals with Asperger syndrome tend to perform better cognitively than those with autism, the extent of the overlap between Asperger syndrome, HFA, and non-syndromal autism is unclear.
+
+Some studies have reported diagnoses of autism in children due to a loss of language or social skills, as opposed to a failure to make progress, typically from 15 to 30 months of age. The validity of this distinction remains controversial; it is possible that regressive autism is a specific subtype, or that there is a continuum of behaviors between autism with and without regression.
+
+Research into causes has been hampered by the inability to identify biologically meaningful subgroups within the autistic population and by the traditional boundaries between the disciplines of psychiatry, psychology, neurology and pediatrics. Newer technologies such as fMRI and diffusion tensor imaging can help identify biologically relevant phenotypes (observable traits) that can be viewed on brain scans, to help further neurogenetic studies of autism; one example is lowered activity in the fusiform face area of the brain, which is associated with impaired perception of people versus objects. It has been proposed to classify autism using genetics as well as behavior.
+
+Autism has long been thought to cover a wide spectrum, ranging from individuals with severe impairments—who may be silent, developmentally disabled, and prone to frequent repetitive behavior such as hand flapping and rocking—to high functioning individuals who may have active but distinctly odd social approaches, narrowly focused interests, and verbose, pedantic communication. Because the behavior spectrum is continuous, boundaries between diagnostic categories are necessarily somewhat arbitrary. Sometimes the syndrome is divided into low-, medium- or high-functioning autism (LFA, MFA, and HFA), based on IQ thresholds. Some people have called for an end to the terms "high-functioning" and "low-functioning" due to lack of nuance and the potential for a person's needs or abilities to be overlooked.
+
+About half of parents of children with ASD notice their child's unusual behaviors by age 18 months, and about four-fifths notice by age 24 months. According to an article, failure to meet any of the following milestones "is an absolute indication to proceed with further evaluations. Delay in referral for such testing may delay early diagnosis and treatment and affect the long-term outcome".
+
+The United States Preventive Services Task Force in 2016 found it was unclear if screening was beneficial or harmful among children in whom there is no concerns. The Japanese practice is to screen all children for ASD at 18 and 24 months, using autism-specific formal screening tests. In contrast, in the UK, children whose families or doctors recognize possible signs of autism are screened. It is not known which approach is more effective. Screening tools include the Modified Checklist for Autism in Toddlers (M-CHAT), the Early Screening of Autistic Traits Questionnaire, and the First Year Inventory; initial data on M-CHAT and its predecessor, the Checklist for Autism in Toddlers (CHAT), on children aged 18–30 months suggests that it is best used in a clinical setting and that it has low sensitivity (many false-negatives) but good specificity (few false-positives). It may be more accurate to precede these tests with a broadband screener that does not distinguish ASD from other developmental disorders. Screening tools designed for one culture's norms for behaviors like eye contact may be inappropriate for a different culture. Although genetic screening for autism is generally still impractical, it can be considered in some cases, such as children with neurological symptoms and dysmorphic features.
+
+While infection with rubella during pregnancy causes fewer than 1% of cases of autism, vaccination against rubella can prevent many of those cases.
+
+The main goals when treating children with autism are to lessen associated deficits and family distress, and to increase quality of life and functional independence. In general, higher IQs are correlated with greater responsiveness to treatment and improved treatment outcomes. No single treatment is best and treatment is typically tailored to the child's needs. Families and the educational system are the main resources for treatment. Services should be carried out by behavior analysts, special education teachers, speech pathologists, and licensed psychologists. Studies of interventions have methodological problems that prevent definitive conclusions about efficacy. However, the development of evidence-based interventions has advanced in recent years. Although many psychosocial interventions have some positive evidence, suggesting that some form of treatment is preferable to no treatment, the methodological quality of systematic reviews of these studies has generally been poor, their clinical results are mostly tentative, and there is little evidence for the relative effectiveness of treatment options. Intensive, sustained special education programs and behavior therapy early in life can help children acquire self-care, communication, and job skills, and often improve functioning and decrease symptom severity and maladaptive behaviors; claims that intervention by around age three years is crucial are not substantiated. While medications have not been found to help with core symptoms, they may be used for associated symptoms, such as irritability, inattention, or repetitive behavior patterns.
+
+Educational interventions often used include applied behavior analysis (ABA), developmental models, structured teaching, speech and language therapy, social skills therapy, and occupational therapy. Among these approaches, interventions either treat autistic features comprehensively, or focalize treatment on a specific area of deficit. The quality of research for early intensive behavioral intervention (EIBI)—a treatment procedure incorporating over thirty hours per week of the structured type of ABA that is carried out with very young children—is currently low, and more vigorous research designs with larger sample sizes are needed. Two theoretical frameworks outlined for early childhood intervention include structured and naturalistic ABA interventions, and developmental social pragmatic models (DSP). One interventional strategy utilizes a parent training model, which teaches parents how to implement various ABA and DSP techniques, allowing for parents to disseminate interventions themselves. Various DSP programs have been developed to explicitly deliver intervention systems through at-home parent implementation. Despite the recent development of parent training models, these interventions have demonstrated effectiveness in numerous studies, being evaluated as a probable efficacious mode of treatment.
+
+Early, intensive ABA therapy has demonstrated effectiveness in enhancing communication and adaptive functioning in preschool children; it is also well-established for improving the intellectual performance of that age group. Similarly, a teacher-implemented intervention that utilizes a more naturalistic form of ABA combined with a developmental social pragmatic approach has been found to be beneficial in improving social-communication skills in young children, although there is less evidence in its treatment of global symptoms. Neuropsychological reports are often poorly communicated to educators, resulting in a gap between what a report recommends and what education is provided. It is not known whether treatment programs for children lead to significant improvements after the children grow up, and the limited research on the effectiveness of adult residential programs shows mixed results. The appropriateness of including children with varying severity of autism spectrum disorders in the general education population is a subject of current debate among educators and researchers.
+
+Medications may be used to treat ASD symptoms that interfere with integrating a child into home or school when behavioral treatment fails. They may also be used for associated health problems, such as ADHD or anxiety. More than half of US children diagnosed with ASD are prescribed psychoactive drugs or anticonvulsants, with the most common drug classes being antidepressants, stimulants, and antipsychotics. The atypical antipsychotic drugs risperidone and aripiprazole are FDA-approved for treating associated aggressive and self-injurious behaviors. However, their side effects must be weighed against their potential benefits, and people with autism may respond atypically. Side effects, for example, may include weight gain, tiredness, drooling, and aggression. SSRI antidepressants, such as fluoxetine and fluvoxamine, have been shown to be effective in reducing repetitive and ritualistic behaviors, while the stimulant medication methylphenidate is beneficial for some children with co-morbid inattentiveness or hyperactivity. There is scant reliable research about the effectiveness or safety of drug treatments for adolescents and adults with ASD. No known medication relieves autism's core symptoms of social and communication impairments. Experiments in mice have reversed or reduced some symptoms related to autism by replacing or modulating gene function, suggesting the possibility of targeting therapies to specific rare mutations known to cause autism.
+
+Although many alternative therapies and interventions are available, few are supported by scientific studies. Treatment approaches have little empirical support in quality-of-life contexts, and many programs focus on success measures that lack predictive validity and real-world relevance. Some alternative treatments may place the child at risk. The preference that children with autism have for unconventional foods can lead to reduction in bone cortical thickness with this being greater in those on casein-free diets, as a consequence of the low intake of calcium and vitamin D; however, suboptimal bone development in ASD has also been associated with lack of exercise and gastrointestinal disorders. In 2005, botched chelation therapy killed a five-year-old child with autism. Chelation is not recommended for people with ASD since the associated risks outweigh any potential benefits. Another alternative medicine practice with no evidence is CEASE therapy, a mixture of homeopathy, supplements, and 'vaccine detoxing'.
+
+Although popularly used as an alternative treatment for people with autism, as of 2018 there is no good evidence to recommend a gluten- and casein-free diet as a standard treatment. A 2018 review concluded that it may be a therapeutic option for specific groups of children with autism, such as those with known food intolerances or allergies, or with food intolerance markers. The authors analyzed the prospective trials conducted to date that studied the efficacy of the gluten- and casein-free diet in children with ASD (4 in total). All of them compared gluten- and casein-free diet versus normal diet with a control group (2 double-blind randomized controlled trials, 1 double-blind crossover trial, 1 single-blind trial). In two of the studies, whose duration was 12 and 24 months, a significant improvement in ASD symptoms (efficacy rate 50%) was identified. In the other two studies, whose duration was 3 months, no significant effect was observed. The authors concluded that a longer duration of the diet may be necessary to achieve the improvement of the ASD symptoms. Other problems documented in the trials carried out include transgressions of the diet, small sample size, the heterogeneity of the participants and the possibility of a placebo effect.
+
+In the subset of people who have gluten sensitivity there is limited evidence that suggests that a gluten-free diet may improve some autistic behaviors.
+
+There is tentative evidence that music therapy may improve social interactions, verbal communication, and non-verbal communication skills. There has been early research looking at hyperbaric treatments in children with autism. Studies on pet therapy have shown positive effects.
+
+There is no known cure. The degree of symptoms can decrease, occasionally to the extent that people lose their diagnosis of ASD; this occurs sometimes after intensive treatment and sometimes not. It is not known how often recovery happens; reported rates in unselected samples have ranged from 3% to 25%. Most children with autism acquire language by age five or younger, though a few have developed communication skills in later years. Many children with autism lack social support, future employment opportunities or self-determination. Although core difficulties tend to persist, symptoms often become less severe with age.
+
+Few high-quality studies address long-term prognosis. Some adults show modest improvement in communication skills, but a few decline; no study has focused on autism after midlife. Acquiring language before age six, having an IQ above 50, and having a marketable skill all predict better outcomes; independent living is unlikely with severe autism.
+
+Many individuals with autism face significant obstacles in transitioning to adulthood. Compared to the general population individuals with autism are more likely to be unemployed and to have never had a job. About half of people in their 20s with autism are not employed.
+
+Most recent reviews tend to estimate a prevalence of 1–2 per 1,000 for autism and close to 6 per 1,000 for ASD as of 2007. A 2016 survey in the United States reported a rate of 25 per 1,000 children for ASD. Globally, autism affects an estimated 24.8 million people , while Asperger syndrome affects a further 37.2 million. In 2012, the NHS estimated that the overall prevalence of autism among adults aged 18 years and over in the UK was 1.1%. Rates of PDD-NOS's has been estimated at 3.7 per 1,000, Asperger syndrome at roughly 0.6 per 1,000, and childhood disintegrative disorder at 0.02 per 1,000. CDC estimates about 1 out of 59 (1.7%) for 2014, an increase from 1 out of every 68 children (1.5%) for 2010.
+
+The number of reported cases of autism increased dramatically in the 1990s and early 2000s. This increase is largely attributable to changes in diagnostic practices, referral patterns, availability of services, age at diagnosis, and public awareness, though unidentified environmental risk factors cannot be ruled out. The available evidence does not rule out the possibility that autism's true prevalence has increased; a real increase would suggest directing more attention and funding toward changing environmental factors instead of continuing to focus on genetics.
+
+Boys are at higher risk for ASD than girls. The sex ratio averages 4.3:1 and is greatly modified by cognitive impairment: it may be close to 2:1 with intellectual disability and more than 5.5:1 without. Several theories about the higher prevalence in males have been investigated, but the cause of the difference is unconfirmed; one theory is that females are underdiagnosed.
+
+Although the evidence does not implicate any single pregnancy-related risk factor as a cause of autism, the risk of autism is associated with advanced age in either parent, and with diabetes, bleeding, and use of psychiatric drugs in the mother during pregnancy. The risk is greater with older fathers than with older mothers; two potential explanations are the known increase in mutation burden in older sperm, and the hypothesis that men marry later if they carry genetic liability and show some signs of autism. Most professionals believe that race, ethnicity, and socioeconomic background do not affect the occurrence of autism.
+
+Several other conditions are common in children with autism. They include:
+
+A few examples of autistic symptoms and treatments were described long before autism was named. The "Table Talk" of Martin Luther, compiled by his notetaker, Mathesius, contains the story of a 12-year-old boy who may have been severely autistic. Luther reportedly thought the boy was a soulless mass of flesh possessed by the devil, and suggested that he be suffocated, although a later critic has cast doubt on the veracity of this report. The earliest well-documented case of autism is that of Hugh Blair of Borgue, as detailed in a 1747 court case in which his brother successfully petitioned to annul Blair's marriage to gain Blair's inheritance. The Wild Boy of Aveyron, a feral child caught in 1798, showed several signs of autism; the medical student Jean Itard treated him with a behavioral program designed to help him form social attachments and to induce speech via imitation.
+
+The New Latin word "autismus" (English translation "autism") was coined by the Swiss psychiatrist Eugen Bleuler in 1910 as he was defining symptoms of schizophrenia. He derived it from the Greek word "autós" (αὐτός, meaning "self"), and used it to mean morbid self-admiration, referring to "autistic withdrawal of the patient to his fantasies, against which any influence from outside becomes an intolerable disturbance". A Soviet child psychiatrist, Grunya Sukhareva, described a similar syndrome that was published in Russian in 1925, and in German in 1926.
+
+The word "autism" first took its modern sense in 1938 when Hans Asperger of the Vienna University Hospital adopted Bleuler's terminology "autistic psychopaths" in a lecture in German about child psychology. Asperger was investigating an ASD now known as Asperger syndrome, though for various reasons it was not widely recognized as a separate diagnosis until 1981. Leo Kanner of the Johns Hopkins Hospital first used "autism" in its modern sense in English when he introduced the label "early infantile autism" in a 1943 report of 11 children with striking behavioral similarities. Almost all the characteristics described in Kanner's first paper on the subject, notably "autistic aloneness" and "insistence on sameness", are still regarded as typical of the autistic spectrum of disorders. It is not known whether Kanner derived the term independently of Asperger.
+
+Donald Triplett was the first person diagnosed with autism. He was diagnosed by Kanner after being first examined in 1938, and was labeled as "case 1". Triplett was noted for his savant abilities, particularly being able to name musical notes played on a piano and to mentally multiply numbers. His father, Oliver, described him as socially withdrawn but interested in number patterns, music notes, letters of the alphabet, and U.S. president pictures. By the age of 2, he had the ability to recite the 23rd Psalm and memorized 25 questions and answers from the Presbyterian catechism. He was also interested in creating musical chords.
+
+Kanner's reuse of "autism" led to decades of confused terminology like "infantile schizophrenia", and child psychiatry's focus on maternal deprivation led to misconceptions of autism as an infant's response to "refrigerator mothers". Starting in the late 1960s autism was established as a separate syndrome.
+
+As late as the mid-1970s there was little evidence of a genetic role in autism; while in 2007 it was believed to be one of the most heritable psychiatric conditions. Although the rise of parent organizations and the destigmatization of childhood ASD have affected how ASD is viewed, parents continue to feel social stigma in situations where their child's autistic behavior is perceived negatively, and many primary care physicians and medical specialists express some beliefs consistent with outdated autism research.
+
+It took until 1980 for the DSM-III to differentiate autism from childhood schizophrenia. In 1987, the DSM-III-R provided a checklist for diagnosing autism. In May 2013, the DSM-5 was released, updating the classification for pervasive developmental disorders. The grouping of disorders, including PDD-NOS, autism, Asperger syndrome, Rett syndrome, and CDD, has been removed and replaced with the general term of Autism Spectrum Disorders. The two categories that exist are impaired social communication and/or interaction, and restricted and/or repetitive behaviors.
+
+The Internet has helped autistic individuals bypass nonverbal cues and emotional sharing that they find difficult to deal with, and has given them a way to form online communities and work remotely. Societal and cultural aspects of autism have developed: some in the community seek a cure, while others believe that autism is simply another way of being.
+
+An autistic culture has emerged, accompanied by the autistic rights and neurodiversity movements. Events include World Autism Awareness Day, Autism Sunday, Autistic Pride Day, Autreat, and others. Organizations dedicated to promoting awareness of autism include Autistic Self Advocacy Network, Aspies For Freedom, Autism National Committee, and Autism Society of America. At the same time, some organizations, including Autism Speaks, have been condemned by disability rights organizations for failing to support autistic people. Social-science scholars study those with autism in hopes to learn more about "autism as a culture, transcultural comparisons... and research on social movements." While most autistic individuals do not have savant skills, many have been successful in their fields.
+
+The autism rights movement is a social movement within the context of disability rights that emphasizes the concept of neurodiversity, viewing the autism spectrum as a result of natural variations in the human brain rather than a disorder to be cured. The autism rights movement advocates for including greater acceptance of autistic behaviors; therapies that focus on coping skills rather than on imitating the behaviors of those without autism, and the recognition of the autistic community as a minority group. Autism rights or neurodiversity advocates believe that the autism spectrum is genetic and should be accepted as a natural expression of the human genome. This perspective is distinct from two other likewise distinct views: the medical perspective, that autism is caused by a genetic defect and should be addressed by targeting the autism gene(s), and fringe theories that autism is caused by environmental factors such as vaccines. A common criticism against autistic activists is that the majority of them are "high-functioning" or have Asperger syndrome and do not represent the views of "low-functioning" autistic people. 
+
+About half of autistics are unemployed, and one third of those with graduate degrees may be unemployed. Among autistics who find work, most are employed in sheltered settings working for wages below the national minimum. While employers state hiring concerns about productivity and supervision, experienced employers of autistics give positive reports of above average memory and detail orientation as well as a high regard for rules and procedure in autistic employees. A majority of the economic burden of autism is caused by decreased earnings in the job market. Some studies also find decreased earning among parents who care for autistic children.
+
+
+</doc>
--- a/tests/fixtures/tests_samples/wmt16/sample.json
+++ b/tests/fixtures/tests_samples/wmt16/sample.json
@@ -0,0 +1,10 @@
+{"translation": {"en": "Membership of Parliament: see Minutes", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal"}}
+{"translation": {"en": "Approval of Minutes of previous sitting: see Minutes", "ro": "Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal"}}
+{"translation": {"en": "Membership of Parliament: see Minutes", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal"}}
+{"translation": {"en": "Verification of credentials: see Minutes", "ro": "Verificarea prerogativelor: a se vedea procesul-verbal"}}
+{"translation": {"en": "Documents received: see Minutes", "ro": "Depunere de documente: a se vedea procesul-verbal"}}
+{"translation": {"en": "Written statements and oral questions (tabling): see Minutes", "ro": "Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal"}}
+{"translation": {"en": "Petitions: see Minutes", "ro": "Petiţii: a se vedea procesul-verbal"}}
+{"translation": {"en": "Texts of agreements forwarded by the Council: see Minutes", "ro": "Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal"}}
+{"translation": {"en": "Action taken on Parliament's resolutions: see Minutes", "ro": "Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal"}}
+{"translation": {"en": "Agenda for next sitting: see Minutes", "ro": "Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal"}}
--- a/tests/fixtures/tests_samples/wmt_en_ro/test.json
+++ b/tests/fixtures/tests_samples/wmt_en_ro/test.json
@@ -0,0 +1,20 @@
+{ "translation": { "en": "UN Chief Says There Is No Military Solution in Syria Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that \"there is no military solution\" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people. The U.N. chief again urged all parties, including the divided U.N. Security Council, to unite and support inclusive negotiations to find a political solution. Ban told a news conference Wednesday that he plans to meet with foreign ministers of the five permanent council nations - the U.S., Russia, China, Britain and France - on the sidelines of the General Assembly's ministerial session later this month to discuss Syria.", "ro": "Șeful ONU declară că nu există soluții militare în Siria Secretarul General Ban Ki-moon afirmă că răspunsul său la suportul militar al Rusiei pentru Siria este că „nu există o soluție militară” la conflictul care durează de aproape cinci ani iar mai multe arme nu ar face decât să agraveze violența și suferința a milioane de oameni. Șeful ONU a solicitat din nou tuturor părților, inclusiv Consiliului de securitate ONU divizat să se unifice și să susțină negocierile pentru a găsi o soluție politică. Ban a declarat miercuri în cadrul unei conferințe că intenționează să se întâlnească luna aceasta cu miniștrii de externe din cinci țări permanent prezente în consiliu - SUA, Rusia, China, Anglia și Franța - pe marginea sesiunii ministeriale a Adunării Generale pentru a discuta despre Siria." } }
+{ "translation": { "en": "He expressed regret that divisions in the council and among the Syrian people and regional powers \"made this situation unsolvable.\" Ban urged the five permanent members to show the solidarity and unity they did in achieving an Iran nuclear deal in addressing the Syria crisis. 8 Poll Numbers That Show Donald Trump Is For Real Some have tried to label him a flip-flopper. Others have dismissed him as a joke. And some are holding out for an implosion. But no matter how some Republicans are trying to drag Donald Trump down from atop the polls, it hasn't worked (yet).", "ro": "Ban și-a exprimat regretul că divizările în consiliu și între poporul sirian și puterile regionale „au făcut această situație de nerezolvat”. Ban le-a cerut celor cinci membri permanenți să dea dovadă de solidaritatea și unitatea arătate atunci când au reușit să încheie un acord referitor la armele nucleare ale Iranului, abordând astfel criza din Siria. 8 cifre din sondaje care arată că Donald Trump are șanse reale Unii au încercat să îl eticheteze ca politician „flip-flop”. Alții l-au numit o glumă. Iar alții așteaptă implozia. Însă indiferent de modul în care unii republicani încearcă să îl dărâme pe Donald Trump din vârful sondajelor, nu a funcționat (încă)." } }
+{ "translation": { "en": "Ten of the last 11 national polls have shown Donald Trump's lead at double digits, and some are starting to ask seriously what it means for the real estate mogul's nomination chances. Of course, it's still early in the election cycle. None of this is to say that Trump is likely to win the Republican nomination. Pundits point out that at this time in 2011, Rick Perry's lead was giving way to a rising Herman Cain, neither of whom won even one state in the nomination process. And there are many reasons he would struggle in a general election. But outside groups like Jeb Bush's Super PAC and the economic conservative group Club for Growth are recognizing Trump's staying power and beginning to unload their dollars to topple him.", "ro": "Zece din ultimele 11 sondaje naționale au arătat că Donald Trump conduce cu un procent din două cifre iar unele voci încep să se întrebe serios ce înseamnă acest lucru pentru șansele de numire ale mogulului imobiliar. Desigur, este încă prematur. Nimic din toate acestea nu spune că Trump va câștiga cursa pentru nominalizarea republicanilor. Pundits arată că, în aceeași perioadă a anului 2011, avansul lui Rick Perry îi făcea loc lui Herman Cain în sondaje, dar niciunul dintre ei nu a câștigat în vreun stat în cursa de nominalizare. Iar motivele pentru care s-ar lupta din greu la alegerile generale sunt numeroase. Însă grupurile din exterior precum Super PAC al lui Jeb Bush și grupul conservator economic Club for Growth admit puterea lui Trump și încep să îl susțină cu bani." } }
+{ "translation": { "en": "Here are some recent poll numbers that suggest that the real estate mogul isn't just a passing phase: Trump's favorability ratings have turned 180 degrees. Right before Donald Trump announced his candidacy in mid-June, a Monmouth University poll showed only two in 10 Republicans had a positive view of the real estate mogul. By mid-July, it was 40 percent. In early August, it was 52 percent. Now, six in 10 Republicans have a favorable view of Donald Trump. Roughly three in 10 say they have a negative view. And these numbers hold up in early states. A Quinnipiac poll in Iowa last week found that 60 percent of Republicans there had a favorable view of Trump.", "ro": "În continuare vă prezentăm câteva cifre din sondaje recente care sugerează că mogulul imobiliar nu este doar ceva trecător: Cifrele care indică susținerea față de Trump s-au întors la 180 grade. Chiar înainte ca Donald Trump să își anunțe candidatura, la mijlocul lui iunie, un sondaj realizat de Universitatea din Monmouth arăta că doar doi din 10 republicani aveau o părere pozitivă despre mogulul imobiliar. Până la mijlocul lui iulie, procentul a urcat la 40%. La începutul lui august, era 52%. În prezent, șase din 10 republicani au o părere favorabilă despre Donald Trump. Aproximativ trei din 10 declară că au o părere negativă. Aceste cifre se mențin. Un sondaj realizat săptămâna trecută de Quinnipiac în Iowa a concluzionat că 60% dintre republicanii din regiune au o părere favorabilă despre Trump." } }
+{ "translation": { "en": "Two-thirds of GOP voters would be happy with Trump as the nominee. In a CNN/ORC poll last week, 67 percent of Republicans said they would be either \"enthusiastic\" or \"satisfied\" if Trump were the nominee. Only two in 10 say they would be \"upset\" if he were the nominee. Only Ben Carson generates roughly the same level of enthusiasm as Trump (43 percent say they would be \"enthusiastic\" vs. 40 percent who say the same of Trump). The next closest in enthusiasm? Marco Rubio with only 21 percent.", "ro": "Două treimi dintre alegătorii GOP ar fi fericiți dacă Trump ar câștiga cursa pentru nominalizare. Într-un sondaj realizat săptămâna trecută de CNN/ORC, 67% dintre republicani au declarat că ar fi „entuziasmați” sau „mulțumiți” dacă Trump ar câștiga cursa pentru nominalizare. Doar doi din 10 declară că ar fi „supărați” dacă Trump ar câștiga cursa pentru nominalizare. Doar Ben Carson generează aproximativ același nivel de entuziasm ca Trump (43% declară că ar fi „entuziasmați” față de 40% care declară același lucru despre Trump). Cel mai aproape în ceea ce privește entuziasmul? Marco Rubio, cu doar 21%." } }
+{ "translation": { "en": "On the flip side, 47 percent of Republican voters say they would be \"dissatisfied\" or \"upset\" if establishment favorite Jeb Bush becomes the nominee. A majority of Republicans don't see Trump's temperament as a problem. While Donald Trump has been widely criticized for his bombast and insults, 52 percent of leaned Republican voters nationwide think that the real estate mogul has the right temperament to be president, according to Monday's ABC News/Washington Post poll. The same number holds in the first-in-the-nation caucus state of Iowa, where the same 52 percent of Republicans think he has the personality to be commander in chief, according to Quinnipiac last week.", "ro": "De partea cealaltă, 47% dintre alegătorii republicani afirmă că ar fi „nemulțumiți” sau „supărați” dacă favoritul Jeb Bush câștigă cursa pentru nominalizare. Majoritatea republicanilor nu consideră temperamentul lui Trump o problemă. Deși Donald Trump a fost puternic criticat pentru insultele aduse și stilul său bombastic, 52% dintre alegătorii republicani la nivel național consideră că mogulul imobiliar are temperamentul potrivit pentru a fi președinte, conform sondajului realizat luni de ABC News/Washington Post. Regăsim aceleași cifre în statul Iowa, unde tot 52% dintre republicani cred că Trump are personalitatea potrivită pentru a fi conducător, conform sondajului realizat săptămâna trecută de Quinnipiac." } }
+{ "translation": { "en": "Still, 44 percent think he doesn't have the personality to serve effectively, and almost six in 10 independents say his temperament does not belong in the White House, according to ABC/Post. Republican voters are getting used to the idea. When they put on their pundit hats, Republican voters think Trump is for real. When asked who is most likely to win the GOP nomination, four in 10 said Trump was the best bet, according to a CNN/ORC poll out last week. That's a change from when four in 10 placed their money on Jeb Bush in late July. Full disclosure: GOP voters haven't had the clearest crystal ball in the past.", "ro": "Totuși, 44% sunt de părere că nu are personalitatea necesară pentru a acționa eficient și aproape șase din 10 independenți afirmă că temperamentul său nu are ce căuta la Casa Albă, conform ABC/Post. Alegătorii republicani se obișnuiesc cu ideea. Atunci când iau atitudinea de intelectuali, alegătorii republicani consideră că Trump este autentic. Conform unui sondaj realizat săptămâna trecută de CNN/ORC, la întrebarea cine are cele mai multe șanse să câștige cursa pentru nominalizare GOP, patru din 10 au declarat că Trump. Situația s-a schimbat față de finalul lui iulie, când patru din 10 ar fi pariat pe Jeb Bush. Informare completă: în trecut, alegătorii GOP nu au citit foarte bine viitorul." } }
+{ "translation": { "en": "At this time last cycle, four in 10 Republicans picked Rick Perry to win the nomination, vs. only 28 percent for eventual nominee Mitt Romney. Still, it shows that a plurality of GOP voters see Trump's campaign as plausible. Even if Republicans rallied around another candidate, Trump still beats almost everyone. Some pundits point out that the splintered field is likely contributing to Trump's lead, while anti-Trump support is be spread diffusely among more than a dozen other candidates. But a Monmouth University poll in early September shows that, in a hypothetical head-to-head matchup between Trump and most other Republican candidates, Trump almost always garners majority support.", "ro": "În aceeași perioadă a ultimelor alegeri, patru din 10 republicani l-au ales pe Rick Perry în cursa pentru nominalizare, față de doar 28% pentru Mitt Romney. Însă, aceste cifre arată că majoritatea alegătorilor GOP consideră plauzibilă campania lui Trump. Chiar dacă republicanii sau repliat spre un alt candidat. Trump încă se află în fruntea tuturor. Unele voci spun că situația divizată va contribui probabil la victoria lui Trump, în timp ce susținerea contra lui Trump se va împărți la mai mult de doisprezece candidați. Însă un sondaj derulat la începutul lui septembrie de Universitatea din Monmouth arată că, în situația ipotetică a unei colaborări între Trump și majoritatea celorlalți candidați republicani, aproape întotdeauna Trump va beneficia de susținerea majoritară." } }
+{ "translation": { "en": "He leads Carly Fiorina by 13 points, Marco Rubio by 14 points, Walker by 15 points, Jeb Bush by 19 points, and, finally, Rand Paul, John Kasich and Chris Christie by 33 points each. He's in a dead heat with Ted Cruz. The only candidate who beats him? Ben Carson would lead the businessman by a wide 19 points in a hypothetical head-to-head. A bare majority of Donald Trump's supporters say they've made up their minds. A new CBS/NYT poll out on Tuesday shows that just more than half of voters who support Trump say they have locked in their votes. Obviously, a lot can happen to change that, and no one can really say they would never change their mind.", "ro": "Trump se află la distanță de 13 puncte de Carly Fiorina, la 14 puncte de Marco Rubio, la 15 puncte de Walker, la 19 puncte de Jeb Bush și, în cele din urmă, la câte 33 de puncte față de Rand Paul, John Kasich și Chris Christie. Este aproape la egalitate cu Ted Cruz. Singurul candidat care îl învinge? Ben Carson l-ar învinge pe omul de afaceri cu 19 puncte într-o confruntare ipotetică de unu la unu. Majoritatea susținătorilor lui Donald Trump declară că s-au decis. Un nou sondaj realizat marți de CBS/NYT arată că peste jumătate dintre alegătorii care îl susțin pe Trump declară că nu își schimbă opțiunea de vot. Evident, se pot întâmpla multe în acest sens și nimeni nu poate spune că aceștia nu se vor răzgândi niciodată." } }
+{ "translation": { "en": "46 percent said they are leaving the door open to switching candidates. Still, Trump's strongest competition at the moment is from fellow outsider neurosurgeon Ben Carson, but voters who say they have made up their minds are twice as likely to go for Trump. Six in 10 Republicans say they agree with Trump on immigration. Even since Donald Trump called immigrants from Mexico \"rapists\" in his campaign announcement speech two months ago, immigration has been front and center in the 2016 conversation. Some are worried that Trump's bombast will drive crucial Hispanic voters away from the Republican Party and damage rebranding efforts.", "ro": "46% afirmă că lasă portița deschisă posibilității de a-și schimba opțiunea. Cu toate acestea, cel mai important adversar al lui Trump este în prezent neurochirurgul Ben Carson, însă este de două ori mai probabil ca alegătorii care declară că s-au decis să voteze cu Trump. Șase din 10 republicani afirmă că sunt de acord cu Trump în problema imigrării. De când Donald Trump i-a numit pe imigranții din Mexic „violatori” în discursul de deschidere a campaniei sale, în urmă cu două luni, imigrarea a fost subiectul central în campania pentru 2016. Unii sunt îngrijorați că stilul bombastic al lui Trump va duce la o scindare între alegătorii hispanici importanți și Partidul Republican și va prejudicia eforturile de rebranding." } }
+{ "translation": { "en": "But according to Monday's new ABC/Post poll, six in 10 Republicans say they agree with Trump on immigration issues. So as long as immigration remains in the spotlight, it seems Donald Trump will remain too. Frustration with government is climbing to new highs. Donald Trump and Ben Carson now account for roughly half of the support from Republican voters, largely due to their outsider status. Six in 10 Republicans in Monday's new ABC/Post poll say they want a political outsider over someone with government experience. And they are angry at Washington, too.", "ro": "Însă, conform sondajului realizat luni de ABC/Post, șase din 10 republicani afirmă că sunt de acord cu Trump în problema imigrării. Așa că, se pare că atâta timp cât problema imigrării rămâne în lumina reflectoarelor, la fel va rămâne și Doland Trump. Frustrarea față de autorități atinge noi culmi. Donald Trump și Ben Carson sunt acum susținuți de aproape jumătate dintre alegătorii republicani, în mare parte datorită statutului lor de outsideri. Conform sondajului realizat luni de ABC/Post, șase din 10 republicani afirmă că preferă un outsider politic în detrimentul cuiva cu experiență în guvernare. Oamenii sunt de asemenea supărați pe autoritățile de la Washington." } }
+{ "translation": { "en": "A Des Moines Register/Bloomberg poll in Iowa from two weeks ago shows that three in four Iowa Republicans are frustrated with Republicans in Congress, with 54 percent \"unsatisfied\" and 21 percent \"mad as hell.\" Jeremy Corbyn to make debut at Prime Minister's Questions Since his election, Mr Corbyn's debut at PMQs has been keenly awaited New Labour leader Jeremy Corbyn is to make his debut at Prime Minister's Questions later, taking on David Cameron for the first time.", "ro": "Un sondaj derulat în urmă cu două săptămâni în Iowa de către Des Moines Register/Bloomberg arată că trei din patru republicani din Iowa sunt frustrați de prestația republicanilor din COngres, 54% declarându-se „nemulțumiți” iar 21% „nervoși la culme”. Jeremy Corbyn își face debutul la Prime Minister's Questions Încă de la alegerea sa, debutul domnului Corbyn la PMQs a fost îndelung așteptat Noul lider al Partidului Laburist, Jeremy Corbyn, își va face mai târziu debutul la Prime Minister's Questions, confruntându-se pentru prima dată cu David Cameron." } }
+{ "translation": { "en": "Mr Corbyn will rise to ask the first of his six allotted questions shortly after midday, with his performance likely to be closely scrutinised by the media and Labour MPs. He has called for \"less theatre and more facts\" at the weekly showpiece. He has also said he could skip some sessions, leaving them to colleagues. The encounter will be the first parliamentary test of Mr Corbyn's leadership, coming after his appointment of a shadow cabinet and his speech to the TUC annual congress on Tuesday.", "ro": "Dl Corbyn va adresa primele dintre cele șase întrebări la care are dreptul la scurt timp după prânz; prestația sa va fi probabil analizată îndeaproape de mass-media și parlamentarii laburiști. În cadrul aparițiilor săptămânale, el a cerut „mai puțin teatru și mai multe fapte”. A declarat de asemenea că poate renunța la câteva participări și că le cedează colegilor săi. Confruntarea va fi primul test parlamentar al Dl Corbyn în poziție de lider, venind după ce a numit un „cabinet fantomă” și după discursul pe care l-a ținut marți la congresul anual TUC." } }
+{ "translation": { "en": "Meanwhile, the Labour leader's decision to stand in silence during the singing of the national anthem at a service on Tuesday to mark the 75th anniversary of the Battle of Britain has attracted criticism from a number of Tory MPs and is the focus of several front page stories in the newspapers. Mr Corbyn's decision not to sing the national anthem has attracted attention A spokesman for Mr Corbyn said he had \"stood in respectful silence\" and did recognise the \"heroism of the Royal Air Force in the Battle of Britain.\"", "ro": "Între timp, decizia liderului Partidului laburist de a păstra tăcerea la rostirea imnului național în cadrul unei slujbe ținute marți cu ocazia aniversării a 75 de ani de la Bătălia Angliei a atras critici din partea unor parlamentari conservatori și a ținut prima pagină a ziarelor. Decizia domnului Corbyn de a nu cânta imnul național a atras atenția Un purtător de cuvânt al Dl Corbyn a declarat că acesta „a păstrat tăcerea în mod respectuos” și a recunoscut „eroismul Forțelor aeriene britanice în Bătălia Angliei.”" } }
+{ "translation": { "en": "But a member of Mr Corbyn's shadow cabinet, Owen Smith, told BBC Two's Newsnight programme he would have advised the Labour leader to sing the national anthem \"irrespective\" of his belief that the monarchy should be abolished. Nearly a dozen shadow ministers have refused to serve in Mr Corbyn's top team, citing differences over the economy, defence and foreign affairs, while less than a sixth of the parliamentary party originally backed him as leader. BBC political correspondent Robin Brant says policy differences are also \"stacking up\" within Labour following Mr Corbyn's appointment over its position on the European Union and the government's cap on benefits.", "ro": "Însă un membru al cabinetului fantomă al Dl Corbyn, Owen Smith, a declarat pentru emisiunea Two's Newsnight transmisă de BBC că i-ar fi recomandat liderului laburist să cânte imnul național „indiferent” de credința sa că monarhia ar trebui abolită. În jur de doisprezece miniștri din cabinetul fantomă au refuzat să facă parte din echipa de frunte a Dl Corbyn, argumentând prin diferențe de opinie legate de economie, apărare și externe, în timp ce mai puțin de o șesime din partidul parlamentar l-a susținut ca lider. Corespondentul politic al BBC, Robin Brant, declară că diferențele de politică „se cumulează” în Partidul Laburist după numirea domnului Corbyn referitor la poziția sa față de Uniunea Europeană și limita de beneficii." } }
+{ "translation": { "en": "Mr Corbyn told the TUC conference Labour was putting forward amendments to remove the whole idea of a cap altogether. Hours later Mr Smith, the shadow work and pensions secretary, said the party was \"very clear\" that it was only opposing government plans to reduce the level of cap from £26,000 to £23,000. Mr Corbyn will be the fifth Labour leader that David Cameron has faced across the despatch box over the past decade since he became Tory leader. The Labour leader, who has promised a different approach to politics, says he has \"crowd sourced\" ideas for questions to ask Mr Cameron and has been given more than 30,000 suggestions.", "ro": "Dl Corbyn a declarat la conferința TUC că Partidul Laburist va aduce modificări prin care se va elimina integral ideea limitării. Câteva ore mai târziu, Dl Smith, Ministrul Muncii și Pensiilor, a declarat că partidul „este foarte clar” în opoziția exclusivă față de planurile guvernului de a reduce nivelul „cap” de la 26.000 lire la 23.000 lire. Dl Corbyn va fi al cincilea lider laburist cu care se confruntă David Cameron la tribună în ultimul deceniu, de când a preluat conducerea Partidului Conservator. Liderul laburist, care a promis o abordare diferită a politicii, spune că are idei „din surse externe” pentru întrebări pe care să i le adreseze Domnului Cameron și că a primit peste 30.000 de sugestii." } }
+{ "translation": { "en": "The Islington North MP has said PMQs is too confrontational and that he will refrain from both \"repartee\" and trading barbs, instead vowing to focus on serious issues such as poverty, inequality and the challenges facing young people. Mr Corbyn has said that Angela Eagle, the shadow business secretary, will deputise for him at PMQs when he does not attend - for instance when Mr Cameron is travelling abroad. He has also floated the idea of allowing other colleagues to take the floor on occasion, saying he had approached the Commons Speaker John Bercow to discuss the issue.", "ro": "Parlamentarul Islington North a afirmat că PMQs implică un nivel de confruntare prea înalt și că se va abține de la replici și atacuri, angajându-se să se concentreze în schimb pe probleme serioase precum sărăcia, inegalitatea și provocările cu care se confruntă tinerii. Dl Corbyn a declarat că Angela Eagle, Ministrul de finanțe, îi va ține locul la PMQs atunci când el nu poate participa - de exemplu atunci când Dl Cameron se deplasează în străinătate. A exprimat de asemenea ideea că va permite altor colegi să ia cuvântul ocazional, spunând că l-a abordat pe Președintele Camerei Deputaților, John Bercow, pentru a discuta acest aspect." } }
+{ "translation": { "en": "When he became leader in 2005, Mr Cameron said he wanted to move away from the \"Punch and Judy\" style of politics often associated with PMQs but admitted some years later that he had failed. Since it was first televised in 1990, PMQs has been seen as a key barometer of a leader's judgement, their command of the Commons and their standing among their fellow MPs although critics have argued it has become a caricature and is in need of far-reaching reforms. 'Shot in Joburg': Homeless youth trained as photographers Downtown Johannesburg is a tough place to be homeless.", "ro": "În 2005, când a preluat conducerea, Dl Cameron a declarat că dorește să renunțe la stilul politic „Punch and Judy” asociat adesea cu PMQs însă a recunoscut câțiva ani mai târziu că nu a reușit în demersul său. De la prima transmisie, în 1990, PMQs a fost considerată un barometru cheie al raționamentului unui lider, al modului în care acesta conduce Camera Deputaților și a poziției sale în rândul colegilor parlamentari, deși criticii afirmă a ca devenit o caricatură și că are nevoie de o reformare profundă. „Cadru în Joburg”: Tineri fără adăpost beneficiază de cursuri de fotografie Este dificil să fii un om fără adăpost în Johannesburg." } }
+{ "translation": { "en": "But one group of former street children have found a way to learn a skill and make a living. \"I was shot in Joburg\" is a non-profit studio that teaches homeless youngsters how to take photographs of their neighbourhood and make a profit from it. BBC News went to meet one of the project's first graduates. JD Sports boss says higher wages could hurt expansion JD Sports Executive Chairman Peter Cowgill says a higher minimum wage for UK workers could mean \"more spending power in the pockets of potential consumers.\" But that spending power is unlikely to outweigh the higher labour costs at his firm, he says.", "ro": "Însă un grup de oameni care au trăit pe străzi în copilărie au găsit un mod de a învăța o meserie și de a-și câștiga traiul. „I was shot în Joburg” este un studio non-profit care îi învață pe tinerii fără adăpost să facă fotografii ale zonelor în care trăiesc și să câștige bani din asta. BBC News s-a întâlnit cu unul dintre primii absolvenți ai proiectului. Șeful JD Sports spune că salariile mai mari ar putea dăuna extinderii Președintele JD Sports, Peter Cowgill, declară că o creștere a salariului minim în Marea Britanie ar putea însemna „o putere de cumpărare mai mare în buzunarele potențialilor consumatori.” Este însă puțin probabil ca respectiva putere de cumpărare să depășească costurile mai mari pentru forța de muncă în cadrul firmei, afirmă el." } }
+{ "translation": { "en": "The costs could hit JD Sports' expansion plans, he added, which could mean fewer extra jobs. Thanasi Kokkinakis backed by Tennis Australia president Steve Healy Thanasi Kokkinakis deserves kudos rather than criticism for his behaviour. Thanasi Kokkinakis has been the collateral damage in the recent storm around his friend Nick Kyrgios and deserves kudos rather than criticism for his own behaviour, according to Tennis Australia president Steve Healy.", "ro": "Costurile ar putea avea impact asupra planurilor de extindere ale JD Sports, a adăugat el, ceea ce ar putea însemna mai puține locuri de muncă noi. Thanasi Kokkinakis susținut de președintele Tennis Australia, Steve Healy Thanasi Kokkinakis ar merita să fie lăudat și nu criticat pentru comportamentul său. Thanasi Kokkinakis a fost victimă colaterală în „furtuna” creată în jurul prietenului său, Nick Kyrgios, iar comportamentul său merită mai degrabă cuvinte de laudă și nu critică, în opinia președintelui Tennis Australia, Steve Healy." } }
--- a/tests/fixtures/tests_samples/wmt_en_ro/train.json
+++ b/tests/fixtures/tests_samples/wmt_en_ro/train.json
@@ -0,0 +1,11 @@
+{ "translation": { "en": "Corrections to votes and voting intentions: see Minutes Assignment conferred on a Member: see Minutes Membership of committees and delegations: see Minutes Decisions concerning certain documents: see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes", "ro": "Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Misiune încredinţată unui deputat: consultaţi procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal" } }
+{ "translation": { "en": "Membership of Parliament: see Minutes Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes Verification of credentials: see Minutes Documents received: see Minutes Written statements and oral questions (tabling): see Minutes Petitions: see Minutes Texts of agreements forwarded by the Council: see Minutes Action taken on Parliament's resolutions: see Minutes Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 7.45 p.m.)", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal Verificarea prerogativelor: a se vedea procesul-verbal Depunere de documente: a se vedea procesul-verbal Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal Petiţii: a se vedea procesul-verbal Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Se levanta la sesión a las 19.45 horas)" } }
+{ "translation": { "en": "Election of Vice-Presidents of the European Parliament (deadline for submitting nominations): see Minutes (The sitting was suspended at 12.40 p.m. and resumed at 3.00 p.m.) Election of Quaestors of the European Parliament (deadline for submitting nominations): see Minutes (The sitting was suspended at 3.25 p.m. and resumed at 6.00 p.m.) Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 6.15 p.m.) Opening of the sitting (The sitting was opened at 9.35 a.m.) Documents received: see Minutes Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes", "ro": "Alegerea vicepreşedinţilor Parlamentului European (termenul de depunere a candidaturilor): consultaţi procesul-verbal (Die Sitzung wird um 12.40 Uhr unterbrochen und um 15.00 Uhr wiederaufgenommen). Alegerea chestorilor Parlamentului European (termenul de depunere a candidaturilor): consultaţi procesul-verbal (Die Sitzung wird um 15.25 Uhr unterbrochen und um 18.00 Uhr wiederaufgenommen). Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Die Sitzung wird um 18.15 Uhr geschlossen.) Deschiderea şedinţei (Die Sitzung wird um 9.35 Uhr eröffnet.) Depunerea documentelor: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal" } }
+{ "translation": { "en": "Membership of committees (deadline for tabling amendments): see Minutes (The sitting was suspended at 7 p.m. and resumed at 9 p.m.) Agenda for next sitting: see Minutes Closure of sitting (The sitting was suspended at 23.25 p.m.) Documents received: see Minutes Communication of Council common positions: see Minutes (The sitting was suspended at 11.35 a.m. and resumed for voting time at noon) Approval of Minutes of previous sitting: see Minutes Committee of Inquiry into the crisis of the Equitable Life Assurance Society (extension of mandate): see Minutes", "ro": "Componenţa comisiilor (termenul de depunere a amendamentelor): consultaţi procesul-verbal (La seduta, sospesa alle 19.00, è ripresa alle 21.00) Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Die Sitzung wird um 23.25 Uhr geschlossen.) Depunerea documentelor: a se vedea procesul-verbal Comunicarea poziţiilor comune ale Parlamentului: a se vedea procesul-verbal (La séance, suspendue à 11h35 dans l'attente de l'Heure des votes, est reprise à midi) Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Comisia de anchetă privind criza societăţii de asigurări \"Equitable Life” (prelungirea mandatului): consultaţi procesul-verbal" } }
+{ "translation": { "en": "Announcement by the President: see Minutes 1. Membership of committees (vote) 2. Amendment of the ACP-EC Partnership Agreement (vote) 4. Certification of train drivers operating locomotives and trains on the railway system in the Community (vote) 6. Law applicable to non-contractual obligations (\"ROME II\") (vote) 8. Seventh and eighth annual reports on arms exports (vote) Corrections to votes and voting intentions: see Minutes Membership of committees and delegations: see Minutes Request for waiver of parliamentary immunity: see Minutes Decisions concerning certain documents: see Minutes", "ro": "Comunicarea Preşedintelui: consultaţi procesul-verbal 1. Componenţa comisiilor (vot) 2. Modificarea Acordului de parteneriat ACP-CE (\"Acordul de la Cotonou”) (vot) 4. Certificarea mecanicilor de locomotivă care conduc locomotive şi trenuri în sistemul feroviar comunitar (vot) 6. Legea aplicabilă obligaţiilor necontractuale (\"Roma II”) (vot) 8. Al şaptelea şi al optulea raport anual privind exportul de armament (vot) Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Cerere de ridicare a imunităţii parlamentare: consultaţi procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal" } }
+{ "translation": { "en": "Written statements for entry", "ro": "Declaraţii scrise înscrise" } }
+{ "translation": { "en": "Written statements for entry in the register (Rule 116): see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes Adjournment of the session I declare the session of the European Parliament adjourned. (The sitting was closed at 1 p.m.) Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes Request for the defence of parliamentary immunity: see Minutes Appointments to committees (proposal by the Conference of Presidents): see Minutes Documents received: see Minutes Texts of agreements forwarded by the Council: see Minutes", "ro": "Declaraţii scrise înscrise în registru (articolul 116 din Regulamentul de procedură): a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal Întreruperea sesiunii Dichiaro interrotta la sessione del Parlamento europeo. (La seduta è tolta alle 13.00) Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal Cerere de apărare a imunităţii parlamentare: consultaţi procesul-verbal Numiri în comisii (propunerea Conferinţei preşedinţilor): consultaţi procesul-verbal Depunerea documentelor: a se vedea procesul-verbal Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal" } }
+{ "translation": { "en": "Action taken on Parliament's resolutions: see Minutes Oral questions and written statements (tabling): see Minutes Written statements (Rule 116): see Minutes Agenda: see Minutes 1. Appointments to parliamentary committees (vote): see Minutes Voting time Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 12 midnight) Opening of the sitting (The sitting was opened at 09.05) Documents received: see Minutes Approval of Minutes of previous sitting: see Minutes 1. Protection of passengers against displaced luggage (vote) 2.", "ro": "Continuări ale rezoluţiilor Parlamentului: consultaţi procesul-verbal Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal Declaraţii scrise (articolul 116 din Regulamentul de procedură) Ordinea de zi: a se vedea procesul-verbal 1. Numiri în comisiile parlamentare (vot): consultaţi procesul-verbal Timpul afectat votului Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (La seduta è tolta alle 24.00) Deschiderea şedinţei (The sitting was opened at 09.05) Depunerea documentelor: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal 1. Protecţia pasagerilor împotriva deplasării bagajelor (vot) 2." } }
+{ "translation": { "en": "Approval of motor vehicles with regard to the forward field of vision of the driver (vote) 3. EC-Korea Agreement on scientific and technological cooperation (vote) 4. Mainstreaming sustainability in development cooperation policies (vote) 5. Draft Amending Budget No 1/2007 (vote) 7. EC-Gabon Fisheries Partnership (vote) 10. Limitation periods in cross-border disputes involving personal injuries and fatal accidents (vote) 12. Strategy for a strengthened partnership with the Pacific Islands (vote) 13. The European private company statute (vote) That concludes the vote.", "ro": "Omologarea vehiculelor cu motor cu privire la câmpul de vizibilitate înainte al conducătorului auto (vot) 3. Acordul CE-Coreea de cooperare ştiinţifică şi tehnologică (vot) 4. Integrarea durabilităţii în politicile de cooperare pentru dezvoltare (vot) 5. Proiect de buget rectificativ nr.1/2007 (vot) 7. Acordul de parteneriat în domeniul pescuitului între Comunitatea Europeană şi Republica Gaboneză (vot) 10. Termenele de prescripţie aplicabile în cadrul litigiilor transfrontaliere cu privire la vătămările corporale şi accidentele mortale (vot) 12. Relaţiile UE cu insulele din Pacific: Strategie pentru un parteneriat consolidat (vot) 13. Statutul societăţii private europene (vot) Damit ist die Abstimmungsstunde beendet." } }
+{ "translation": { "en": "Corrections to votes and voting intentions: see Minutes Assignment conferred on a Member: see Minutes Membership of committees and delegations: see Minutes Decisions concerning certain documents: see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes", "ro": "Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Misiune încredinţată unui deputat: consultaţi procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal" } }
+{ "translation": { "en": "Written statements for entry", "ro": "Declaraţii scrise înscrise" } }
--- a/tests/fixtures/tests_samples/wmt_en_ro/val.json
+++ b/tests/fixtures/tests_samples/wmt_en_ro/val.json
@@ -0,0 +1,16 @@
+{ "translation": { "en": "Brazil's Former Presidential Chief-of-Staff to Stand Trial A federal judge on Tuesday accepted the charges filed against Brazil's former presidential chief of staff for his alleged involvement in a massive corruption scheme at state-owned oil company Petrobras. The federal prosecutor's office said Jose Dirceu will face trial on the corruption, racketeering and money laundering charges filed earlier this month. Fourteen other people will also be tried, including Joao Vaccari Neto, the former treasurer of Brazil's governing Workers' Party and Renato de Souza Duque, Petrobras' former head of corporate services.", "ro": "Fostul șef al cabinetului prezidențial brazilian este adus în fața instanței Marți, un judecător federal a acceptat acuzațiile aduse împotriva fostului șef al cabinetului prezidențial brazilian pentru presupusa implicare a acestuia într-o schemă masivă de corupție privind compania petrolieră de stat Petrobras. Biroul procurorului federal a declarat că Jose Dirceu va fi trimis în judecată pentru acuzațiile de corupție, înșelătorie și spălare de bani aduse în această lună. Alte paisprezece persoane vor fi judecate, printre acestea numărându-se Joao Vaccari Neto, fostul trezorier al Partidului Muncitorilor, aflat la putere în Brazilia, și Renato de Souza Duque, fostul președinte al serviciilor pentru întreprinderi ale Petrobras." } }
+{ "translation": { "en": "Dirceu is the most senior member of the ruling Workers' Party to be taken into custody in connection with the scheme. Dirceu served as former President Luiz Inacio Lula da Silva's chief of staff between 2003 and 2005. He was arrested early August in his home, where he already was under house arrest serving an 11-year sentence for his involvement in a cash-for-votes scheme in Congress more than 10 years ago. Prosecutors have said that Dirceu masterminded the kickback scheme at Petrobras, accepted bribes while in office and continued to receive payments from contractors after he was jailed in late 2013 for the vote-buying scandal.", "ro": "Dirceu este cel mai vechi membru al Partidului Muncitorilor aflat la guvernare luat în custodie pentru legăturile cu această schemă. Dirceu a servit ca șef de cabinet al fostului președinte Luiz Inacio Lula da Silva între 2003 și 2005. A fost arestat la începutul lui august de acasă, unde deja se afla sub arest la domiciliu, cu o pedeapsă de 11 ani pentru implicarea într-o schemă de cumpărare a voturilor în Congres cu peste 10 ani în urmă. Procurorii au declarat că Dirceu a dezvoltat schema de luare de mită de la Petrobras, a acceptat mită în timp ce se afla în funcție și a continuat să primească plăți de la antreprenori după ce a fost închis la sfârșitul lui 2013 pentru scandalul voturilor cumpărate." } }
+{ "translation": { "en": "According to prosecutors, the scheme at Petrobras involved roughly $2 billion in bribes and other illegal funds. Some of that money was allegedly funneled back to campaign coffers of the ruling party and its allies. It also allegedly included the payment of bribes to Petrobras executives in return for inflated contracts. 'Miraculous' recovery for Peshawar massacre schoolboy A teenager paralysed after being shot four times in Pakistan's deadliest terror attack has made a \"miraculous\" recovery following treatment in the UK. Muhammad Ibrahim Khan, 13, had been told by doctors in Pakistan that he would never walk again.", "ro": "Conform procurorilor, schema de la Petrobras a implicat aproximativ 2 miliarde de dolari sub formă de mită și alte fonduri ilegale. O parte din acei bani s-ar fi întors în fondul de campanie al partidului aflat la guvernare și al aliaților acestora. De asemenea, ar fi inclus mită către directorii Petrobras în schimbul unor contracte umflate. Recuperarea „miraculoasă” a unui elev supraviețuitor al masacrului de la Peshawar Un adolescent paralizat după ce fusese împușcat de patru ori în cel mai cumplit atac terorist din Pakistan a reușit o recuperare „miraculoasă” după ce a urmat un tratament în Regatul Unit. Lui Mohamed Ibrahim Khan, în vârstă de 13 ani, doctorii din Pakistan îi spuseseră că nu va mai putea să meargă niciodată." } }
+{ "translation": { "en": "At least 140 people, mostly children, were killed when gunmen stormed Peshawar's Army Public School last December. Muhammad, who arrived in London last month for surgery, is being discharged from hospital later. Exactly nine months ago, on an ordinary Tuesday morning, Muhammad sat in his first aid class listening to his teachers intently. At the same time seven gunmen disguised in security uniforms were entering the Army Public School. They were strapped with explosives and had one simple mission in mind: Kill every man, woman and child they came across. \"I can't forget what happened that day,\" Muhammad says with a severe stare.", "ro": "Cel puțin 140 de persoane, majoritatea copii, au fost ucise când bărbați înarmați au atacat școala publică a armatei din Peshawar în luna decembrie a anului trecut. Mohamed, care a sosit la Londra luna trecută pentru operație, va fi externat mai târziu din spital. Exact cu nouă luni în urmă, într-o dimineață obișnuită de marți, Mohamed stătea la ora de primul ajutor și își asculta atent profesorii. Chiar atunci, șapte bărbați înarmați deghizați în uniformele agenților de pază intrau în școala publică a armatei. Purtau centuri cu explozivi și aveau de îndeplinit o misiune simplă: să îi ucidă pe toți bărbații, femeile și copiii care le ieșeau în cale. „Nu pot uita ce s-a întâmplat în acea zi”, spune Mohamed cu o privire aspră." } }
+{ "translation": { "en": "We were sitting in the auditorium, we were asking questions... and then we heard heavy gunfire outside. The terrorists moved inside and they started killing - our teacher was burned alive. Muhammad described pulling four other pupils out of the auditorium as the carnage unfolded. He said he then heard his friend, Hamza calling to him. He said, 'oh brother save me'. I held his hand. That's when I was shot in the back, and he was shot in the head. Most of the people killed in the attack were pupils Hamza died in Muhammad's arms. Muhammad recalled blacking out after that, and the next thing he knew he was in a hospital bed, paralysed from the waist down.", "ro": "Stăteam în amfiteatru, puneam întrebări... apoi am auzit focuri de armă afară. Teroriștii au intrat înăuntru și au început să ucidă. Profesorul nostru a fost ars de viu. Mohamed descrie cum a scos patru elevi din amfiteatru în timp ce se desfășura carnagiul. Apoi spune că și-a auzit prietenul, pe Hamza, strigându-l. Spunea „oh, frate, salvează-mă”. L-am ținut de mână. Atunci eu am fost împușcat în spate, iar el în cap. Cei mai mulți dintre cei uciși în atac erau elevi Hamza a murit în brațele lui Mohamed. Mohamed își amintește că imediat după asta a leșinat și că următorul lucru pe care l-a știut a fost că se afla pe un pat de spital, paralizat de la brâu în jos." } }
+{ "translation": { "en": "Doctors in Peshawar in northern Pakistan, and then Rawalpindi, close to the capital, told his family there was no treatment, and he would never walk again. \"Seeing him I felt like my soul had left my body,\" says Muhammad's father, Sher Khan Those nine months were the hardest in my life. But Mr Khan and his wife, Sherbano, refused to believe that their cricket-mad son would never be able to use his legs again. They campaigned, and appealed for help on Pakistani TV, gaining the support of high profile people such as cricketer turned politician Imran Khan.", "ro": "Doctorii din Peshawar din nordul Pakistanului, apoi cei din Rawalpindi, aproape de capitală, i-au spus familiei sale că nu exista tratament și că nu va mai putea merge niciodată. „Când l-am văzut, am simțit cum îmi iese sufletul”, spune Sher Khan, tatăl lui Mohamed. Acele nouă luni au fost cele mai grele din viața mea. Însă Khan și soția lui, Sherbano, au refuzat să creadă că fiul lor atât de pasionat de crichet nu-și va mai putea folosi vreodată picioarele. Au făcut o campanie și au cerut ajutor de la televiziunea pakistaneză, atrăgând sprijinul unor oameni faimoși precum Imran Khan, jucător de crichet devenit politician." } }
+{ "translation": { "en": "Finally, they were able to raise the funds to bring Muhammad to the UK and provide him with treatment at London's private Harley Street Clinic. Consultant neurosurgeon Irfan Malik described Muhammad as \"terrified\" when he first arrived at the hospital. \"He'd spent the last [few] months lying on a bed, unable to move side to side,\" says Mr Malik. He was weak, he had a pressure sore on his back. He wasn't in great shape. A vertebra at the base of Muhammad's spine was destroyed Muhammad was shot in his shoulder, his hip, and his back during the attack, damaging his lower spine - leading to paralysis.", "ro": "Într-un final, au reușit să strângă fonduri pentru a-l duce pe Mohamed în Regatul Unit și a-i oferi tratament la clinica privată Harley Street din Londra. Neurochirurgul consultant Irfan Malik l-a descris pe Mohamed drept „înspăimântat” când acesta a ajuns la spital. „Își petrecuse ultimele [câteva] luni zăcând în pat, fără să se poată mișca de pe o parte pe alta, spune Malik. Era slăbit, se pusese multă presiune pe spatele lui. Nu era într-o formă prea bună. O vertebră de la baza coloanei vertebrale a lui Mohamed fusese distrusă Mohamed fusese împușcat în umăr, în șold și în spate în timpul atacului, iar coloana vertebrală inferioară îi fusese distrusă, ducând la paralizie." } }
+{ "translation": { "en": "But during six hours of surgery, Mr Malik and his team were able to reattach nerve endings and reconstruct the damaged part of the spine. Even Mr Malik was surprised at what happened next. Exactly one week after the surgery Muhammad stood up and started taking steps and walking. We were not expecting to get that sort of excellent result. That was miraculous,\" he says. Less than two weeks after his operation, Muhammad is ready to leave hospital and start the long road to recovery. Muhammad has defied the odds and started to walk again He says he wants to build his strength and continue his education in the UK. But he says he is determined to return to Pakistan, join the army and help fight terrorism.", "ro": "Însă, în timpul unei operații care a durat șase ore, Malik și echipa lui au reușit să lege din nou terminațiile nervoase și să reconstruiască partea distrusă a coloanei. Chiar și Malik a fost surprins de ceea ce s-a întâmplat în continuare. Exact la o săptămână după operație, Mohamed s-a ridicat și a început să facă pași și să meargă. Nu ne așteptam la un rezultat atât de bun. A fost un miracol”, spune acesta. În mai puțin de două săptămâni de la operație, Mohamed este gata să părăsească spitalul și să înceapă procesul lung de recuperare. Mohamed a sfidat soarta și a început să meargă din nou Vrea să devină puternic și să își continue studiile în Regatul Unit. Însă este hotărât să revină în Pakistan, să se înroleze în armată și să lupte împotriva terorismului." } }
+{ "translation": { "en": "\"I feel like I have a second chance at life,\" he says as he shows off pictures he's drawn of guns scribbled out next to school books and pens Muhammad grows physically stronger every day but the psychological trauma he continues to endure is unimaginable. \"My anger is not diminishing\" he says. In my school little kids were killed. What was their crime? His mother, wiping a tear from her eye, caressed his head and said: \"I can see my son walking again.\" He'll be able to get on with his normal life. 'Super Voice' 4G service from Three offers better signal Three is making use of a lower frequency 4G spectrum that can travel more widely", "ro": "„Simt că am încă o șansă la viață” spune el, arătând imaginile cu arme desenate de el lângă manuale școlare și stilouri Fizic, Mohamed devine tot mai puternic în fiecare zi, însă trauma psihologică prin care trece și acum este de neimaginat. „Furia mea nu a scăzut”, mărturisește el. În școala mea au fost uciși copii mici. Ce crimă au comis ei? Mama lui își șterge o lacrimă, îl mângâie pe creștet și spune: „Îmi văd fiul mergând din nou”. Va putea să-și continue firesc viața. Serviciul 4G „Super Voice” de la Three oferă semnal mai bun Three folosește un spectru 4G cu o frecvență mai joasă, care poate acoperi o zonă mai extinsă" } }
+{ "translation": { "en": "Mobile phone provider Three has launched a UK service it says will improve reception inside buildings and in rural black spots. Its 4G Super Voice enables customers to make calls and send texts using a lower frequency spectrum. Other networks are looking into introducing the technology, known as Voice Over Long-Term Evolution (VoLTE). It currently works on only the Samsung Galaxy S5, but recent iPhone handsets will be added in the coming months. Three said up to 5.5 million customers would have access to the service by 2017.", "ro": "Furnizorul de telefonie mobilă Three a lansat în Regatul Unit un serviciu despre care spune că va îmbunătăți recepția în interiorul clădirilor și în zonele rurale fără semnal. Serviciul 4G Super Voice le permite clienților să efectueze apeluri și să trimită mesaje text folosind un spectru cu o frecvență mai joasă. Și alte rețele intenționează să introducă aceeași tehnologie, cunoscută ca „Voice Over Long-Term Evolution (VoLTE)”. Aceasta funcționează momentan doar cu Samsung Galaxy S5, însă telefoanele iPhone recente vor beneficia de ea în lunile următoare. Three menționează că până la 5,5 milioane de clienți vor avea acces la serviciu până în 2017." } }
+{ "translation": { "en": "Chief technology officer Bryn Jones said: \"By the end of the year, one million of our customers will have access to better indoor coverage and be able to use their phones in more places than ever before.\" Stars prepare for panto season Pantomime season is big business for theatres up and down the UK, with many getting ready for this year's season now. Some of the biggest names in showbusiness now take part in the yuletide theatre. Matthew Kelly and Hayley Mills will be appearing in Cinderella - one as an ugly sister, the other as fairy godmother. They reveal their panto secrets to BBC Breakfast. Steven Wilson: 'If I don't do anything, I feel this creeping guilt'", "ro": "Responsabilul șef pentru tehnologie, Bryn Jones a declarat: „Până la sfârșitul anului, un milion dintre clienții noștri vor avea acces la o acoperire mai bună în interior și își vor putea folosi telefoanele în mai multe locuri ca până acum”. Vedetele se pregătesc pentru stagiunea de pantomimă Stagiunea de pantomimă este foarte importantă pentru teatrele din tot Regatul Unit, multe dintre ele pregătindu-se acum pentru stagiunea din acest an. Acum, la teatrul de Crăciun participă unele dintre numele cele mai mari din showbusiness. Matthew Kelly și Hayley Mills vor apărea în Cenușăreasa - primul în rolul uneia dintre surorile rele, iar a doua în rolul zânei. Aceștia dezvăluie secretele pantomimei lor la BBC Breakfast. Steven Wilson: „Dacă nu fac nimic, mă simt vinovat”" } }
+{ "translation": { "en": "Steven Wilson was recently the big winner at the Progressive Music Awards Steven Wilson is often dubbed the hardest working musician in the world of progressive rock. The multi-talented musician won three prizes at this month's Progressive Music Awards in London, including album of the year for Hand. The Guardian's five-star review called it \"a smart, soulful and immersive work of art.\" Since the 1980s, Wilson has been the driving force in a number of musical projects, the best known of which is the rock band Porcupine Tree. Now, ahead of two sell-out shows at the Royal Albert Hall, Wilson is releasing a vinyl-only double LP, Transience, to showcase the \"more accessible\" side of his solo output.", "ro": "Steven Wilson a fost desemnat recent drept marele câștigător al Progressive Music Awards Steven Wilson a fost numit de multe ori drept cel mai muncitor muzician din lumea rockului progresiv. Talentatul muzician a câștigat trei premii la Progressive Music Awards, care a avut loc luna aceasta la Londra, printre care și premiul pentru cel mai bun album al anului pentru Hand. În recenzia sa de cinci stele, The Guardian a numit albumul „o operă de artă inteligentă, expresivă și captivantă”. Încă din anii 1980, Wilson este motorul mai multor proiecte muzicale, cel mai cunoscut dintre acestea fiind trupa de rock Porcupine Tree. Acum, înainte de două spectacole cu casa închisă la Royal Albert Hall, Wilson lansează un dublu LP doar în format vinil, Transience, pentru a arăta latura „mai accesibilă” a activității sale solo." } }
+{ "translation": { "en": "He tells the BBC about his love of vinyl, his busy schedule and explains how comic actor Matt Berry came to be his support act. What does vinyl mean to you? I grew up at the very tail end of the vinyl era, and at the time, I remember, we couldn't wait for CD to come along because vinyl was so frustrating. You would buy the record, take it home, and it would have a scratch, and you would have to take it back again. I love CDs, and for some kinds of music - classical for example - it is better than vinyl. But the problem with the CD and digital downloads is that there's nothing you can really cherish or treasure. Owning vinyl is like having a beautiful painting hanging in your living room.", "ro": "A povestit pentru BBC despre dragostea lui pentru viniluri și despre programul său încărcat și a explicat cum a ajuns actorul de comedie Matt Berry să îi deschidă spectacolele. Ce înseamnă vinil pentru tine? Am crescut chiar în perioada de sfârșit a erei vinilurilor și îmi amintesc că atunci abia așteptam apariția CD-ului, căci vinilul era atât de enervant. Cumpărai un disc, mergeai cu el acasă, avea o zgârietură și trebuia să îl aduci înapoi. Iubesc CD-urile, iar pentru anumite tipuri de muzică, de exemplu cea clasică, sunt mai bune decât vinilurile. Însă problema cu CD-urile și cu descărcările digitale este aceea că nu mai există nimic pe care să îl prețuiești cu adevărat. Să ai un vinil e ca și cum ai avea un tablou frumos agățat în sufragerie." } }
+{ "translation": { "en": "It's something you can hold, pore over the lyrics and immerse yourself in the art work. I thought it was just a nostalgic thing, but it can't be if kids too young to remember vinyl are enjoying that kind of experience. Do you have a piece of vinyl that you treasure? The truth is I got rid of 100% of my vinyl in the 90s. All the vinyl I have is re-bought. I started off from the perspective that I wanted to recreate the collection I had when I was 15, but it's gone beyond that. The first record which I persuaded my parents to buy for me was Electric Light Orchestra's Out of the Blue.", "ro": "E ceva ce poți ține în mână, în timp ce te lași absorbit de versuri și copleșit de actul artistic. Am crezut că e doar o chestie nostalgică, însă nu are cum să fie așa dacă unor puști prea tineri să-și amintească de viniluri le place acest gen de experiență. Ai vreun vinil la care ții în mod special? Recunosc că am scăpat de toate vinilurile în anii '90. Toate vinilurile pe care le am sunt cumpărate din nou. Am pornit de la ideea de a reface colecția pe care o aveam la 15 ani, însă am trecut de limita aceea. Primul disc pe care mi-am convins părinții să mi-l cumpere a fost Out of the Blue de la Electric Light Orchestra." } }
+{ "translation": { "en": "If I still had my original copy, it would have sentimental value, but, alas, it's in a charity shop somewhere. Steven Wilson hopes the album will be a doorway for potential new fans Why release your new compilation Transience on vinyl? It was originally conceived as an idea for Record Store Day, but we missed the boat on that. My record company had suggested I put together some of my shorter, more accessible songs. I got a bit obsessed by the idea to make something like \"an introduction to Steven Wilson,\" and I was committed to it being a vinyl-only release. Anyone who buys the vinyl does also get a high-resolution download.", "ro": "Dacă aș mai fi avut încă exemplarul inițial, acesta ar fi avut valoare sentimentală, însă, din păcate, se află pe undeva printr-un magazin de caritate. Steven Wilson speră că albumul va fi o poartă către posibili fani noi De ce ți-ai lansat noua compilație Transience pe vinil? Aceasta a fost concepută inițial ca idee pentru Ziua magazinelor de discuri, însă am ratat ocazia. Casa mea de discuri sugerase să adun câteva dintre melodiile mele mai scurte și mai accesibile. Am ajuns să fiu ușor obsedat de ideea de a face ceva gen „introducere în muzica lui Steven Wilson” și am ținut neapărat ca proiectul să fie lansat doar pe vinil. Cine cumpără vinilul primește, de asemenea, și o variantă descărcată la rezoluție înaltă." } }
+{ "translation": { "en": "Do you have a concern that the album won't show your work in a true light?", "ro": "Ești îngrijorat că albumul nu va arăta muzica ta în adevărata ei lumină?" } }
--- a/tests/fixtures/tests_samples/xsum/sample.json
+++ b/tests/fixtures/tests_samples/xsum/sample.json
--- a/tests/fixtures/vibevoice/expected_acoustic_tokenizer_results.json
+++ b/tests/fixtures/vibevoice/expected_acoustic_tokenizer_results.json
--- a/tests/fixtures/vibevoice_asr/expected_results_batch.json
+++ b/tests/fixtures/vibevoice_asr/expected_results_batch.json
@@ -0,0 +1 @@
+{"transcriptions": ["assistant\n[{\"Start\":0,\"End\":5.86,\"Speaker\":0,\"Content\":\"Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.\"}]\n", "assistant\n[{\"Start\":0,\"End\":4.82,\"Speaker\":0,\"Content\":\"Nor is Mr. Quilter's manner less interesting than his matter.\"}]\n"], "input_ids": [[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 1356, 55136, 7699, 1946, 1119, 1467, 2550, 304, 4718, 3561, 13, 151645, 198, 151644, 872, 198, 151646, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151647, 198, 1986, 374, 264, 220, 20, 13, 23, 21, 6486, 7699, 11, 4486, 1356, 3114, 432, 448, 1493, 6894, 25, 5145, 882, 11, 3972, 882, 11, 29073, 3034, 11, 8883, 151645, 198], [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 1356, 55136, 7699, 1946, 1119, 1467, 2550, 304, 4718, 3561, 13, 151645, 198, 151644, 872, 198, 151646, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151647, 198, 1986, 374, 264, 220, 19, 13, 23, 17, 6486, 7699, 11, 4486, 1356, 3114, 432, 448, 1493, 6894, 25, 5145, 882, 11, 3972, 882, 11, 29073, 3034, 11, 8883, 151645, 198]], "generated_ids": [[151644, 77091, 198, 58, 4913, 3479, 788, 15, 1335, 3727, 788, 20, 13, 23, 21, 1335, 82036, 788, 15, 1335, 2762, 3252, 12275, 13, 3406, 2044, 374, 279, 38471, 273, 315, 279, 6149, 6846, 11, 323, 582, 525, 15713, 311, 10565, 806, 41482, 1189, 25439, 151645, 198, 151643], [151644, 77091, 198, 58, 4913, 3479, 788, 15, 1335, 3727, 788, 19, 13, 23, 17, 1335, 82036, 788, 15, 1335, 2762, 3252, 32663, 374, 4392, 13, 3406, 2044, 594, 11566, 2686, 7040, 1091, 806, 4925, 1189, 25439, 151645, 198, 151643]]}
--- a/tests/fixtures/vibevoice_asr/expected_results_single.json
+++ b/tests/fixtures/vibevoice_asr/expected_results_single.json
@@ -0,0 +1 @@
+{"transcriptions": ["assistant\n[{\"Start\":0,\"End\":5.86,\"Speaker\":0,\"Content\":\"Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.\"}]\n"], "input_ids": [[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 1356, 55136, 7699, 1946, 1119, 1467, 2550, 304, 4718, 3561, 13, 151645, 198, 151644, 872, 198, 151646, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151647, 198, 1986, 374, 264, 220, 20, 13, 23, 21, 6486, 7699, 11, 4486, 1356, 3114, 432, 448, 1493, 6894, 25, 5145, 882, 11, 3972, 882, 11, 29073, 3034, 11, 8883, 151645, 198]], "generated_ids": [[151644, 77091, 198, 58, 4913, 3479, 788, 15, 1335, 3727, 788, 20, 13, 23, 21, 1335, 82036, 788, 15, 1335, 2762, 3252, 12275, 13, 3406, 2044, 374, 279, 38471, 273, 315, 279, 6149, 6846, 11, 323, 582, 525, 15713, 311, 10565, 806, 41482, 1189, 25439, 151645, 198, 151643]]}
--- a/tests/fixtures/vibevoice_asr/expected_results_with_context.json
+++ b/tests/fixtures/vibevoice_asr/expected_results_with_context.json
@@ -0,0 +1 @@
+{"audio_url": "https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav", "context_info": "About VibeVoice", "transcriptions": ["assistant\n[{\"Start\":0.0,\"End\":7.56,\"Speaker\":0,\"Content\":\"VibeVoice is this novel framework designed for generating expressive, long-form, multi-speaker, conversational audio.\"}]\n"], "input_ids": [[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 1356, 55136, 7699, 1946, 1119, 1467, 2550, 304, 4718, 3561, 13, 151645, 198, 151644, 872, 198, 151646, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151647, 198, 1986, 374, 264, 220, 22, 13, 20, 21, 6486, 7699, 11, 448, 4960, 3546, 25, 9975, 647, 23549, 51167, 271, 5501, 1356, 3114, 432, 448, 1493, 6894, 25, 5145, 882, 11, 3972, 882, 11, 29073, 3034, 11, 8883, 151645, 198]], "generated_ids": [[151644, 77091, 198, 58, 4913, 3479, 788, 15, 13, 15, 1335, 3727, 788, 22, 13, 20, 21, 1335, 82036, 788, 15, 1335, 2762, 3252, 53, 23549, 51167, 374, 419, 11514, 12626, 6188, 369, 23163, 77123, 11, 1293, 8460, 11, 7299, 52975, 4407, 11, 7517, 1663, 7699, 1189, 25439, 151645, 198, 151643]]}
--- a/tests/fixtures/vocab.json
+++ b/tests/fixtures/vocab.json
@@ -0,0 +1 @@
+{"l": 0, "o": 1, "w": 2, "e": 3, "r": 4, "s": 5, "t": 6, "i": 7, "d": 8, "n": 9, "Ġ": 10, "Ġl": 11, "Ġn": 12, "Ġlo": 13, "Ġlow": 14, "er": 15, "Ġlowest": 16, "Ġnewer": 17, "Ġwider": 18, "<unk>": 19, "<|endoftext|>": 20}
--- a/tests/fixtures/vocab.txt
+++ b/tests/fixtures/vocab.txt
@@ -0,0 +1,10 @@
+[PAD]
+[SEP]
+[MASK]
+[CLS]
+[unused3]
+[unused4]
+[unused5]
+[unused6]
+[unused7]
+[unused8]
--- a/tests/fixtures/xcodec/integration_tests.json
+++ b/tests/fixtures/xcodec/integration_tests.json
--- a/tests/generation/init.py
+++ b/tests/generation/init.py
--- a/tests/generation/test_candidate_generator.py
+++ b/tests/generation/test_candidate_generator.py
@@ -0,0 +1,338 @@
+import gc
+import unittest
+import weakref
+from unittest.mock import MagicMock
+
+import torch
+
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
+from transformers.generation.candidate_generator import (
+    AssistantToTargetTranslator,
+    AssistantVocabTranslatorCache,
+    UniversalSpeculativeDecodingGenerator,
+)
+from transformers.testing_utils import require_torch, torch_device
+
+
+@require_torch
+class TestAssistantToTargetTranslator(unittest.TestCase):
+    def setUp(self):
+        # Create mock tokenizers with predefined vocabularies
+        self.target_tokenizer = MagicMock()
+        self.assistant_tokenizer = MagicMock()
+        self.assistant_model = MagicMock(device=torch_device)
+
+        # Define mock vocabularies for the tokenizers
+        self.target_vocab = {"hello": 0, "world": 1, "foo": 2, "bar": 3}
+        self.assistant_vocab = {"hello": 0, "world": 1, "foo": 2, "baz": 4}
+
+        self.target_tokenizer.get_vocab.return_value = self.target_vocab
+        self.assistant_tokenizer.get_vocab.return_value = self.assistant_vocab
+        self.target_vocab_size = 6
+
+        # Instantiate the class under test
+        self.translator = AssistantToTargetTranslator(
+            target_tokenizer=self.target_tokenizer,
+            assistant_tokenizer=self.assistant_tokenizer,
+            target_vocab_size=self.target_vocab_size,
+            assistant_model=self.assistant_model,
+            assistant_prune_lm_head=False,
+        )
+
+    def test_get_assistant_to_target_input_ids(self):
+        """Test the mapping from assistant tokens to target tokens."""
+        expected_mapping = [0, 1, 2, self.translator.SUPPRESS_TOKEN_ID, self.translator.SUPPRESS_TOKEN_ID]
+        actual_mapping = self.translator._assistant_to_target_input_ids.tolist()
+        self.assertEqual(actual_mapping, expected_mapping)
+
+    def test_get_suppress_input_ids(self):
+        """Test the suppression of assistant input IDs not present in the target vocabulary."""
+        expected_suppress_ids = [3, 4]
+        actual_suppress_ids = self.translator._get_suppress_input_ids().tolist()
+        self.assertEqual(actual_suppress_ids, expected_suppress_ids)
+
+    def test_get_target_ids(self):
+        """Test the translation of assistant candidate IDs to target candidate IDs."""
+        assistant_input_ids = torch.LongTensor([[0, 1, 2]]).to(
+            self.assistant_model.device
+        )  # 'hello world foo' in assistant tokenizer
+        target_input_ids = torch.LongTensor([[0, 1, 2]]).to(
+            self.assistant_model.device
+        )  # 'hello world foo' in target tokenizer
+        assistant_candidate_ids = torch.LongTensor([[0, 1, 2, 4]]).to(
+            self.assistant_model.device
+        )  # 'hello world foo baz' in assistant tokenizer
+
+        expected_target_ids = torch.LongTensor(
+            [[0, 1, 2, self.translator.SUPPRESS_TOKEN_ID]]
+        ).to(
+            self.assistant_model.device
+        )  # 'hello world foo baz' in target tokenizer (baz is mapped to self.translator.suppress_tokens_id since it does not exist in target vocab)
+
+        actual_target_ids = self.translator.get_target_ids(
+            assistant_input_ids, target_input_ids, assistant_candidate_ids
+        )
+        self.assertTrue(torch.equal(actual_target_ids, expected_target_ids))
+
+    def test_get_target_logits(self):
+        """Test the conversion of assistant logits to target logits."""
+        # Assistant logits for IDs 0, 1, 2
+        assistant_logits = torch.FloatTensor([[[0.1, 0.2, 0.3, 0.4, self.translator.FILTER_VALUE]]]).to(
+            self.assistant_model.device
+        )  # Shape (1, 1, 5)
+
+        # Expected target logits (target_vocab_size = 4)
+        expected_target_logits = torch.full((1, 1, self.target_vocab_size), self.translator.FILTER_VALUE).to(
+            self.assistant_model.device
+        )
+        expected_target_logits[0, 0, 0] = 0.1  # 'hello'
+        expected_target_logits[0, 0, 1] = 0.2  # 'world'
+        expected_target_logits[0, 0, 2] = 0.3  # 'foo'
+        # The 'bar' token in target vocab remains at -inf
+
+        actual_target_logits = self.translator.get_target_logits(assistant_logits)
+        self.assertTrue(torch.equal(actual_target_logits, expected_target_logits))
+
+
+class MockTokenizer:
+    """A simple mock tokenizer class that supports weak references."""
+
+    def __init__(self, vocab=None):
+        self._vocab = vocab or {}
+
+    def get_vocab(self):
+        return self._vocab
+
+    def __call__(self, text, add_special_tokens=True):
+        # Mock implementation of the __call__ method
+        tokens = text.split()
+        input_ids = [self._vocab.get(token, 0) for token in tokens]
+        return {"input_ids": input_ids}
+
+
+@require_torch
+class TestAssistantVocabTranslatorCache(unittest.TestCase):
+    def setUp(self):
+        # Clear the cache before each test
+        AssistantVocabTranslatorCache._cache.clear()
+        # Create mock tokenizers with different vocabularies
+        self.target_tokenizer = MockTokenizer({"hello": 0, "world": 1})
+        self.assistant_tokenizer = MockTokenizer({"hello": 0, "world": 1, "foo": 2})
+        self.other_target_tokenizer = MockTokenizer({"foo": 2, "bar": 3})
+        self.other_assistant_tokenizer = MockTokenizer({"baz": 4, "qux": 5})
+        self.assistant_model = MagicMock(device=torch_device)
+
+        self.target_vocab_size = 6
+
+    def test_same_instance_for_same_tokenizers(self):
+        """Test that the same translator is returned for the same tokenizers."""
+        translator1 = AssistantVocabTranslatorCache.get_translator(
+            self.target_tokenizer,
+            self.assistant_tokenizer,
+            target_vocab_size=self.target_vocab_size,
+            assistant_model=self.assistant_model,
+            assistant_prune_lm_head=False,
+        )
+        translator2 = AssistantVocabTranslatorCache.get_translator(
+            self.target_tokenizer,
+            self.assistant_tokenizer,
+            target_vocab_size=self.target_vocab_size,
+            assistant_model=self.assistant_model,
+            assistant_prune_lm_head=False,
+        )
+        self.assertIs(translator1, translator2, "Translators should be cached and identical")
+
+    def test_different_instances_for_different_tokenizers(self):
+        """Test that different tokenizers produce different translators."""
+        translator1 = AssistantVocabTranslatorCache.get_translator(
+            self.target_tokenizer,
+            self.assistant_tokenizer,
+            target_vocab_size=self.target_vocab_size,
+            assistant_model=self.assistant_model,
+            assistant_prune_lm_head=False,
+        )
+        translator2 = AssistantVocabTranslatorCache.get_translator(
+            self.other_target_tokenizer,
+            self.other_assistant_tokenizer,
+            target_vocab_size=self.target_vocab_size,
+            assistant_model=self.assistant_model,
+            assistant_prune_lm_head=False,
+        )
+        self.assertIsNot(translator1, translator2, "Translators should differ for different tokenizers")
+
+    def test_cache_with_weakref_key(self):
+        """Ensure that the cache uses weak references as keys."""
+        initial_cache_size = len(AssistantVocabTranslatorCache._cache)
+        target_tokenizer = MockTokenizer({"hello": 0})
+        assistant_tokenizer = MockTokenizer({"hello": 0})
+
+        # Store translator in a local variable to avoid it being kept alive
+        translator = AssistantVocabTranslatorCache.get_translator(
+            target_tokenizer,
+            assistant_tokenizer,
+            target_vocab_size=self.target_vocab_size,
+            assistant_model=self.assistant_model,
+            assistant_prune_lm_head=False,
+        )
+        self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1)
+
+        # Delete all strong references
+        del target_tokenizer
+        del assistant_tokenizer
+        del translator
+
+        # Force garbage collection
+        gc.collect()
+
+        # Call cleanup to remove dead entries
+        AssistantVocabTranslatorCache.cleanup()
+
+        # The cache size remains increased due to strong references
+        self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1)
+
+    def test_weakref_cache_cleanup(self):
+        """Test that the cache cleans up translators when tokenizers are garbage collected."""
+
+        def create_translator():
+            target_tokenizer = MockTokenizer({"hello": 0})
+            assistant_tokenizer = MockTokenizer({"hello": 0})
+            translator = AssistantVocabTranslatorCache.get_translator(
+                target_tokenizer,
+                assistant_tokenizer,
+                target_vocab_size=self.target_vocab_size,
+                assistant_model=self.assistant_model,
+                assistant_prune_lm_head=False,
+            )
+            # Create weak references before returning
+            refs = (weakref.ref(translator), weakref.ref(target_tokenizer), weakref.ref(assistant_tokenizer))
+            # Remove strong references inside the function
+            del target_tokenizer
+            del assistant_tokenizer
+            del translator
+            return refs
+
+        translator_ref, target_ref, assistant_ref = create_translator()
+
+        # Force garbage collection
+        gc.collect()
+
+        # Call cleanup to remove dead entries
+        AssistantVocabTranslatorCache.cleanup()
+
+        # The tokenizers and translator are not garbage collected due to strong references
+        self.assertIsNotNone(target_ref(), "Target tokenizer should still be alive due to strong references")
+        self.assertIsNotNone(assistant_ref(), "Assistant tokenizer should still be alive due to strong references")
+        self.assertIsNotNone(translator_ref(), "Translator should still be alive due to strong references")
+
+
+@require_torch
+class TestUniversalSpeculativeDecoding(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.target_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"
+        cls.assistant_name = "hf-internal-testing/tiny-random-PhiForCausalLM"
+
+    def setUp(self):
+        self.target_tokenizer = AutoTokenizer.from_pretrained(self.target_name)
+        self.target_config = AutoConfig.from_pretrained(self.target_name)
+        self.assistant_model = AutoModelForCausalLM.from_pretrained(self.assistant_name).to(torch_device)
+        self.assistant_tokenizer = AutoTokenizer.from_pretrained(self.assistant_name)
+        self.generation_config = GenerationConfig(max_length=20, min_length=0)
+
+        # Ensure required tokens exist
+        if self.target_tokenizer.pad_token_id is None:
+            self.target_tokenizer.pad_token_id = self.target_tokenizer.eos_token_id
+        if self.target_tokenizer.bos_token_id is None:
+            self.target_tokenizer.bos_token_id = self.target_tokenizer.eos_token_id
+        if self.assistant_tokenizer.pad_token_id is None:
+            self.assistant_tokenizer.pad_token_id = self.assistant_tokenizer.eos_token_id
+        if self.assistant_tokenizer.bos_token_id is None:
+            self.assistant_tokenizer.bos_token_id = self.assistant_tokenizer.eos_token_id
+
+        self.input_ids = torch.tensor([[1, 2, 3]]).to(torch_device)
+        self.model_kwargs = {
+            "attention_mask": torch.ones_like(self.input_ids).to(torch_device),
+        }
+        atm_translator = AssistantVocabTranslatorCache.get_translator(
+            target_tokenizer=self.target_tokenizer,
+            assistant_tokenizer=self.assistant_tokenizer,
+            assistant_model=self.assistant_model,
+            target_vocab_size=self.target_config.vocab_size,
+        )
+        self.generator = UniversalSpeculativeDecodingGenerator(
+            input_ids=self.input_ids,
+            assistant_model=self.assistant_model,
+            target_tokenizer=self.target_tokenizer,
+            assistant_tokenizer=self.assistant_tokenizer,
+            generation_config=self.generation_config,
+            model_kwargs=self.model_kwargs,
+            atm_translator=atm_translator,
+        )
+
+    def test_basic_generation(self):
+        """Test basic speculative decoding works"""
+        input_text = "The quick brown fox"
+        input_ids = self.target_tokenizer.encode(input_text, return_tensors="pt")
+        self.generator.input_ids = input_ids
+        candidates, scores = self.generator.get_candidates(input_ids)
+
+        self.assertIsNotNone(candidates)
+        self.assertIsNotNone(scores)
+        self.assertTrue(torch.is_tensor(candidates))
+        self.assertTrue(torch.is_tensor(scores))
+
+    def test_mismatched_vocabularies(self):
+        """Test handling of mismatched vocabularies between models"""
+        # Create input with tokens present in main but not assistant vocab
+        # Find a token that is not in the assistant tokenizer but in
+        # the main tokenizer.
+        missing_token = next(
+            token
+            for token in self.target_tokenizer.get_vocab()
+            if token not in self.assistant_tokenizer.get_vocab()
+            and token not in self.target_tokenizer.all_special_tokens
+            and "reserved_" not in token
+        )
+        input_ids = torch.tensor([[self.target_tokenizer.convert_tokens_to_ids(missing_token)]])
+        self.generator.input_ids = input_ids
+        candidates, _ = self.generator.get_candidates(input_ids)
+        self.assertIsNotNone(candidates)
+
+    def test_speculation_depth(self):
+        """Test different speculation depths"""
+        input_ids = self.target_tokenizer.encode("Test text", return_tensors="pt")
+        self.generator.input_ids = input_ids
+
+        for depth in [1, 8, 17]:
+            self.generator.num_assistant_tokens = depth
+            candidates, _ = self.generator.get_candidates(input_ids)
+            self.assertLessEqual(candidates.shape[1] - input_ids.shape[1], depth)
+
+    def test_device_consistency(self):
+        """Test handling of inputs on different devices"""
+        input_ids = torch.tensor([[1, 2, 3]]).to(torch_device)
+        self.generator.input_ids = input_ids
+        candidates, _ = self.generator.get_candidates(input_ids)
+        self.assertEqual(candidates.device, input_ids.device)
+
+    def test_usd_vs_vanilla_sampling(cls):
+        """Test that USD matches vanilla sampling with temperature set to nearly 0"""
+        prompt = "Test text"
+
+        pipe_vanilla = pipeline(
+            "text-generation",
+            model=cls.target_name,
+        )
+        pipe_vanilla_output = pipe_vanilla(prompt, max_new_tokens=5, do_sample=False)
+        vanilla_text = pipe_vanilla_output[0]["generated_text"]
+
+        pipe_usd = pipeline(
+            "text-generation",
+            model=cls.target_name,
+            assistant_model=cls.assistant_name,
+        )
+        pipe_usd_output = pipe_usd(prompt, max_new_tokens=5, do_sample=True, temperature=1e-9)  # Nearly 0 temperature
+        usd_text = pipe_usd_output[0]["generated_text"]
+
+        # Assert that the outputs match
+        cls.assertEqual(usd_text, vanilla_text)
--- a/tests/generation/test_configuration_utils.py
+++ b/tests/generation/test_configuration_utils.py
@@ -0,0 +1,843 @@
+# Copyright 2022 The HuggingFace Team Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a clone of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import logging
+import os
+import tempfile
+import unittest
+import warnings
+
+from huggingface_hub import create_pull_request
+from parameterized import parameterized
+
+from transformers import AutoConfig, GenerationConfig, WatermarkingConfig, is_torch_available
+from transformers import logging as transformers_logging
+
+
+if is_torch_available():
+    import torch
+
+from transformers.generation import (
+    ClassifierFreeGuidanceLogitsProcessor,
+    EncoderNoRepeatNGramLogitsProcessor,
+    EncoderRepetitionPenaltyLogitsProcessor,
+    EpsilonLogitsWarper,
+    EtaLogitsWarper,
+    ExponentialDecayLengthPenalty,
+    ForcedBOSTokenLogitsProcessor,
+    ForcedEOSTokenLogitsProcessor,
+    GenerationMode,
+    MinLengthLogitsProcessor,
+    MinNewTokensLengthLogitsProcessor,
+    MinPLogitsWarper,
+    NoBadWordsLogitsProcessor,
+    NoRepeatNGramLogitsProcessor,
+    PrefixConstrainedLogitsProcessor,
+    RepetitionPenaltyLogitsProcessor,
+    SequenceBiasLogitsProcessor,
+    SuppressTokensAtBeginLogitsProcessor,
+    SuppressTokensLogitsProcessor,
+    TemperatureLogitsWarper,
+    TopKLogitsWarper,
+    TopPLogitsWarper,
+    TypicalLogitsWarper,
+    UnbatchedClassifierFreeGuidanceLogitsProcessor,
+    WatermarkLogitsProcessor,
+)
+from transformers.testing_utils import (
+    TOKEN,
+    CaptureLogger,
+    LoggingLevel,
+    TemporaryHubRepo,
+    is_staging_test,
+    torch_device,
+)
+
+
+class GenerationConfigTest(unittest.TestCase):
+    @parameterized.expand([(None,), ("foo.json",)])
+    def test_save_load_config(self, config_name):
+        config = GenerationConfig(
+            do_sample=True,
+            temperature=0.7,
+            length_penalty=1.0,
+            bad_words_ids=[[1, 2, 3], [4, 5]],
+        )
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            config.save_pretrained(tmp_dir, config_name=config_name)
+            loaded_config = GenerationConfig.from_pretrained(tmp_dir, config_name=config_name)
+
+        # Checks parameters that were specified
+        self.assertEqual(loaded_config.do_sample, True)
+        self.assertEqual(loaded_config.temperature, 0.7)
+        self.assertEqual(loaded_config.length_penalty, 1.0)
+        self.assertEqual(loaded_config.bad_words_ids, [[1, 2, 3], [4, 5]])
+
+        # Checks parameters that were not specified (defaults)
+        self.assertEqual(loaded_config.top_k, None)
+        self.assertEqual(loaded_config.max_length, None)
+        self.assertEqual(loaded_config.max_time, None)
+
+    def test_from_model_config(self):
+        model_config = AutoConfig.from_pretrained("openai-community/gpt2")
+        generation_config_from_model = GenerationConfig.from_model_config(model_config)
+        default_generation_config = GenerationConfig()
+
+        # The generation config has loaded a few non-default parameters from the model config
+        self.assertNotEqual(generation_config_from_model, default_generation_config)
+
+        # One of those parameters is eos_token_id -- check if it matches
+        self.assertNotEqual(generation_config_from_model.eos_token_id, default_generation_config.eos_token_id)
+        self.assertEqual(generation_config_from_model.eos_token_id, model_config.eos_token_id)
+
+    def test_update(self):
+        generation_config = GenerationConfig()
+        update_kwargs = {
+            "max_new_tokens": 1024,
+            "foo": "bar",
+        }
+        update_kwargs_copy = copy.deepcopy(update_kwargs)
+        unused_kwargs = generation_config.update(**update_kwargs)
+
+        # update_kwargs was not modified (no side effects)
+        self.assertEqual(update_kwargs, update_kwargs_copy)
+
+        # update_kwargs was used to update the config on valid attributes
+        self.assertEqual(generation_config.max_new_tokens, 1024)
+
+        # `.update()` returns a dictionary of unused kwargs
+        self.assertEqual(unused_kwargs, {"foo": "bar"})
+
+    def test_kwarg_init(self):
+        """Tests that we can overwrite attributes at `from_pretrained` time."""
+        default_config = GenerationConfig()
+        self.assertEqual(default_config.temperature, None)
+        self.assertEqual(default_config.do_sample, None)
+        self.assertEqual(default_config.num_beams, None)
+
+        config = GenerationConfig(
+            do_sample=True,
+            temperature=0.7,
+            length_penalty=1.0,
+            bad_words_ids=[[1, 2, 3], [4, 5]],
+        )
+        self.assertEqual(config.temperature, 0.7)
+        self.assertEqual(config.do_sample, True)
+        self.assertEqual(config.num_beams, None)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            config.save_pretrained(tmp_dir)
+            loaded_config = GenerationConfig.from_pretrained(tmp_dir, temperature=1.0)
+
+        self.assertEqual(loaded_config.temperature, 1.0)
+        self.assertEqual(loaded_config.do_sample, True)
+        self.assertEqual(loaded_config.num_beams, None)  # default value
+
+    def test_validate(self):
+        """
+        Tests that the `validate` method is working as expected. Note that `validate` is called at initialization time
+        """
+        logger = transformers_logging.get_logger("transformers.generation.configuration_utils")
+
+        # A correct configuration will not throw any warning
+        logger.warning_once.cache_clear()
+        with CaptureLogger(logger) as captured_logs:
+            GenerationConfig()
+        self.assertEqual(len(captured_logs.out), 0)
+
+        # Inconsequent but technically wrong configuration will throw a warning (e.g. requesting an extra output
+        # without `return_dict_in_generate=True`). May be escalated to an error in the future.
+        logger.warning_once.cache_clear()
+        with CaptureLogger(logger) as captured_logs:
+            GenerationConfig(return_dict_in_generate=False, output_scores=True)
+        self.assertNotEqual(len(captured_logs.out), 0)
+
+        # Explicitly setting a sampling flag alongside `do_sample=False` still warns: this is a user-level mistake.
+        logger.warning_once.cache_clear()
+        with CaptureLogger(logger) as captured_logs:
+            generation_config_bad_temperature = GenerationConfig(do_sample=False, temperature=0.5)  # store for later
+        self.assertNotEqual(len(captured_logs.out), 0)
+
+        # But a value inherited from a model's default config (i.e. not in this update's kwargs) does NOT warn: in
+        # the real world, `generate(do_sample=False)` on a model whose `generation_config.json` has `temperature=0.6`
+        # would otherwise log a useless warning.
+        logger.warning_once.cache_clear()
+        base_config = GenerationConfig(do_sample=True, temperature=0.6)  # mimics a model's default config
+        with CaptureLogger(logger) as captured_logs:
+            base_config.update(do_sample=False)
+        self.assertEqual(len(captured_logs.out), 0)
+
+        # Inverse provenance case: `do_sample=False` inherited from a model's config (so not user-set this call), user only
+        # sets a sampling flag. The conflict SHOULD produce noise because the user may think that it's non-greedy by default
+        logger.warning_once.cache_clear()
+        greedy_hub_config = GenerationConfig(do_sample=False)  # mimics a model's default config forcing greedy
+        with CaptureLogger(logger) as captured_logs:
+            greedy_hub_config.update(top_p=0.8)
+        self.assertNotEqual(len(captured_logs.out), 0)
+
+        # Updating only `temperature` (do_sample was pre-existing, i.e. "from the hub") does warn
+        logger.warning_once.cache_clear()
+        with CaptureLogger(logger) as captured_logs:
+            generation_config_bad_temperature.update(temperature=0.9)
+        self.assertNotEqual(len(captured_logs.out), 0)
+
+        # But setting both in the same `update()` call DOES warn.
+        logger.warning_once.cache_clear()
+        with CaptureLogger(logger) as captured_logs:
+            generation_config_bad_temperature.update(do_sample=False, temperature=0.9)
+        self.assertNotEqual(len(captured_logs.out), 0)
+
+        logger.warning_once.cache_clear()
+        with CaptureLogger(logger) as captured_logs:
+            # OK - None means it is unset, nothing to warn about
+            generation_config_bad_temperature.update(temperature=None)
+        self.assertEqual(len(captured_logs.out), 0)
+
+        # Impossible sets of parameters will raise an exception
+        with self.assertRaises(ValueError):
+            GenerationConfig(do_sample=False, num_beams=1, num_return_sequences=2)
+
+        # Passing `generate()`-only flags to `validate` will raise an exception
+        with self.assertRaises(ValueError):
+            GenerationConfig(logits_processor="foo")
+
+        # Model-specific parameters will NOT raise an exception or a warning
+        logger.warning_once.cache_clear()
+        with CaptureLogger(logger) as captured_logs:
+            GenerationConfig(foo="bar")
+        self.assertEqual(len(captured_logs.out), 0)
+
+        # By default we throw a short warning. However, we log with INFO level the details.
+        # Default: we don't log the incorrect input values, only a short summary. We explain how to get more details.
+        logger.warning_once.cache_clear()
+        with LoggingLevel(logging.WARNING):
+            with CaptureLogger(logger) as captured_logs:
+                GenerationConfig(do_sample=False, temperature=0.5)
+        self.assertNotIn("0.5", captured_logs.out)
+        self.assertTrue(len(captured_logs.out) < 150)  # short log
+        self.assertIn("Set `TRANSFORMERS_VERBOSITY=info` for more details", captured_logs.out)
+
+        # INFO level: we share the full deets
+        logger.warning_once.cache_clear()
+        logger.info_once.cache_clear()
+        with LoggingLevel(logging.INFO):
+            with CaptureLogger(logger) as captured_logs:
+                GenerationConfig(do_sample=False, temperature=0.5)
+        self.assertIn("0.5", captured_logs.out)
+        self.assertTrue(len(captured_logs.out) > 400)  # long log
+        self.assertNotIn("Set `TRANSFORMERS_VERBOSITY=info` for more details", captured_logs.out)
+
+        # Finally, we can set `strict=True` to raise an exception on what would otherwise be a warning.
+        generation_config = GenerationConfig()
+        generation_config.temperature = 0.5
+        generation_config.do_sample = False
+        with self.assertRaises(ValueError):
+            generation_config.validate(strict=True)
+
+    def test_validate_sampling_flag_provenance(self):
+        """
+        Dedicated coverage for the provenance-aware warning rule on sampling-only flags:
+        we only warn when BOTH `do_sample=False` AND a conflicting sampling flag (e.g. `top_p`, `temperature`)
+        were explicitly provided by the caller in the same context, or none of the 2 were directly provided, or only
+        the sampling flag is provided along do_sample=False already existing.
+        """
+        logger = transformers_logging.get_logger("transformers.generation.configuration_utils")
+
+        def _warn_count(fn):
+            logger.warning_once.cache_clear()
+            with CaptureLogger(logger) as captured:
+                fn()
+            return len(captured.out)
+
+        # 1. Hub config sets `temperature`, user does only `generate(do_sample=False)` -> NO warning.
+        #    (Emulates: model whose `generation_config.json` carries `do_sample=True, temperature=0.6`, user
+        #    explicitly asks for greedy decoding.)
+        def case_hub_temp_user_do_sample_only():
+            cfg = GenerationConfig(do_sample=True, temperature=0.6)  # stands in for the hub default
+            cfg.update(do_sample=False)
+
+        self.assertEqual(_warn_count(case_hub_temp_user_do_sample_only), 0)
+
+        # 2. User explicitly sets BOTH `do_sample=False` and `top_p=0.8` in the same call -> WARN.
+        self.assertNotEqual(_warn_count(lambda: GenerationConfig(do_sample=False, top_p=0.8)), 0)
+
+        # 3. User explicitly sets only `do_sample=False` (no sampling flag) -> NO warning, even though
+        #    attribute defaults (like `top_k=50`) may be present.
+        self.assertEqual(_warn_count(lambda: GenerationConfig(do_sample=False)), 0)
+
+        # 4. Hub config forces greedy (`do_sample=False`), user sets only `top_p=0.8` -> warnings:
+        # do_sample` was inherited, but clashes with user-expressed intent, so flagging their `top_p`
+        def case_hub_greedy_user_top_p():
+            cfg = GenerationConfig(do_sample=False)  # stands in for the hub default
+            cfg.update(top_p=0.8)
+
+        self.assertNotEqual(_warn_count(case_hub_greedy_user_top_p), 0)
+
+        # 5. User sets `do_sample=False` and `temperature=0.5` via a single `update()` call -> WARN.
+        def case_update_both_sides():
+            cfg = GenerationConfig()
+            cfg.update(do_sample=False, temperature=0.5)
+
+        self.assertNotEqual(_warn_count(case_update_both_sides), 0)
+
+        # 6. Same idea for beam flags: user only asks for `num_beams=1`, hub default has `length_penalty=0.8`
+        #    -> NO warning.
+        def case_hub_length_penalty_user_num_beams_only():
+            cfg = GenerationConfig(num_beams=4, length_penalty=0.8)  # stands in for the hub default
+            cfg.update(num_beams=1)
+
+        self.assertEqual(_warn_count(case_hub_length_penalty_user_num_beams_only), 0)
+
+        # 7. User sets BOTH `num_beams=1` and `length_penalty=0.8` explicitly -> WARN.
+        self.assertNotEqual(_warn_count(lambda: GenerationConfig(num_beams=1, length_penalty=0.8)), 0)
+
+    def test_refuse_to_save(self):
+        """Tests that we refuse to save a generation config that fails validation."""
+
+        # setting the temperature alone is invalid, as we also need to set do_sample to True -> throws a warning that
+        # is caught, doesn't save, and raises an exception
+        config = GenerationConfig()
+        config.temperature = 0.5
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with self.assertRaises(ValueError) as exc:
+                config.save_pretrained(tmp_dir)
+            self.assertTrue("Fix these issues to save the configuration." in str(exc.exception))
+            self.assertTrue("`temperature` is set to `0.5`" in str(exc.exception))
+            self.assertTrue(len(os.listdir(tmp_dir)) == 0)
+
+        # greedy decoding throws an exception if we try to return multiple sequences -> throws an exception that is
+        # caught, doesn't save, and raises a warning
+        config = GenerationConfig()
+        config.num_return_sequences = 2
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with self.assertRaises(ValueError) as exc:
+                config.save_pretrained(tmp_dir)
+            self.assertTrue("Fix these issues to save the configuration." in str(exc.exception))
+            self.assertTrue(
+                "Greedy methods (do_sample != True) without beam search do not support `num_return_sequences` different than 1"
+                in str(exc.exception)
+            )
+            self.assertTrue(len(os.listdir(tmp_dir)) == 0)
+
+        # Final check: no logs at warning level/warnings/exceptions thrown if it is correct, and file is saved.
+        config = GenerationConfig()
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Catch warnings
+            with warnings.catch_warnings(record=True) as captured_warnings:
+                # Catch logs (up to WARNING level, the default level)
+                with LoggingLevel(logging.WARNING):
+                    logger = transformers_logging.get_logger("transformers.generation.configuration_utils")
+                    with CaptureLogger(logger) as captured_logs:
+                        config.save_pretrained(tmp_dir)
+            self.assertEqual(len(captured_warnings), 0)
+            self.assertEqual(len(captured_logs.out), 0)
+            self.assertEqual(len(os.listdir(tmp_dir)), 1)
+
+    def test_generation_mode(self):
+        """Tests that the `get_generation_mode` method is working as expected."""
+        config = GenerationConfig()
+        self.assertEqual(config.get_generation_mode(), GenerationMode.GREEDY_SEARCH)
+
+        config = GenerationConfig(do_sample=True)
+        self.assertEqual(config.get_generation_mode(), GenerationMode.SAMPLE)
+
+        config = GenerationConfig(num_beams=2)
+        self.assertEqual(config.get_generation_mode(), GenerationMode.BEAM_SEARCH)
+
+        # TODO joao, manuel: remove this in v4.62.0
+        config = GenerationConfig(top_k=10, do_sample=False, penalty_alpha=0.6)
+        self.assertEqual(config.get_generation_mode(), GenerationMode.CONTRASTIVE_SEARCH)
+
+        config = GenerationConfig()
+        self.assertEqual(config.get_generation_mode(assistant_model="foo"), GenerationMode.ASSISTED_GENERATION)
+
+    def test_static_cache_without_cache_config(self):
+        """Regression test for #35026 -- static cache should work without a cache config."""
+        config = GenerationConfig(cache_implementation="static")
+        self.assertEqual(config.cache_implementation, "static")
+        self.assertEqual(config.cache_config, None)
+
+
+class GenerationConfigSerializationTest(unittest.TestCase):
+    def test_serialize_generation_sequence_bias(self):
+        """Tests that GenerationConfig is serialized and SequenceBiasLogitsProcessor is initialized with sequence_bias parameter"""
+        generation_config = GenerationConfig()
+        sequence_bias = [[[45, 67], -0.6], [[89], 1.2]]
+        generation_config.sequence_bias = sequence_bias
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.sequence_bias, sequence_bias)
+
+        expected_sequence_bias = {(45, 67): -0.6, (89,): 1.2}
+        bias_logits_processor = SequenceBiasLogitsProcessor(new_config.sequence_bias)
+        self.assertDictEqual(bias_logits_processor.sequence_bias, expected_sequence_bias)
+
+    def test_serialize_generation_min_length_eos_token(self):
+        """Tests that GenerationConfig is serialized and MinLengthLogitsProcessor is initialized with min_length and eos_token_id"""
+        eos_token_id = 0
+        min_length = 10
+
+        generation_config = GenerationConfig(min_length=min_length, eos_token_id=eos_token_id)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.min_length, min_length)
+        self.assertEqual(new_config.eos_token_id, eos_token_id)
+
+        min_dist_processor = MinLengthLogitsProcessor(
+            min_length=new_config.min_length, eos_token_id=new_config.eos_token_id
+        )
+        self.assertEqual(min_dist_processor.min_length, min_length)
+        self.assertEqual(min_dist_processor.eos_token_id, eos_token_id)
+
+    def test_serialize_generation_min_new_tokens(self):
+        """Tests that GenerationConfig is serialized and MinNewTokensLengthLogitsProcessor is initialized with min_new_tokens"""
+        eos_token_id = 0
+        min_new_tokens = 5
+        prompt_length_to_skip = 2
+
+        generation_config = GenerationConfig(min_new_tokens=min_new_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.min_new_tokens, min_new_tokens)
+
+        min_new_tokens_processor = MinNewTokensLengthLogitsProcessor(
+            prompt_length_to_skip=prompt_length_to_skip,
+            min_new_tokens=new_config.min_new_tokens,
+            eos_token_id=eos_token_id,
+        )
+        self.assertEqual(min_new_tokens_processor.min_new_tokens, min_new_tokens)
+
+    def test_serialize_generation_temperature(self):
+        """Tests that GenerationConfig is serialized and TemperatureLogitsWarper is initialized with temperature"""
+        temperature = 2.0
+
+        generation_config = GenerationConfig(temperature=temperature, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.temperature, temperature)
+
+        temperature_logits_warper = TemperatureLogitsWarper(temperature=new_config.temperature)
+        self.assertEqual(temperature_logits_warper.temperature, temperature)
+
+    def test_serialize_generation_repetition_penalty(self):
+        """Tests that GenerationConfig is serialized and RepetitionPenaltyLogitsProcessor is initialized with repetition_penalty"""
+        penalty = 2.0
+
+        generation_config = GenerationConfig(repetition_penalty=penalty)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.repetition_penalty, penalty)
+
+        rep_penalty_proc = RepetitionPenaltyLogitsProcessor(penalty=new_config.repetition_penalty)
+        self.assertEqual(rep_penalty_proc.penalty, penalty)
+
+    def test_serialize_generation_encoder_repetition_penalty(self):
+        """Tests that GenerationConfig is serialized and EncoderRepetitionPenaltyLogitsProcessor is initialized with penalty and input_ids"""
+        penalty = 2.0
+        input_ids = torch.tensor([[0, 1], [5, 0]], device=torch_device, dtype=torch.long)
+
+        generation_config = GenerationConfig(encoder_repetition_penalty=penalty)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.encoder_repetition_penalty, penalty)
+
+        rep_penalty_proc = EncoderRepetitionPenaltyLogitsProcessor(
+            penalty=new_config.encoder_repetition_penalty, encoder_input_ids=input_ids
+        )
+        self.assertEqual(rep_penalty_proc.penalty, 1 / penalty)
+        torch.testing.assert_close(rep_penalty_proc.encoder_input_ids, input_ids)
+
+    def test_serialize_generation_top_p(self):
+        """Tests that GenerationConfig is serialized and TopPLogitsWarper is initialized with top_p"""
+        top_p = 0.8
+
+        generation_config = GenerationConfig(top_p=top_p, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.top_p, top_p)
+
+        rep_penalty_proc = TopPLogitsWarper(top_p=new_config.top_p)
+        self.assertEqual(rep_penalty_proc.top_p, top_p)
+
+    def test_serialize_generation_top_k(self):
+        """Tests that GenerationConfig is serialized and TopKLogitsWarper is initialized with top_k"""
+        top_k = 2
+
+        generation_config = GenerationConfig(top_k=top_k, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.top_k, top_k)
+
+        top_k_logits_wrap = TopKLogitsWarper(top_k=new_config.top_k)
+        self.assertEqual(top_k_logits_wrap.top_k, top_k)
+
+    def test_serialize_generation_min_p(self):
+        """Tests that GenerationConfig is serialized and MinPLogitsWarper is initialized with min_p"""
+        min_p = 0.8
+
+        generation_config = GenerationConfig(min_p=min_p, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.min_p, min_p)
+
+        min_k_logits_wrap = MinPLogitsWarper(min_p=new_config.min_p)
+        self.assertEqual(min_k_logits_wrap.min_p, min_p)
+
+    def test_serialize_generation_typical_p(self):
+        """Tests that GenerationConfig is serialized and TypicalLogitsWarper is initialized with mass"""
+        mass = 0.8
+
+        generation_config = GenerationConfig(typical_p=mass, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.typical_p, mass)
+
+        typical_p_logits_wrap = TypicalLogitsWarper(mass=new_config.typical_p)
+        self.assertEqual(typical_p_logits_wrap.mass, mass)
+
+    def test_serialize_generation_epsilon_cutoff(self):
+        """Tests that GenerationConfig is serialized and EpsilonLogitsWarper is initialized with epsilon"""
+        epsilon = 0.8
+
+        generation_config = GenerationConfig(epsilon_cutoff=epsilon, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.epsilon_cutoff, epsilon)
+
+        epsilon_logits_wrap = EpsilonLogitsWarper(epsilon=new_config.epsilon_cutoff)
+        self.assertEqual(epsilon_logits_wrap.epsilon, epsilon)
+
+    def test_serialize_generation_eta_cutoff(self):
+        """Tests that GenerationConfig is serialized and EtaLogitsWarper is initialized with epsilon"""
+        epsilon = 0.8
+
+        generation_config = GenerationConfig(eta_cutoff=epsilon, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.eta_cutoff, epsilon)
+
+        eta_logits_wrap = EtaLogitsWarper(epsilon=new_config.eta_cutoff)
+        self.assertEqual(eta_logits_wrap.epsilon, epsilon)
+
+    def test_serialize_generation_ngram_size(self):
+        """Tests that GenerationConfig is serialized and NoRepeatNGramLogitsProcessor is initialized with ngram_size"""
+        ngram_size = 2
+
+        generation_config = GenerationConfig(no_repeat_ngram_size=ngram_size, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.no_repeat_ngram_size, ngram_size)
+
+        no_repeat_ngram_proc = NoRepeatNGramLogitsProcessor(ngram_size=new_config.no_repeat_ngram_size)
+        self.assertEqual(no_repeat_ngram_proc.ngram_size, ngram_size)
+
+    def test_serialize_generation_encoder_ngram_size(self):
+        """Tests that GenerationConfig is serialized and EncoderNoRepeatNGramLogitsProcessor is initialized with ngram_size"""
+        ngram_size = 2
+        input_ids = torch.tensor([[0, 1], [5, 0]], device=torch_device, dtype=torch.long)
+
+        generation_config = GenerationConfig(encoder_no_repeat_ngram_size=ngram_size, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.encoder_no_repeat_ngram_size, ngram_size)
+
+        encoder_no_repeat_ngram_proc = EncoderNoRepeatNGramLogitsProcessor(
+            encoder_ngram_size=new_config.encoder_no_repeat_ngram_size, encoder_input_ids=input_ids
+        )
+        self.assertEqual(encoder_no_repeat_ngram_proc.ngram_size, ngram_size)
+
+    def test_serialize_generation_bad_words_ids(self):
+        """Tests that GenerationConfig is serialized and NoBadWordsLogitsProcessor is initialized with bad_words_ids"""
+        bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]]
+
+        generation_config = GenerationConfig(bad_words_ids=bad_word_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.bad_words_ids, bad_word_tokens)
+
+        no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=new_config.bad_words_ids)
+        self.assertSequenceEqual(no_bad_words_dist_proc.bad_word_ids, bad_word_tokens)
+
+    def test_serialize_generation_num_beams(self):
+        """Tests that GenerationConfig is serialized and PrefixConstrainedLogitsProcessor is initialized with num_beams"""
+        num_beams = 1
+
+        def prefix_allowed_tokens_fn(batch_id, inputs_ids):
+            return [[0, 1], [2, 3]][batch_id]
+
+        generation_config = GenerationConfig(num_beams=num_beams)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.num_beams, num_beams)
+
+        prefix_constrained_logits_proc = PrefixConstrainedLogitsProcessor(
+            prefix_allowed_tokens_fn, num_beams=new_config.num_beams
+        )
+        self.assertEqual(prefix_constrained_logits_proc._num_beams, num_beams)
+
+    def test_serialize_generation_bos_token_id(self):
+        """Tests that GenerationConfig is serialized and ForcedBOSTokenLogitsProcessor is initialized with bos_token_id"""
+        bos_token_id = 0
+
+        generation_config = GenerationConfig(bos_token_id=bos_token_id)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.bos_token_id, bos_token_id)
+
+        logits_processor = ForcedBOSTokenLogitsProcessor(bos_token_id=new_config.bos_token_id)
+        self.assertEqual(logits_processor.bos_token_id, bos_token_id)
+
+    def test_serialize_generation_eos_token_id(self):
+        """Tests that GenerationConfig is serialized and ForcedEOSTokenLogitsProcessor is initialized with eos_token_id"""
+        eos_token_id = 0
+        max_length = 5
+
+        generation_config = GenerationConfig(eos_token_id=eos_token_id)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.eos_token_id, eos_token_id)
+
+        logits_processor = ForcedEOSTokenLogitsProcessor(
+            max_length=max_length, eos_token_id=new_config.eos_token_id, device=torch_device
+        )
+        self.assertEqual(logits_processor.eos_token_id, eos_token_id)
+
+    def test_serialize_generation_exponential_decay_length_penalty(self):
+        """Tests that GenerationConfig is serialized and ExponentialDecayLengthPenalty is initialized with regulation_start and regulation_factor"""
+        eos_token_id = 0
+        penalty_start = 5
+        penalty_factor = 1.1
+        input_ids_seq_length = 10
+        exponential_decay_length_penalty = (penalty_start, penalty_factor)
+
+        generation_config = GenerationConfig(exponential_decay_length_penalty=exponential_decay_length_penalty)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.exponential_decay_length_penalty, [penalty_start, penalty_factor])
+
+        exponential_decay_processor = ExponentialDecayLengthPenalty(
+            exponential_decay_length_penalty=new_config.exponential_decay_length_penalty,
+            eos_token_id=eos_token_id,
+            input_ids_seq_length=input_ids_seq_length,
+        )
+        self.assertEqual(
+            exponential_decay_processor.regulation_start, exponential_decay_length_penalty[0] + input_ids_seq_length
+        )
+        self.assertEqual(exponential_decay_processor.regulation_factor, exponential_decay_length_penalty[1])
+
+    def test_serialize_generation_begin_suppress_tokens(self):
+        """Tests that GenerationConfig is serialized and SuppressTokensAtBeginLogitsProcessor is initialized with begin_suppress_token and begin_index"""
+
+        begin_suppress_tokens = [220, 50256]
+        begin_index = 0
+        generation_config = GenerationConfig(begin_suppress_tokens=begin_suppress_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.begin_suppress_tokens, begin_suppress_tokens)
+
+        suppress_processor = SuppressTokensAtBeginLogitsProcessor(
+            begin_suppress_tokens=new_config.begin_suppress_tokens, begin_index=begin_index
+        )
+        self.assertSequenceEqual(suppress_processor.begin_suppress_tokens, begin_suppress_tokens)
+        self.assertEqual(suppress_processor.begin_index, begin_index)
+
+    def test_serialize_generation_suppress_tokens(self):
+        """Tests that GenerationConfig is serialized and SuppressTokensLogitsProcessor is initialized with suppress_token"""
+        suppress_tokens = [220, 50256]
+
+        generation_config = GenerationConfig(suppress_tokens=suppress_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.suppress_tokens, suppress_tokens)
+
+        suppress_processor = SuppressTokensLogitsProcessor(suppress_tokens=new_config.suppress_tokens)
+        self.assertSequenceEqual(suppress_processor.suppress_tokens, suppress_tokens)
+
+    def test_serialize_generation_guidance_scale(self):
+        """Tests that GenerationConfig is serialized and ClassifierFreeGuidanceLogitsProcessor is initialized with guidance_scale"""
+        guidance_scale = 2.0
+        generation_config = GenerationConfig(guidance_scale=guidance_scale)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.guidance_scale, guidance_scale)
+
+        classifier_processor = ClassifierFreeGuidanceLogitsProcessor(guidance_scale=new_config.guidance_scale)
+        self.assertEqual(classifier_processor.guidance_scale, guidance_scale)
+
+    def test_serialize_generation_guidance_scale_unbatched(self):
+        """Tests that GenerationConfig is serialized and UnbatchedClassifierFreeGuidanceLogitsProcessor is initialized with guidance_scale"""
+        guidance_scale = 2.0
+
+        input_ids = torch.LongTensor([[0]])
+
+        generation_config = GenerationConfig(guidance_scale=guidance_scale)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.guidance_scale, guidance_scale)
+
+        cfg = UnbatchedClassifierFreeGuidanceLogitsProcessor(new_config.guidance_scale, {}, input_ids)
+        self.assertEqual(cfg.guidance_scale, guidance_scale)
+
+    def test_serialize_generation_watermarking_config(self):
+        """Tests that GenerationConfig is serialized and WatermarkLogitsProcessor is initialized with WatermarkingConfig parameters"""
+
+        vocab_size = 20
+        bias = 2.0
+        greenlist_ratio = 0.5
+        hashing_key = 10
+        seeding_scheme = "lefthash"
+        context_width = 10
+        watermarking_config = WatermarkingConfig(
+            bias=bias,
+            greenlist_ratio=greenlist_ratio,
+            hashing_key=hashing_key,
+            seeding_scheme=seeding_scheme,
+            context_width=context_width,
+        )
+        generation_config = GenerationConfig(watermarking_config=watermarking_config)
+
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.watermarking_config.bias, bias)
+        self.assertEqual(new_config.watermarking_config.greenlist_ratio, greenlist_ratio)
+        self.assertEqual(new_config.watermarking_config.hashing_key, hashing_key)
+        self.assertEqual(new_config.watermarking_config.seeding_scheme, seeding_scheme)
+        self.assertEqual(new_config.watermarking_config.context_width, context_width)
+
+        watermark = WatermarkLogitsProcessor(
+            vocab_size=vocab_size,
+            device=torch_device,
+            greenlist_ratio=new_config.watermarking_config.greenlist_ratio,
+            bias=new_config.watermarking_config.bias,
+            hashing_key=new_config.watermarking_config.hashing_key,
+            seeding_scheme=new_config.watermarking_config.seeding_scheme,
+            context_width=new_config.watermarking_config.context_width,
+        )
+        self.assertEqual(watermark.bias, bias)
+        self.assertEqual(watermark.greenlist_size, int(vocab_size * greenlist_ratio))
+        self.assertEqual(watermark.hash_key, hashing_key)
+        self.assertEqual(watermark.seeding_scheme, seeding_scheme)
+        self.assertEqual(watermark.context_width, context_width)
+
+
+@is_staging_test
+class ConfigPushToHubTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._token = TOKEN
+
+    def test_push_to_hub(self):
+        with TemporaryHubRepo(token=self._token) as tmp_repo:
+            config = GenerationConfig(
+                do_sample=True,
+                temperature=0.7,
+                length_penalty=1.0,
+            )
+            config.push_to_hub(tmp_repo.repo_id, token=self._token)
+
+            new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
+            for k, v in config.to_dict().items():
+                if k != "transformers_version":
+                    self.assertEqual(v, getattr(new_config, k))
+
+    def test_push_to_hub_via_save_pretrained(self):
+        with TemporaryHubRepo(token=self._token) as tmp_repo:
+            config = GenerationConfig(
+                do_sample=True,
+                temperature=0.7,
+                length_penalty=1.0,
+            )
+            # Push to hub via save_pretrained
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                config.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
+
+            new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
+            for k, v in config.to_dict().items():
+                if k != "transformers_version":
+                    self.assertEqual(v, getattr(new_config, k))
+
+    def test_push_to_hub_in_organization(self):
+        with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
+            config = GenerationConfig(
+                do_sample=True,
+                temperature=0.7,
+                length_penalty=1.0,
+            )
+            config.push_to_hub(tmp_repo.repo_id, token=self._token)
+
+            new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
+            for k, v in config.to_dict().items():
+                if k != "transformers_version":
+                    self.assertEqual(v, getattr(new_config, k))
+
+    def test_push_to_hub_in_organization_via_save_pretrained(self):
+        with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
+            config = GenerationConfig(
+                do_sample=True,
+                temperature=0.7,
+                length_penalty=1.0,
+            )
+            # Push to hub via save_pretrained
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                config.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
+
+            new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id)
+            for k, v in config.to_dict().items():
+                if k != "transformers_version":
+                    self.assertEqual(v, getattr(new_config, k))
+
+    def test_push_to_hub_on_pr_revision(self):
+        with TemporaryHubRepo(token=self._token) as tmp_repo:
+            # create a PR
+            pr = create_pull_request(repo_id=tmp_repo.repo_id, title="Test PR", token=self._token)
+            revision = f"refs/pr/{pr.num}"
+
+            # push to PR ref
+            config = GenerationConfig(
+                do_sample=True,
+                temperature=0.7,
+                length_penalty=1.0,
+            )
+            config.push_to_hub(tmp_repo.repo_id, token=self._token, revision=revision)
+
+            # load from PR ref
+            new_config = GenerationConfig.from_pretrained(tmp_repo.repo_id, revision=revision)
+            for k, v in config.to_dict().items():
+                if k != "transformers_version":
+                    self.assertEqual(v, getattr(new_config, k))
--- a/tests/generation/test_continuous_batching.py
+++ b/tests/generation/test_continuous_batching.py
--- a/tests/generation/test_flash_attention_parity.py
+++ b/tests/generation/test_flash_attention_parity.py
@@ -0,0 +1,176 @@
+# Copyright 2025 Eduard Durech, SGLang, and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage:
+# RUN_SLOW=1 pytest -s tests/generation/test_flash_attention_parity.py
+
+import unittest
+from collections import defaultdict
+
+import pytest
+import torch
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.testing_utils import require_all_flash_attn, require_torch_gpu, slow
+
+
+class FlashAttentionParityTest(unittest.TestCase):
+    # From https://github.com/sgl-project/sglang/blob/main/python/sglang/test/test_utils.py
+    def _lcs(self, X, Y):
+        m = len(X)
+        n = len(Y)
+        L = [[0] * (n + 1) for _ in range(m + 1)]
+
+        for i in range(m + 1):
+            for j in range(n + 1):
+                if i == 0 or j == 0:
+                    L[i][j] = 0
+                elif X[i - 1] == Y[j - 1]:
+                    L[i][j] = L[i - 1][j - 1] + 1
+                else:
+                    L[i][j] = max(L[i - 1][j], L[i][j - 1])
+
+        return L[m][n]
+
+    # From https://github.com/sgl-project/sglang/blob/main/python/sglang/test/test_utils.py
+    def _calculate_rouge_l(self, output_strs_list1, output_strs_list2):
+        rouge_l_scores = []
+
+        for s1, s2 in zip(output_strs_list1, output_strs_list2):
+            lcs_len = self._lcs(s1, s2)
+            precision = lcs_len / len(s1) if len(s1) > 0 else 0
+            recall = lcs_len / len(s2) if len(s2) > 0 else 0
+            if precision + recall > 0:
+                fmeasure = (2 * precision * recall) / (precision + recall)
+            else:
+                fmeasure = 0.0
+            rouge_l_scores.append(fmeasure)
+
+        return rouge_l_scores
+
+    def _benchmark_generation(self, model, inputs, n_warmup=3, n_runs=5):
+        for _ in range(n_warmup):
+            model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        torch.cuda.synchronize()
+
+        start_time = torch.cuda.Event(enable_timing=True)
+        end_time = torch.cuda.Event(enable_timing=True)
+
+        start_time.record()
+        for _ in range(n_runs):
+            model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        end_time.record()
+        torch.cuda.synchronize()
+
+        return start_time.elapsed_time(end_time) / n_runs
+
+    @slow
+    @require_torch_gpu
+    @require_all_flash_attn
+    @pytest.mark.all_flash_attn_test
+    def test_flash_attention_parity(self):
+        flash_attn_versions = [2, 3, 4]
+
+        model_id = "meta-llama/Llama-3.2-1B-Instruct"
+        prompt = ["The ETH AI Center is", "What is life?"]
+
+        # 1. Load model and tokenizer
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            dtype=torch.bfloat16,
+            device_map="auto",
+            attn_implementation="flash_attention_2",
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+
+        # 2. Generate with both models
+        inputs = tokenizer(prompt, padding=True, padding_side="left", return_tensors="pt").to("cuda")
+
+        logits = {}
+        logprobs = {}
+        outputs = defaultdict(list)
+        with torch.no_grad():
+
+            def generate(model, version, outputs, logits, logprobs):
+                model.set_attn_implementation(f"flash_attention_{version}")
+                output = model.generate(
+                    **inputs, max_new_tokens=20, do_sample=False, output_scores=True, return_dict_in_generate=True
+                )
+                logit = torch.stack(output.scores)
+                logprob = torch.nn.functional.log_softmax(logit, dim=-1)
+
+                for i in range(len(prompt)):
+                    outputs[version].append(tokenizer.decode(output.sequences[i], skip_special_tokens=True))
+                logits[version] = logit
+                logprobs[version] = logprob
+
+            for version in flash_attn_versions:
+                generate(model, version, outputs, logits, logprobs)
+
+        # 3. Correctness check
+        # 3a. Logits
+        # FA2 as base to compare against
+        logits_1 = logits[2]
+        logprobs_1 = logprobs[2]
+        max_logprob_diffs = []
+        for version in range(1, len(flash_attn_versions)):
+            logits_x = logits[flash_attn_versions[version]]
+            logprobs_x = logprobs[flash_attn_versions[version]]
+            max_logprob_diffs.append(torch.max(torch.abs(logprobs_1 - logprobs_x)).item())
+
+            # Only 80% need to pass the tolerance (big model with several steps)
+            atol, fraction = 4e-2, 0.8
+            logits_ok = (torch.abs(logits_1 - logits_x) <= atol).float().mean().item()
+            assert logits_ok >= fraction, (
+                f"FA{flash_attn_versions[version]} logits pass fraction {logits_ok:.6f} < {fraction:.6f}"
+            )
+
+        # 3b. Generated text
+        # FA2 as base to compare against
+        texts_1 = outputs[2]
+        rouge_scores = []
+        for version in range(1, len(flash_attn_versions)):
+            fa_version = flash_attn_versions[version]
+            texts_x = outputs[fa_version]
+            rouge_score = self._calculate_rouge_l(texts_1, texts_x)
+            for idx, score in enumerate(rouge_score):
+                assert score > 0.99, (
+                    f"Generated texts at prompt {idx} do not match (ROUGE-L: {score}) comparing FA2 vs FA{fa_version}"
+                )
+            rouge_scores.append(self._calculate_rouge_l(texts_1, texts_x))
+
+        # 4. Performance check
+        times = []
+        with torch.no_grad():
+            for version in flash_attn_versions:
+                model.set_attn_implementation(f"flash_attention_{version}")
+                times.append(self._benchmark_generation(model, inputs))
+
+        # Summary
+        print(f"\n--- Flash Attention Parity Test on {model_id} ---")
+        print(f"Prompts: '{prompt}'")
+        print("\nGenerated texts:")
+        for version in flash_attn_versions:
+            print(f"    With FA{version}: {outputs[version]}")
+        print("\nROUGE-L scores:")
+        for idx, version in enumerate(range(1, len(flash_attn_versions))):
+            print(f"    Between FA2 and FA{flash_attn_versions[version]}: {rouge_scores[idx]}")
+        print("\nMax absolute difference in logprobs:")
+        for idx, version in enumerate(range(1, len(flash_attn_versions))):
+            print(f"    Between FA2 and FA{flash_attn_versions[version]}: {max_logprob_diffs[idx]:.5e}")
+        print("\nLatency:")
+        for idx, version in enumerate(flash_attn_versions):
+            print(f"    With FA{version}: {times[idx]}")
+        print("---")
--- a/tests/generation/test_logits_process.py
+++ b/tests/generation/test_logits_process.py
--- a/tests/generation/test_paged_attention.py
+++ b/tests/generation/test_paged_attention.py
@@ -0,0 +1,168 @@
+import time
+import unittest
+
+from parameterized import parameterized
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+from transformers.generation.configuration_utils import ContinuousBatchingConfig
+from transformers.testing_utils import Expectations, slow
+
+
+_TEST_PROMPTS = [
+    "A man is a walking his dog down the street, and a the turn he sees",
+    "Describe a fruit that is of orange color and round. It is a sweet fruit and a great source of Vitamine C. The fruit I'm thinking of is an",
+    "A plane is flying high in the sky, out of the window are clouds and mountains. Where could the plane be located?",
+    "Please fill in the form to",
+    "For safety reasons, the train is stopped in the middle of the",
+]
+
+_EXPECTED_OUTPUTS = Expectations(
+    {
+        ("cpu", None): [  # FIXME: CPU tests only pass for eager and flex. Maybe the test should be re-thought.
+            "a woman standing on the sidewalk, looking at him. He is immediately drawn to her and feels a strong attraction. He walks up to her and strikes",
+            "orange.\n\n## Step 1: Identify the key characteristics of the fruit\nThe fruit is described as being orange in color and round in shape.\n\n##",
+            "This riddle is a classic example of a lateral thinking puzzle, which requires the test-taker to think creatively and consider multiple possibilities. The answer",
+            "get in touch with us. We will respond to your message as soon as possible.\n\n[Your Name]\n[Your Email]\n[Your Phone Number]",
+            "track. The train is stopped because of a mechanical failure. The train is stopped because of a mechanical failure. The train is stopped because of a mechanical",
+            # TODO: investigate why that last expectation seems incorrect
+        ],
+        ("cuda", (9, 0)): [  # A10 and H100
+            "a woman standing on the sidewalk, looking at him. He is immediately drawn to her and feels a strong attraction. He walks up to her and strikes",
+            "orange.\n\n## Step 1: Identify the key characteristics of the fruit\nThe fruit is described as being orange in color and round in shape.\n\n##",
+            "This riddle is a classic example of a lateral thinking puzzle, which requires the test-taker to think creatively and consider multiple possibilities. The answer",
+            "get in touch with us. We will respond to your message as soon as possible.\n\n[Your Name]\n[Your Email]\n[Your Phone Number]",
+            # The last prompt sits on a numerical boundary: eager/flex produce "does", sdpa/fa2 produce "will".
+            # We use a tuple to accept either variant.
+            (
+                "track. The train is stopped for 30 minutes. The train is moving at a speed of 60 km/h. How many kilometers does the train",
+                "track. The train is stopped for 30 minutes. The train is moving at a speed of 60 km/h. How many kilometers will the train",
+            ),
+        ],
+    }
+)
+
+
+@slow
+class TestBatchGeneration(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = AutoModelForCausalLM.from_pretrained(
+            "meta-llama/Llama-3.2-3b-Instruct", dtype="bfloat16", device_map="cuda"
+        ).eval()
+
+        cls.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3b-Instruct", padding_side="left")
+
+        if cls.tokenizer.pad_token is None:
+            cls.tokenizer.pad_token = cls.tokenizer.eos_token
+            eos_id = cls.model.config.eos_token_id
+            cls.model.config.pad_token_id = eos_id[0] if isinstance(eos_id, list) else eos_id
+
+        cls.model.use_cache = False
+
+    @parameterized.expand(
+        [
+            ("paged|eager", 64, 128, 64),
+            ("paged|sdpa", 32, 256, 128),
+            ("paged|flash_attention_2", 16, 512, 256),
+            ("paged|flex_attention", 64, 128, 64),
+        ]
+    )
+    def test_generate_batch_consistency(self, attn_impl, num_blocks, block_size, max_batch_tokens):
+        self.model.config.attn_implementation = attn_impl
+
+        cb_config = ContinuousBatchingConfig(
+            num_blocks=num_blocks,
+            block_size=block_size,
+            max_batch_tokens=max_batch_tokens,
+        )
+        generation_config = GenerationConfig(
+            max_new_tokens=30,
+            top_k=0,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=False,
+        )
+
+        tokenized = self.tokenizer(_TEST_PROMPTS, truncation=True, max_length=512)
+        batch_inputs = list(tokenized["input_ids"])
+
+        batch_outputs = self.model.generate_batch(
+            inputs=batch_inputs,
+            generation_config=generation_config,
+            continuous_batching_config=cb_config,
+        )
+
+        expected_outputs = _EXPECTED_OUTPUTS.get_expectation()
+
+        for i, (output, expected_output) in enumerate(zip(batch_outputs.values(), expected_outputs)):
+            generated = self.tokenizer.decode(output.generated_tokens, skip_special_tokens=False).strip()
+            expected_output = (expected_output.strip(),) if isinstance(expected_output, str) else expected_output
+            self.assertIn(
+                generated,
+                [e.strip() for e in expected_output],
+                msg=f"[{attn_impl}] Mismatch in request {i}:\nExpected one of: {expected_output}\nGot: {generated}",
+            )
+
+    @parameterized.expand(
+        [
+            ("paged|eager", 64, 128, 64),
+            ("paged|sdpa", 32, 256, 128),
+            ("paged|flash_attention_2", 16, 512, 256),
+            ("paged|flex_attention", 64, 128, 64),
+        ]
+    )
+    def test_generate_batch_with_sampling(self, attn_impl, num_blocks, block_size, max_batch_tokens):
+        """Test batch generation with do_sampling=True to verify sampling works correctly."""
+        self.model.config.attn_implementation = attn_impl
+
+        cb_config = ContinuousBatchingConfig(
+            num_blocks=num_blocks,
+            block_size=block_size,
+            max_batch_tokens=max_batch_tokens,
+        )
+        generation_config = GenerationConfig(
+            max_new_tokens=30,
+            do_sample=True,
+            top_k=50,
+            top_p=0.9,
+            temperature=0.8,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=False,
+        )
+
+        tokenized = self.tokenizer(_TEST_PROMPTS, truncation=True, max_length=512)  # Use fewer prompts for faster test
+        batch_inputs = list(tokenized["input_ids"])
+
+        start = time.time()
+        batch_outputs = self.model.generate_batch(
+            inputs=batch_inputs,
+            generation_config=generation_config,
+            continuous_batching_config=cb_config,
+        )
+        end = time.time()
+        print(
+            f"\n[{attn_impl}] Sampling batch took {end - start:.2f}s with config: blocks={num_blocks}, block_size={block_size}, max_batch_tokens={max_batch_tokens}"
+        )
+
+        # With sampling enabled, we can't check exact outputs, but we should verify:
+        # 1. All requests completed successfully
+        # 2. Generated text is non-empty
+        # 3. Generated text is different from greedy (demonstrating sampling is working)
+        self.assertEqual(len(batch_outputs), len(batch_inputs), f"[{attn_impl}] Not all requests completed")
+
+        for i, req_id in enumerate(batch_outputs):
+            generated = self.tokenizer.decode(
+                batch_outputs[req_id].generated_tokens, skip_special_tokens=False
+            ).strip()
+            self.assertTrue(
+                len(generated) > 0,
+                msg=f"[{attn_impl}] Empty output for request {i}",
+            )
+            # Check that we got at least some tokens generated
+            generated_tokens = batch_outputs[req_id].generated_tokens
+            self.assertGreater(
+                len(generated_tokens),
+                0,
+                msg=f"[{attn_impl}] No tokens generated for request {i}",
+            )
--- a/tests/generation/test_stopping_criteria.py
+++ b/tests/generation/test_stopping_criteria.py
@@ -0,0 +1,289 @@
+# Copyright 2020 The HuggingFace Team Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a clone of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import unittest
+
+from transformers import AutoTokenizer, is_torch_available
+from transformers.testing_utils import require_torch, torch_device
+
+from ..test_modeling_common import ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers.generation import (
+        ConfidenceCriteria,
+        EosTokenCriteria,
+        MaxLengthCriteria,
+        MaxTimeCriteria,
+        StoppingCriteriaList,
+        StopStringCriteria,
+        validate_stopping_criteria,
+    )
+
+
+@require_torch
+class StoppingCriteriaTestCase(unittest.TestCase):
+    def _get_tensors(self, length):
+        batch_size = 3
+        vocab_size = 250
+
+        input_ids = ids_tensor((batch_size, length), vocab_size)
+        scores = torch.ones((batch_size, length), device=torch_device, dtype=torch.float) / length
+        return input_ids, scores
+
+    def test_list_criteria(self):
+        input_ids, scores = self._get_tensors(5)
+
+        criteria = StoppingCriteriaList(
+            [
+                MaxLengthCriteria(max_length=10),
+                MaxTimeCriteria(max_time=0.1),
+            ]
+        )
+
+        self.assertFalse(all(criteria(input_ids, scores)))
+
+        input_ids, scores = self._get_tensors(9)
+        self.assertFalse(all(criteria(input_ids, scores)))
+
+        input_ids, scores = self._get_tensors(10)
+        self.assertTrue(all(criteria(input_ids, scores)))
+
+    def test_max_length_criteria(self):
+        criteria = MaxLengthCriteria(max_length=10)
+
+        input_ids, scores = self._get_tensors(5)
+        self.assertFalse(all(criteria(input_ids, scores)))
+
+        input_ids, scores = self._get_tensors(9)
+        self.assertFalse(all(criteria(input_ids, scores)))
+
+        input_ids, scores = self._get_tensors(10)
+        self.assertTrue(all(criteria(input_ids, scores)))
+
+    def test_max_time_criteria(self):
+        input_ids, scores = self._get_tensors(5)
+
+        criteria = MaxTimeCriteria(max_time=0.1)
+        self.assertFalse(all(criteria(input_ids, scores)))
+
+        criteria = MaxTimeCriteria(max_time=0.1, initial_timestamp=time.time() - 0.2)
+        self.assertTrue(all(criteria(input_ids, scores)))
+
+    def test_eos_token_criteria(self):
+        criteria = EosTokenCriteria(eos_token_id=0)
+
+        input_ids, scores = self._get_tensors(5)
+        input_ids[:, -1] = 0
+        self.assertTrue(all(criteria(input_ids, scores)))
+
+        input_ids, scores = self._get_tensors(5)
+        input_ids[:2, -1] = 0
+        input_ids[2, -1] = 1
+        self.assertListEqual(criteria(input_ids, scores).tolist(), [True, True, False])
+
+        input_ids, scores = self._get_tensors(5)
+        input_ids[:, -1] = 1
+        self.assertListEqual(criteria(input_ids, scores).tolist(), [False, False, False])
+
+    def test_confidence_criteria(self):
+        criteria = ConfidenceCriteria(assistant_confidence_threshold=0.5)
+
+        vocab_size = 250
+        length = 5
+
+        input_ids = ids_tensor((1, length), vocab_size)
+        scores = (torch.randn((1, vocab_size)),)
+
+        # Simulate high confidence by setting the probability of the last token to be high
+        scores[0][0, input_ids[0, -1]] = 10.0  # Logits before softmax
+        self.assertFalse(criteria(input_ids, scores))
+
+        # Simulate low confidence by setting the probability of the last token to be low
+        scores[0][0, input_ids[0, -1]] = -10.0  # Logits before softmax
+        self.assertTrue(criteria(input_ids, scores))
+
+    def test_validate_stopping_criteria(self):
+        validate_stopping_criteria(StoppingCriteriaList([MaxLengthCriteria(10)]), 10)
+
+        with self.assertWarns(UserWarning):
+            validate_stopping_criteria(StoppingCriteriaList([MaxLengthCriteria(10)]), 11)
+
+        stopping_criteria = validate_stopping_criteria(StoppingCriteriaList(), 11)
+
+        self.assertEqual(len(stopping_criteria), 1)
+
+    def test_stop_string_criteria(self):
+        true_strings = [
+            "<|im_start|><|im_end|>",
+            "<|im_start|><|im_end|<|im_end|>",
+            ">><|im_start|>>stop",
+            "stop",
+            "e nd",
+        ]
+        false_strings = [
+            "<|im_start|><|im_end|",
+            "<|im_start|><|im_end|<|im_end|",
+            "<|im_end|><|im_start|>",
+            "<|im_end|<>stop<|im_end|",
+            "end",
+            "en d",
+            "eNd",
+            "<|im_end|",
+            "|im_end|>",
+            "s",
+        ]
+        stop_strings = ["<|im_end|>", "stop", "e nd"]
+
+        # Use a tokenizer that won't actually have special tokens for these
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        tokenizer.padding_side = "left"
+        true_input_ids = tokenizer(true_strings, return_tensors="pt", padding="longest", add_special_tokens=False)
+        false_input_ids = tokenizer(false_strings, return_tensors="pt", padding="longest", add_special_tokens=False)
+
+        scores = None
+        criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings)
+        for i in range(len(true_strings)):
+            self.assertTrue(criteria(true_input_ids["input_ids"][i : i + 1], scores))
+        for i in range(len(false_strings)):
+            self.assertFalse(criteria(false_input_ids["input_ids"][i : i + 1], scores))
+
+        # Now try it with a tokenizer where those are actually special tokens
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+        tokenizer.padding_side = "left"
+        true_input_ids = tokenizer(true_strings, return_tensors="pt", padding="longest", add_special_tokens=False)
+        false_input_ids = tokenizer(false_strings, return_tensors="pt", padding="longest", add_special_tokens=False)
+
+        criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings)
+        for i in range(len(true_strings)):
+            self.assertTrue(criteria(true_input_ids["input_ids"][i : i + 1], scores))
+        for i in range(len(false_strings)):
+            self.assertFalse(criteria(false_input_ids["input_ids"][i : i + 1], scores))
+
+    def test_stop_string_criteria_vocab_size_mismatch(self):
+        """Test that StopStringCriteria handles tokens above len(tokenizer) correctly."""
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+
+        # Create input_ids with tokens above len(tokenizer)
+        input_ids = torch.tensor([[len(tokenizer) + 1024, 1, 2]], device=torch_device)
+        scores = None
+        criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=["test"])
+
+        # This should not raise an error and should return False since no stop string is matched
+        self.assertFalse(criteria(input_ids, scores))
+
+    def test_stop_string_matching_positions(self):
+        stop_string = "stop"
+        token_list = ["last", "top", "topper", "s", "p"]
+        token_indices = list(range(len(token_list)))
+        all_token_valid_positions, all_token_end_overlaps = StopStringCriteria._stop_string_get_matching_positions(
+            token_list=token_list, token_indices=token_indices, stop_strings=[stop_string]
+        )
+        valid_positions = {
+            token_list[idx]: positions for idx, positions in all_token_valid_positions[stop_string].items()
+        }
+        end_overlaps = {token_list[idx]: overlaps for idx, overlaps in all_token_end_overlaps[stop_string].items()}
+        self.assertEqual(valid_positions, {"s": [3], "last": [2]})
+        self.assertEqual(end_overlaps, {"top": [3], "topper": [3], "p": [1]})
+
+    def test_stop_string_embedding_vecs(self):
+        stop_string = "stop"
+        token_list = ["last", "top", "topper", "s", "p"]
+        token_indices = list(range(len(token_list)))
+        embedding_vec, max_valid_positions, max_valid_end_lens = StopStringCriteria._stop_string_create_embedding_vec(
+            token_list=token_list, token_indices=token_indices, stop_strings=[stop_string]
+        )
+
+        # Positions inside the stop string where the token matches (excluding end overlaps)
+        valid_positions = embedding_vec[:, 0].tolist()
+        self.assertEqual(valid_positions, [2, -1, -1, 3, -1, -1])
+
+        # Overlap lengths between end of stop string and start of token
+        end_overlaps = embedding_vec[:, 1].tolist()
+        self.assertEqual(end_overlaps, [-1, 3, 3, -1, 1, -1])
+
+        # Length of each token
+        token_lengths = embedding_vec[:-1, 2].tolist()
+        self.assertEqual(token_lengths, [len(token) for token in token_list])
+
+    def test_single_letter_stop_string(self):
+        true_strings = ["a", "baa", "abc"]  # "abc" is a single token
+        false_strings = ["abbbbbbb", "b"]  # "abbbbbbb" is split into multiple tokens
+        stop_strings = ["a"]
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=False)
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        tokenizer.padding_side = "left"
+
+        true_input_ids = tokenizer(true_strings, return_tensors="pt", padding="longest", add_special_tokens=False)
+        false_input_ids = tokenizer(false_strings, return_tensors="pt", padding="longest", add_special_tokens=False)
+
+        scores = None
+        criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings)
+        for input_ids in true_input_ids["input_ids"]:
+            self.assertTrue(criteria(input_ids.unsqueeze(0), scores))
+        for input_ids in false_input_ids["input_ids"]:
+            self.assertFalse(criteria(input_ids.unsqueeze(0), scores))
+
+    def test_criteria_per_row(self):
+        text = "They completed the challenging puzzle, revealing the hidden image at the end"
+        stop_strings = ["end"]
+
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
+
+        scores = None
+        criteria = StoppingCriteriaList(
+            [
+                MaxLengthCriteria(max_length=20),
+                StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings),
+            ]
+        )
+
+        # trigger stopping when at least one criteria is satisfied, one value per batch
+        self.assertTrue(criteria(inputs["input_ids"], scores))
+
+        # return False when neither is satisfied
+        self.assertFalse(criteria(inputs["input_ids"][:, :-1], scores))
+
+    def test_criteria_per_row_batched(self):
+        text = [
+            "They completed the challenging puzzle, revealing the hidden image at the end",
+            "Today a dragon flew over France",
+            "The aroma of freshly baked pizza filled the kitchen",
+        ]
+        stop_strings = ["end"]
+
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        tokenizer.padding_side = "left"
+        inputs = tokenizer(text, return_tensors="pt", padding="longest", add_special_tokens=False)
+
+        scores = None
+        criteria = StoppingCriteriaList(
+            [
+                MaxLengthCriteria(max_length=20),
+                StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings),
+            ]
+        )
+
+        # trigger stopping when at least one criteria is satisfied
+        self.assertListEqual(criteria(inputs["input_ids"], scores).tolist(), [True, False, False])
+
+        # False when neither is satisfied
+        self.assertListEqual(criteria(inputs["input_ids"][:, :-1], scores).tolist(), [False, False, False])
--- a/tests/generation/test_streamers.py
+++ b/tests/generation/test_streamers.py
@@ -0,0 +1,174 @@
+# Copyright 2023 The HuggingFace Team Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a clone of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from queue import Empty
+from threading import Thread
+from unittest.mock import patch
+
+import pytest
+
+from transformers import (
+    AsyncTextIteratorStreamer,
+    AutoTokenizer,
+    TextIteratorStreamer,
+    TextStreamer,
+    is_torch_available,
+)
+from transformers.testing_utils import CaptureStdout, require_torch, torch_device
+from transformers.utils.logging import _get_library_root_logger
+
+from ..test_modeling_common import ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import AutoModelForCausalLM
+
+
+@require_torch
+class StreamerTester(unittest.TestCase):
+    def test_text_streamer_matches_non_streaming(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False)
+        greedy_text = tokenizer.decode(greedy_ids[0])
+
+        with CaptureStdout() as cs:
+            streamer = TextStreamer(tokenizer)
+            model.generate(input_ids, max_new_tokens=10, do_sample=False, streamer=streamer)
+        # The greedy text should be printed to stdout, except for the final "\n" in the streamer
+        streamer_text = cs.out[:-1]
+
+        self.assertEqual(streamer_text, greedy_text)
+
+    def test_iterator_streamer_matches_non_streaming(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False)
+        greedy_text = tokenizer.decode(greedy_ids[0])
+
+        streamer = TextIteratorStreamer(tokenizer)
+        generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        streamer_text = ""
+        for new_text in streamer:
+            streamer_text += new_text
+
+        self.assertEqual(streamer_text, greedy_text)
+
+    def test_text_streamer_skip_prompt(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False)
+        new_greedy_ids = greedy_ids[:, input_ids.shape[1] :]
+        new_greedy_text = tokenizer.decode(new_greedy_ids[0])
+
+        with CaptureStdout() as cs:
+            streamer = TextStreamer(tokenizer, skip_prompt=True)
+            model.generate(input_ids, max_new_tokens=10, do_sample=False, streamer=streamer)
+        # The greedy text should be printed to stdout, except for the final "\n" in the streamer
+        streamer_text = cs.out[:-1]
+
+        self.assertEqual(streamer_text, new_greedy_text)
+
+    def test_text_streamer_decode_kwargs(self):
+        # Tests that we can pass `decode_kwargs` to the streamer to control how the tokens are decoded. Must be tested
+        # with actual models -- the dummy models' tokenizers are not aligned with their models, and
+        # `skip_special_tokens=True` has no effect on them
+        tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+        model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = torch.ones((1, 5), device=torch_device).long() * model.config.bos_token_id
+
+        root = _get_library_root_logger()
+        with patch.object(root, "propagate", False):
+            with CaptureStdout() as cs:
+                streamer = TextStreamer(tokenizer, skip_special_tokens=True)
+                model.generate(input_ids, max_new_tokens=1, do_sample=False, streamer=streamer)
+
+        # The prompt contains a special token, so the streamer should not print it. As such, the output text, when
+        # re-tokenized, must only contain one token
+        streamer_text = cs.out[:-1]  # Remove the final "\n"
+        streamer_text_tokenized = tokenizer(streamer_text, return_tensors="pt")
+        self.assertEqual(streamer_text_tokenized.input_ids.shape, (1, 1))
+
+    def test_iterator_streamer_timeout(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        streamer = TextIteratorStreamer(tokenizer, timeout=0.001)
+        generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        # The streamer will timeout after 0.001 seconds, so an exception will be raised
+        with self.assertRaises(Empty):
+            streamer_text = ""
+            for new_text in streamer:
+                streamer_text += new_text
+
+
+@require_torch
+@pytest.mark.asyncio(loop_scope="class")
+class AsyncStreamerTester(unittest.IsolatedAsyncioTestCase):
+    async def test_async_iterator_streamer_matches_non_streaming(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False)
+        greedy_text = tokenizer.decode(greedy_ids[0])
+
+        streamer = AsyncTextIteratorStreamer(tokenizer)
+        generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        streamer_text = ""
+        async for new_text in streamer:
+            streamer_text += new_text
+
+        self.assertEqual(streamer_text, greedy_text)
+
+    async def test_async_iterator_streamer_timeout(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
+        model.config.eos_token_id = -1
+
+        input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device)
+        streamer = AsyncTextIteratorStreamer(tokenizer, timeout=0.001)
+        generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer}
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        # The streamer will timeout after 0.001 seconds, so TimeoutError will be raised
+        with self.assertRaises(TimeoutError):
+            streamer_text = ""
+            async for new_text in streamer:
+                streamer_text += new_text
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
--- a/tests/kernels/test_kernels.py
+++ b/tests/kernels/test_kernels.py
@@ -0,0 +1,538 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run the test: CUDA_VISIBLE_DEVICES=0 RUN_SLOW=1 pytest -sv tests/kernels/test_kernels.py
+
+
+import copy
+import os
+import tempfile
+import types
+from unittest.mock import MagicMock, patch
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, KernelConfig
+from transformers.integrations.hub_kernels import (
+    _HUB_KERNEL_MAPPING,
+    _KERNEL_MODULE_MAPPING,
+    is_kernel,
+    lazy_load_kernel,
+    load_and_register_attn_kernel,
+)
+from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+from transformers.testing_utils import (
+    TestCasePlus,
+    cleanup,
+    require_kernels,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
+from transformers.utils.import_utils import is_kernels_available
+
+
+if is_kernels_available():
+    import kernels as kernels_pkg
+    from kernels import Device, Mode, kernelize
+
+    import transformers.integrations.hub_kernels as hub_kernels_pkg
+
+
+@require_kernels
+@slow
+class TestHubKernels(TestCasePlus):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_id = "unsloth/Llama-3.2-1B-Instruct"
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_id)
+        cls.model_kernelized = AutoModelForCausalLM.from_pretrained(
+            cls.model_id, use_kernels=True, device_map=torch_device
+        )
+        cls.model_not_kernelized = AutoModelForCausalLM.from_pretrained(
+            cls.model_id, use_kernels=False, device_map=torch_device
+        )
+        cls.input = "Hello"
+
+    @classmethod
+    def tearDownClass(cls):
+        for attr in [
+            "model_kernelized",
+            "model_not_kernelized",
+            "tokenizer",
+        ]:
+            if hasattr(cls, attr):
+                try:
+                    delattr(cls, attr)
+                except Exception as e:
+                    print(f"Could not delete attribute {attr}: {e}")
+
+        # Clear any temporary kernel module cache entries populated by tests
+        try:
+            keys_to_remove = [
+                k for k, v in list(_KERNEL_MODULE_MAPPING.items()) if v is None or isinstance(v, types.ModuleType)
+            ]
+            for k in keys_to_remove:
+                _KERNEL_MODULE_MAPPING.pop(k, None)
+        except Exception as e:
+            print(f"Could not clear kernel module cache: {e}")
+
+    def tearDown(self):
+        # Free accelerator memory/cache and trigger GC
+        cleanup(torch_device, gc_collect=True)
+
+    @require_torch_accelerator
+    def test_forward(self):
+        tokenized_input = self.tokenizer(self.input, return_tensors="pt").input_ids.to(self.model_kernelized.device)
+        output_ = self.model_kernelized.generate(tokenized_input, max_new_tokens=10, do_sample=False)
+        output = self.tokenizer.decode(output_[0], skip_special_tokens=True)
+
+        self.EXPECTED_OUTPUT = set()
+        self.EXPECTED_OUTPUT.add("Hello, I'm looking for a reliable and trustworthy online")
+        self.EXPECTED_OUTPUT.add("Hello! I'm excited to be a part of this")
+
+        self.assertTrue(output in self.EXPECTED_OUTPUT)
+
+    def test_getter_use_kernels(self):
+        self.assertTrue(self.model_kernelized.use_kernels)
+        self.assertFalse(self.model_not_kernelized.use_kernels)
+
+    def assert_kernelized_forward_is_different(self, kernelized_model, not_kernelized_model):
+        """
+        Iterate over modules and check if the forward method is different between
+        the kernelized and not kernelized models. Break on first difference, else continue.
+        Finally, assert that at least one forward is different.
+        """
+        found_difference = False
+        for (name1, module1), (name2, module2) in zip(
+            kernelized_model.named_modules(), not_kernelized_model.named_modules()
+        ):
+            # Only compare modules with the same name
+            if name1 != name2:
+                continue
+            # Check if both modules have a 'forward' attribute
+            if hasattr(module1, "forward") and hasattr(module2, "forward"):
+                # Compare the code objects of the forward methods
+                code1 = getattr(module1.forward, "__code__", None)
+                code2 = getattr(module2.forward, "__code__", None)
+                if code1 is not None and code2 is not None:
+                    if code1 is not code2:
+                        found_difference = True
+                        break
+        self.assertTrue(
+            found_difference,
+            "No module's forward method was different between kernelized and not kernelized models.",
+        )
+
+    def assert_kernelized_forward_is_the_same(self, model_1, model_2):
+        """
+        Iterate over modules and check if the forward method is the same between
+        the kernelized and not kernelized models. Break on first difference, else continue.
+        Finally, assert that at least one forward is the same.
+        """
+        no_difference = True
+        for (name1, module1), (name2, module2) in zip(model_1.named_modules(), model_2.named_modules()):
+            # Only compare modules with the same name
+            if name1 != name2:
+                continue
+            # Check if both modules have a 'forward' attribute
+            if hasattr(module1, "forward") and hasattr(module2, "forward"):
+                # Compare the code objects of the forward methods
+                code1 = getattr(module1.forward, "__code__", None)
+                code2 = getattr(module2.forward, "__code__", None)
+                if code1 is not None and code2 is not None:
+                    if code1 != code2:
+                        no_difference = False
+                        break
+        self.assertTrue(
+            no_difference,
+            "All module's forward methods were the same between the two models",
+        )
+
+    def test_kernelize(self):
+        model = copy.deepcopy(self.model_not_kernelized)
+        kernelize(model, mode=Mode.INFERENCE, device=Device(type=model.device.type))  # type: ignore[arg-type]
+        self.assert_kernelized_forward_is_different(model, self.model_not_kernelized)
+        self.assert_kernelized_forward_is_the_same(model, self.model_kernelized)
+        del model
+
+    def test_setter_use_kernels(self):
+        model = copy.deepcopy(self.model_not_kernelized)
+        model.use_kernels = True
+        self.assertTrue(model.use_kernels)
+        self.assert_kernelized_forward_is_different(model, self.model_not_kernelized)
+        self.assert_kernelized_forward_is_the_same(model, self.model_kernelized)
+        del model
+
+    def test_unkernelize(self):
+        model = copy.deepcopy(self.model_kernelized)
+
+        with self.assertLogs("transformers.modeling_utils", level="WARNING") as cm:
+            model.use_kernels = False
+
+        self.assertTrue(
+            any(
+                "Disabling kernels at runtime is a no-op as there is no 'unkernelize' routine; keeping current kernels active."
+                in msg
+                for msg in cm.output
+            )
+        )
+
+        self.assertFalse(model.use_kernels)
+        del model
+
+    def test_kernels_mapping(self):
+        kernel_config = KernelConfig(kernel_mapping={"RMSNorm": "kernels-community/layer_norm:LlamaRMSNorm"})
+        model = AutoModelForCausalLM.from_pretrained(
+            "unsloth/Llama-3.2-1B-Instruct", use_kernels=True, device_map=torch_device, kernel_config=kernel_config
+        )
+
+        EXPECTED_OUTPUT = set()
+        EXPECTED_OUTPUT.add("Hello, I'm looking for a reliable and trustworthy online")
+
+        tokenized_input = self.tokenizer(self.input, return_tensors="pt").input_ids.to(model.device)
+        output = model.generate(tokenized_input, max_new_tokens=10, do_sample=False)
+        output = self.tokenizer.decode(output[0], skip_special_tokens=True)
+        self.assertTrue(output in EXPECTED_OUTPUT)
+
+        del model
+
+    def test_faulty_kernel_mapping_layer_name(self):
+        kernel_config = KernelConfig(kernel_mapping={"RMSNorm1": "kernels-community/layer_norm:LlamaRMSNorm"})
+        with self.assertRaises(ValueError):
+            _ = AutoModelForCausalLM.from_pretrained(
+                "unsloth/Llama-3.2-1B-Instruct", use_kernels=True, device_map=torch_device, kernel_config=kernel_config
+            )
+
+    def test_faulty_kernel_mapping_type(self):
+        kernel_config = KernelConfig(kernel_mapping={"RMSNorm": 1})
+        with self.assertRaises(ValueError):
+            _ = AutoModelForCausalLM.from_pretrained(
+                "unsloth/Llama-3.2-1B-Instruct", use_kernels=True, device_map=torch_device, kernel_config=kernel_config
+            )
+
+
+@require_kernels
+class TestKernelsEnv(TestCasePlus):
+    def test_disable_hub_kernels(self):
+        with patch.dict(os.environ, {"USE_HUB_KERNELS": "OFF"}):
+            import importlib
+
+            from transformers.integrations import hub_kernels
+
+            importlib.reload(hub_kernels)
+
+            self.assertFalse(hub_kernels._kernels_enabled)
+
+    def test_enable_hub_kernels(self):
+        with patch.dict(os.environ, {"USE_HUB_KERNELS": "ON"}):
+            import importlib
+
+            from transformers.integrations import hub_kernels
+
+            importlib.reload(hub_kernels)
+
+            self.assertTrue(hub_kernels._kernels_enabled)
+
+
+@require_kernels
+class TestKernelUtilities(TestCasePlus):
+    def test_is_kernel_regex(self):
+        valid = [
+            "org/model",
+            "org/model@main",
+            "org/model:my_func",
+            "org/model@v1.2.3:my_func",
+            "flash|org/model@rev:fn",
+        ]
+        invalid = [
+            "org//model",
+            "org/model:too:many",
+            "org/model@rev:fn:extra",
+            "/org/model",
+            "org:model",
+        ]
+        for s in valid:
+            self.assertTrue(is_kernel(s.split("|")[-1]))
+        for s in invalid:
+            self.assertFalse(is_kernel(s))
+
+    def test_lazy_load_kernel_success_and_cache(self):
+        sentinel = types.SimpleNamespace(name="sentinel")
+
+        original_get_kernel = getattr(kernels_pkg, "get_kernel")
+        try:
+
+            def fake_get_kernel(repo_id, revision=None, version=None):
+                self.assertIn(repo_id, {"kernels-community/causal-conv1d"})
+                return sentinel
+
+            setattr(hub_kernels_pkg, "get_kernel", fake_get_kernel)
+            _KERNEL_MODULE_MAPPING.pop("causal-conv1d", None)
+
+            mod1 = lazy_load_kernel("causal-conv1d")
+            self.assertIs(mod1, sentinel)
+            mod2 = lazy_load_kernel("causal-conv1d")
+            self.assertIs(mod2, sentinel)
+        finally:
+            setattr(kernels_pkg, "get_kernel", original_get_kernel)
+            # Ensure cache is cleared to avoid holding onto module references across tests
+            _KERNEL_MODULE_MAPPING.pop("causal-conv1d", None)
+
+    def test_lazy_load_kernel_unknown(self):
+        name = "unknown-kernel-name"
+        _KERNEL_MODULE_MAPPING.pop(name, None)
+        mod = lazy_load_kernel(name)
+        self.assertIsNone(mod)
+        self.assertIn(name, _KERNEL_MODULE_MAPPING)
+        # Cleanup cache entry to avoid growth across tests
+        _KERNEL_MODULE_MAPPING.pop(name, None)
+
+    def test_lazy_load_kernel_version(self):
+        HUB = _HUB_KERNEL_MAPPING
+        name = "causal-conv1d"
+        version_spec = ">=0.0.4,<0.1.0"
+        original_get_kernel = getattr(kernels_pkg, "get_kernel")
+        original_entry = HUB.get(name, None)
+
+        # Use a real ModuleType so caching short-circuits on the second call
+        sentinel_mod = types.ModuleType("sentinel_kernel_module")
+        call_count = {"n": 0}
+
+        try:
+            # Inject dict-style mapping with repo_id and version
+            HUB[name] = {"repo_id": "kernels-community/causal-conv1d", "version": version_spec}  # type: ignore[assignment]
+            _KERNEL_MODULE_MAPPING.pop(name, None)
+
+            def fake_get_kernel(repo_id, revision=None, version=None):
+                call_count["n"] += 1
+                self.assertEqual(repo_id, "kernels-community/causal-conv1d")
+                self.assertIsNone(revision, "revision must not be set when version is provided")
+                self.assertEqual(version, version_spec)
+                return sentinel_mod
+
+            # Patch kernels.get_kernel so lazy_load_kernel picks it up on import
+            setattr(hub_kernels_pkg, "get_kernel", fake_get_kernel)
+
+            # Act
+            mod1 = lazy_load_kernel(name)
+            mod2 = lazy_load_kernel(name)
+
+            # Assert
+            self.assertIs(mod1, sentinel_mod)
+            self.assertIs(mod2, sentinel_mod)
+            self.assertEqual(call_count["n"], 1, "second call should hit the cache")
+        finally:
+            # Restore patched function and mapping to avoid side effects
+            setattr(kernels_pkg, "get_kernel", original_get_kernel)
+            if original_entry is None:
+                HUB.pop(name, None)
+            else:
+                HUB[name] = original_entry
+            _KERNEL_MODULE_MAPPING.pop(name, None)
+
+
+@require_kernels
+class TestAttentionKernelRegistration(TestCasePlus):
+    def test_trust_remote_code_for_attention_kernels(self):
+        """
+        Test that using an untrusted kernel (any repo outside `kernels-community`) as attention requires
+        passing an expplicit `allow_all_kernels=True`
+        """
+        from transformers import LlamaConfig, LlamaModel
+
+        config = LlamaConfig(num_hidden_layers=2, hidden_size=32, intermediate_size=64, vocab_size=100)
+        model = LlamaModel(copy.deepcopy(config))
+        untrusted_kernel = "untrusted/flash_attention_2"
+        trusted_kernel = "kernels-community/flash-attn2"
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname)
+
+            # Test that an untrusted kernel will raise an error without the flag
+            with self.assertRaisesRegex(
+                ValueError,
+                "You need to specify `allow_all_kernels=True` to use kernels outside of the `kernels-community` repository",
+            ):
+                _ = LlamaModel.from_pretrained(tmpdirname, attn_implementation=untrusted_kernel)
+
+            def dummy_lazy_import(*args, **kwargs):
+                pass
+
+            # Test that it works with the flag - though the repo does not exist, so patch the dispatch
+            with patch("transformers.modeling_utils.lazy_import_flash_attention", dummy_lazy_import):
+                model = LlamaModel.from_pretrained(
+                    tmpdirname, attn_implementation=untrusted_kernel, allow_all_kernels=True
+                )
+                self.assertEqual(model.config._attn_implementation, untrusted_kernel)
+
+            # Test that a trusted kernel does not need trust_remote_code
+            model = LlamaModel.from_pretrained(tmpdirname, attn_implementation=trusted_kernel)
+            self.assertEqual(model.config._attn_implementation, trusted_kernel)
+
+    def test_load_and_register_flash_attn_like_kernel(self):
+        kernel_obj = types.SimpleNamespace(flash_attn_varlen_func=lambda *a, **k: None)
+
+        with (
+            patch("transformers.integrations.hub_kernels.get_kernel", return_value=kernel_obj),
+            patch("transformers.modeling_flash_attention_utils.lazy_import_flash_attention", return_value=None),
+        ):
+            attn_impl = "org/model"
+            load_and_register_attn_kernel(attn_impl)
+            self.assertIn(attn_impl, ALL_ATTENTION_FUNCTIONS.valid_keys())
+            # Cleanup registration to avoid leaking functions across tests
+            try:
+                ALL_ATTENTION_FUNCTIONS.pop(attn_impl, None)
+            except Exception as e:
+                print(f"Could not clean up `ALL_ATTENTION_FUNCTIONS`: {e}")
+            try:
+                ALL_MASK_ATTENTION_FUNCTIONS.pop(attn_impl, None)
+            except Exception as e:
+                print(f"Could not clean up `ALL_MASK_ATTENTION_FUNCTIONS`: {e}")
+
+    def test_load_and_register_named_function_kernel(self):
+        def my_attention(*args, **kwargs):
+            return None
+
+        kernel_obj = types.SimpleNamespace(my_func=my_attention)
+        with patch("transformers.integrations.hub_kernels.get_kernel", return_value=kernel_obj):
+            attn_impl = "org/model:my_func"
+            load_and_register_attn_kernel(attn_impl)
+            self.assertIn(attn_impl, ALL_ATTENTION_FUNCTIONS.valid_keys())
+            # Cleanup registration to avoid leaking functions across tests
+            try:
+                ALL_ATTENTION_FUNCTIONS.pop(attn_impl, None)
+            except Exception as e:
+                print(f"Could not clean up `ALL_ATTENTION_FUNCTIONS`: {e}")
+            try:
+                ALL_MASK_ATTENTION_FUNCTIONS.pop(attn_impl, None)
+            except Exception as e:
+                print(f"Could not clean up `ALL_MASK_ATTENTION_FUNCTIONS`: {e}")
+
+
+@require_kernels
+class TestUseKernelsLifecycle(TestCasePlus):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_id = "unsloth/Llama-3.2-1B-Instruct"
+        cls.model = AutoModelForCausalLM.from_pretrained(cls.model_id, use_kernels=False, device_map=torch_device)
+
+    @classmethod
+    def tearDownClass(cls):
+        # Delete large objects to drop references early
+        if hasattr(cls, "model"):
+            try:
+                del cls.model
+            except Exception as e:
+                print(f"Could not delete model: {e}")
+
+    def tearDown(self):
+        # Free accelerator memory/cache and trigger GC
+        cleanup(torch_device, gc_collect=True)
+
+    def test_setting_use_kernels_twice_does_not_rekernelize(self):
+        call_count = {"n": 0}
+
+        def spy_kernelize(*args, **kwargs):
+            call_count["n"] += 1
+
+        with patch.object(kernels_pkg, "kernelize", side_effect=spy_kernelize):
+            self.model.use_kernels = True
+            self.assertTrue(self.model.use_kernels)
+            self.assertEqual(call_count["n"], 1)
+            self.model.use_kernels = True
+            self.assertEqual(call_count["n"], 1)
+
+    def test_train_eval_calls_kernelize_with_correct_mode(self):
+        last_modes = []
+
+        def spy_kernelize(model, device=None, mode=None):
+            last_modes.append(mode)
+
+        with patch.object(kernels_pkg, "kernelize", side_effect=spy_kernelize):
+            self.model.use_kernels = True
+            self.model.train(True)
+            self.assertTrue(any(m == Mode.TRAINING for m in last_modes))
+            self.model.eval()
+            self.assertTrue(any(m == Mode.INFERENCE for m in last_modes))
+
+
+@require_kernels
+class TestKernelMappingDeviceFiltering(TestCasePlus):
+    """Test that kernel mappings correctly filter by current device."""
+
+    def test_multi_device_mapping_filters_correctly(self):
+        """
+        Test that when a kernel_mapping contains multiple devices (cuda, rocm),
+        only the current device's kernel is registered.
+        Regression test for issue where ROCm overwrote CUDA mapping.
+        """
+        kernel_mapping = {
+            "RMSNorm": {
+                "cuda": "kernels-community/layer_norm:LlamaRMSNorm",
+                "rocm": "kernels-community/layer_norm:LlamaRMSNorm",
+            }
+        }
+
+        kernel_config = KernelConfig(kernel_mapping)
+
+        # Create a mock model on CUDA device
+        mock_model = MagicMock()
+        mock_model.training = False
+
+        # Mock parameter with CUDA device
+        mock_param = MagicMock()
+        mock_param.device.type = "cuda"
+        mock_model.parameters.return_value = iter([mock_param])
+
+        # Mock named_modules with RMSNorm layer
+        mock_layer = MagicMock()
+        mock_layer.kernel_layer_name = "RMSNorm"
+        mock_model.named_modules.return_value = [("layers.0", mock_layer)]
+
+        # Trigger the mapping creation
+        kernel_config.create_compatible_mapping(mock_model)
+
+        # Verify results
+        result_mapping = kernel_config.kernel_mapping
+
+        self.assertIn("RMSNorm", result_mapping, "RMSNorm should be in mapping")
+        backends = list(result_mapping["RMSNorm"].keys())
+
+        # Assert only CUDA is present, not ROCm
+        self.assertIn("cuda", backends, "CUDA backend should be registered")
+        self.assertNotIn("rocm", backends, "ROCm backend should NOT be registered on CUDA device")
+
+    def test_single_device_mapping_still_works(self):
+        """
+        Test that single-device mappings continue to work as expected.
+        """
+        kernel_mapping = {"RMSNorm": "kernels-community/layer_norm:LlamaRMSNorm"}
+
+        kernel_config = KernelConfig(kernel_mapping)
+
+        # Create a mock model
+        mock_model = MagicMock()
+        mock_model.training = False
+
+        mock_param = MagicMock()
+        mock_param.device.type = "cuda"
+        mock_model.parameters.return_value = iter([mock_param])
+
+        mock_layer = MagicMock()
+        mock_layer.kernel_layer_name = "RMSNorm"
+        mock_model.named_modules.return_value = [("layers.0", mock_layer)]
+        kernel_config.create_compatible_mapping(mock_model)
+
+        result_mapping = kernel_config.kernel_mapping
+        self.assertIn("RMSNorm", result_mapping, "RMSNorm should be in mapping")
--- a/tests/models/init.py
+++ b/tests/models/init.py
--- a/tests/models/afmoe/init.py
+++ b/tests/models/afmoe/init.py
@@ -0,0 +1 @@
+
--- a/tests/models/afmoe/test_modeling_afmoe.py
+++ b/tests/models/afmoe/test_modeling_afmoe.py
@@ -0,0 +1,200 @@
+# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+
+from transformers import is_torch_available
+from transformers.testing_utils import cleanup, require_torch, require_torch_accelerator, slow, torch_device
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import AfmoeForCausalLM, AfmoeModel, AutoTokenizer
+    from transformers.conversion_mapping import get_model_conversion_mapping
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+
+
+class AfmoeModelTester(CausalLMModelTester):
+    if is_torch_available():
+        base_model_class = AfmoeModel
+
+    def __init__(
+        self,
+        parent,
+        batch_size=4,
+        seq_length=12,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=False,
+        use_labels=True,
+        vocab_size=64,
+        hidden_size=32,
+        intermediate_size=16,
+        moe_intermediate_size=16,
+        num_hidden_layers=2,
+        num_dense_layers=1,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=128,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=False,
+        rope_theta=10000.0,
+        rope_parameters=None,
+        num_experts=4,
+        num_experts_per_tok=2,
+        num_shared_experts=2,
+        route_norm=True,
+        route_scale=1.0,
+        global_attn_every_n_layers=2,
+        sliding_window=128,
+        attention_dropout=0.0,
+    ):
+        super().__init__(
+            parent=parent,
+            batch_size=batch_size,
+            seq_length=seq_length,
+            is_training=is_training,
+            use_input_mask=use_input_mask,
+            use_token_type_ids=use_token_type_ids,
+            use_labels=use_labels,
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            intermediate_size=intermediate_size,
+            hidden_act=hidden_act,
+            max_position_embeddings=max_position_embeddings,
+            initializer_range=initializer_range,
+        )
+        self.use_cache = use_cache
+        self.head_dim = head_dim
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_theta = rope_theta
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_dense_layers = num_dense_layers
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_shared_experts = num_shared_experts
+        self.route_norm = route_norm
+        self.route_scale = route_scale
+        self.global_attn_every_n_layers = global_attn_every_n_layers
+        self.sliding_window = sliding_window
+        self.attention_dropout = attention_dropout
+
+
+@require_torch
+class AfmoeModelTest(CausalLMModelTest, unittest.TestCase):
+    model_tester_class = AfmoeModelTester
+    all_model_classes = (AfmoeModel, AfmoeForCausalLM) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {"feature-extraction": AfmoeModel, "text-generation": AfmoeForCausalLM} if is_torch_available() else {}
+    )
+
+    @unittest.skip("Afmoe applies key/query norm which doesn't work with packing")
+    def test_eager_padding_matches_padding_free_with_position_ids(self):
+        pass
+
+    @unittest.skip("Afmoe  applies key/query norm which doesn't work with packing")
+    def test_sdpa_padding_matches_padding_free_with_position_ids(self):
+        pass
+
+    @unittest.skip("Afmoe  applies key/query norm which doesn't work with packing")
+    def test_model_rope_scaling_frequencies(self):
+        pass
+
+    @unittest.skip("Afmoe has moe, output can be different")
+    def test_model_outputs_equivalence(self, **kwargs):
+        pass
+
+    def test_router_logits_without_aux_loss(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_dense_layers = 0
+        config.output_router_logits = True
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        model = AfmoeForCausalLM(config)
+        model.to(torch_device)
+        model.eval()
+
+        result = model(input_ids, attention_mask=attention_mask)
+        self.assertIsNotNone(result.router_logits)
+        self.assertEqual(result.router_logits[0].shape[-1], config.num_experts)
+        self.assertIsNone(result.aux_loss)
+
+    def test_moe_legacy_conversion_mapping_registered(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        model = AfmoeModel(config)
+        weight_mapping = get_model_conversion_mapping(model)
+        found_fused_expert_converter = any(
+            "mlp.experts.*.gate_proj.weight" in mapping.source_patterns
+            and "mlp.experts.gate_up_proj" in mapping.target_patterns
+            for mapping in weight_mapping
+        )
+        self.assertTrue(found_fused_expert_converter)
+
+
+@require_torch_accelerator
+@slow
+class AfmoeIntegrationTest(unittest.TestCase):
+    def tearDown(self):
+        # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed.
+        cleanup(torch_device, gc_collect=False)
+
+    @slow
+    @require_torch_accelerator
+    @pytest.mark.torch_compile_test
+    def test_compile_static_cache(self):
+        num_tokens_to_generate = 24
+        prompts = [
+            "Simply put, the theory of relativity states that ",
+            "My favorite all time favorite condiment is ketchup.",
+        ]
+        checkpoint = "arcee-ai/trinity-nano-preview"
+
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        model = AfmoeForCausalLM.from_pretrained(checkpoint, device_map=torch_device, dtype=torch.bfloat16)
+        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
+
+        generated_ids = model.generate(**inputs, max_new_tokens=num_tokens_to_generate, do_sample=False)
+        dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=num_tokens_to_generate,
+            do_sample=False,
+            cache_implementation="static",
+        )
+        static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        self.assertEqual(dynamic_text, static_text)
+
+        model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=num_tokens_to_generate,
+            do_sample=False,
+            cache_implementation="static",
+        )
+        static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        self.assertEqual(dynamic_text, static_compiled_text)
--- a/tests/models/aimv2/init.py
+++ b/tests/models/aimv2/init.py
--- a/tests/models/aimv2/test_modeling_aimv2.py
+++ b/tests/models/aimv2/test_modeling_aimv2.py
@@ -0,0 +1,569 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch AIMv2 model."""
+
+import inspect
+import tempfile
+import unittest
+
+import numpy as np
+import requests
+from parameterized import parameterized
+
+from transformers import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig
+from transformers.testing_utils import (
+    is_flaky,
+    require_torch,
+    require_vision,
+    slow,
+    torch_device,
+)
+from transformers.utils import (
+    is_torch_available,
+    is_vision_available,
+)
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
+    ModelTesterMixin,
+    _test_eager_matches_sdpa_inference,
+    floats_tensor,
+    ids_tensor,
+    random_attention_mask,
+)
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+    from transformers import (
+        Aimv2Model,
+        Aimv2TextModel,
+        Aimv2VisionModel,
+    )
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import AutoImageProcessor, AutoProcessor
+
+
+class Aimv2VisionModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        image_size=30,
+        patch_size=2,
+        num_channels=3,
+        is_training=False,
+        hidden_size=32,
+        projection_dim=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+
+        num_patches = (image_size // patch_size) ** 2
+        self.seq_length = num_patches
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def get_config(self):
+        return Aimv2VisionConfig(
+            image_size=self.image_size,
+            patch_size=self.patch_size,
+            num_channels=self.num_channels,
+            hidden_size=self.hidden_size,
+            projection_dim=self.projection_dim,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+        )
+
+    def create_and_check_model(self, config, pixel_values):
+        model = Aimv2VisionModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+class Aimv2ModelTesterMixin(ModelTesterMixin):
+    """
+    Subclass of ModelTesterMixin with methods specific to testing Aimv2 models.
+    The SDPA equivalence test is overridden here because Aimv2 models may have test/vision/text+vision inputs,
+    different output logits, and are not supposed to be used or tested with padding_side="left".
+    """
+
+    def test_sdpa_can_dispatch_composite_models(self):
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                # Load the model with SDPA
+                model_sdpa = model_class.from_pretrained(tmpdirname)
+
+                # Load model with eager attention
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    attn_implementation="eager",
+                )
+                model_eager = model_eager.eval().to(torch_device)
+
+            if hasattr(model_sdpa, "vision_model"):
+                self.assertTrue(model_sdpa.vision_model.config._attn_implementation == "sdpa")
+                self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
+
+            if hasattr(model_sdpa, "text_model"):
+                self.assertTrue(model_sdpa.text_model.config._attn_implementation == "sdpa")
+                self.assertTrue(model_eager.text_model.config._attn_implementation == "eager")
+
+            self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+            self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+
+@require_torch
+class Aimv2VisionModelTest(Aimv2ModelTesterMixin, unittest.TestCase):
+    """
+    Here we also overwrite some of the tests of test_modeling_common.py, as Aimv2 does not use input_ids, inputs_embeds,
+    attention_mask and seq_length.
+    """
+
+    all_model_classes = (Aimv2VisionModel,) if is_torch_available() else ()
+
+    test_resize_embeddings = False
+
+    def setUp(self):
+        self.model_tester = Aimv2VisionModelTester(self)
+        self.config_tester = ConfigTester(
+            self, config_class=Aimv2VisionConfig, has_text_modality=False, hidden_size=32
+        )
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @unittest.skip(reason="Aimv2 does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    def test_model_get_set_embeddings(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+
+class Aimv2TextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        seq_length=7,
+        is_training=False,
+        use_input_mask=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        projection_dim=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        max_position_embeddings=25,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.max_position_embeddings = max_position_embeddings
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        if input_mask is not None:
+            batch_size, seq_length = input_mask.shape
+            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
+            for batch_idx, start_index in enumerate(rnd_start_indices):
+                input_mask[batch_idx, :start_index] = 1
+                input_mask[batch_idx, start_index:] = 0
+
+        config = self.get_config()
+
+        return config, input_ids, input_mask
+
+    def get_config(self):
+        return Aimv2TextConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            projection_dim=self.projection_dim,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+            max_position_embeddings=self.max_position_embeddings,
+        )
+
+    def create_and_check_model(self, config, input_ids, input_mask):
+        model = Aimv2TextModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(input_ids, attention_mask=input_mask)
+            result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, input_mask = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class Aimv2TextModelTest(Aimv2ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (Aimv2TextModel,) if is_torch_available() else ()
+
+    test_resize_embeddings = False
+
+    def setUp(self):
+        self.model_tester = Aimv2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=Aimv2TextConfig, hidden_size=32)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @unittest.skip(reason="Aimv2 does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+
+class Aimv2ModelTester:
+    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=False):
+        if text_kwargs is None:
+            text_kwargs = {}
+        if vision_kwargs is None:
+            vision_kwargs = {}
+
+        self.parent = parent
+        self.text_model_tester = Aimv2TextModelTester(parent, **text_kwargs)
+        self.vision_model_tester = Aimv2VisionModelTester(parent, **vision_kwargs)
+        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
+        self.is_training = is_training
+
+    def prepare_config_and_inputs(self):
+        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
+        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
+
+        config = self.get_config()
+
+        return config, input_ids, attention_mask, pixel_values
+
+    def get_config(self):
+        return Aimv2Config(
+            text_config=self.text_model_tester.get_config(),
+            vision_config=self.vision_model_tester.get_config(),
+            projection_dim=64,
+        )
+
+    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
+        model = Aimv2Model(config).to(torch_device).eval()
+        with torch.no_grad():
+            result = model(input_ids, pixel_values, attention_mask)
+        self.parent.assertEqual(
+            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
+        )
+        self.parent.assertEqual(
+            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask, pixel_values = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "pixel_values": pixel_values,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class Aimv2ModelTest(Aimv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    additional_model_inputs = ["pixel_values"]
+    all_model_classes = (Aimv2Model,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {"feature-extraction": Aimv2Model, "image-feature-extraction": Aimv2VisionModel}
+        if is_torch_available()
+        else {}
+    )
+
+    test_resize_embeddings = False
+    test_attention_outputs = False
+    _is_composite = True
+
+    def setUp(self):
+        self.model_tester = Aimv2ModelTester(self)
+        common_properties = ["projection_dim", "logit_scale_init_value"]
+        self.config_tester = ConfigTester(
+            self, config_class=Aimv2Config, has_text_modality=False, common_properties=common_properties
+        )
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        print(config_and_inputs)
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @unittest.skip(reason="Hidden_states is tested in individual model tests")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="Retain_grad is tested in individual model tests")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @unittest.skip(reason="Aimv2Model does not have input/output embeddings")
+    def test_model_get_set_embeddings(self):
+        pass
+
+    @unittest.skip("Size mismatch on CUDA")
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    def test_load_vision_text_config(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Save Aimv2Config and check if we can load Aimv2VisionConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            vision_config = Aimv2VisionConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
+
+        # Save Aimv2Config and check if we can load Aimv2TextConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            text_config = Aimv2TextConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
+
+    @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
+    @is_flaky(
+        max_attempts=2,
+        description="sdpa gets nan values in some places while eager is fine. Except those places, the values are close",
+    )
+    def test_eager_matches_sdpa_inference(
+        self,
+        name,
+        dtype,
+        padding_side,
+        use_attention_mask,
+        output_attentions,
+        enable_kernels,
+    ):
+        "We need to relax a bit the `atols` for fp32 here due to the altup projections"
+        atols = {
+            ("cpu", False, torch.float32): 1e-6,
+            ("cpu", False, torch.float16): 5e-3,
+            ("cpu", False, torch.bfloat16): 3e-2,  # this was relaxed
+            ("cpu", True, torch.float32): 1e-6,
+            ("cpu", True, torch.float16): 5e-3,
+            ("cpu", True, torch.bfloat16): 3e-2,  # this was relaxed
+            ("cuda", False, torch.float32): 1e-6,
+            ("cuda", False, torch.bfloat16): 3e-2,  # this was relaxed
+            ("cuda", False, torch.float16): 5e-3,
+            ("cuda", True, torch.float32): 1e-6,
+            ("cuda", True, torch.bfloat16): 3e-2,  # this was relaxed
+            ("cuda", True, torch.float16): 5e-3,
+        }
+        _test_eager_matches_sdpa_inference(
+            self, name, dtype, padding_side, use_attention_mask, output_attentions, enable_kernels, atols=atols
+        )
+
+
+@require_vision
+@require_torch
+class Aimv2ModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference(self):
+        model_name = "apple/aimv2-large-patch14-224-lit"
+        model = Aimv2Model.from_pretrained(model_name, device_map=torch_device)
+        processor = AutoProcessor.from_pretrained(model_name)
+
+        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+        inputs = processor(
+            text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
+        ).to(model.device)
+
+        # Forward pass
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # Verify the logits
+        self.assertEqual(
+            outputs.logits_per_image.shape,
+            torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
+        )
+        self.assertEqual(
+            outputs.logits_per_text.shape,
+            torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
+        )
+
+        # handle device
+        expected_logits = torch.tensor([[33.3550, 26.4255]]).to(model.device)
+        torch.testing.assert_close(outputs.logits_per_image, expected_logits, atol=1e-3, rtol=1e-3)
+
+
+@require_vision
+@require_torch
+class Aimv2VisionModelIntegrationTests(unittest.TestCase):
+    @slow
+    def test_inference(self):
+        model_name = "apple/aimv2-large-patch14-224"
+
+        model = Aimv2VisionModel.from_pretrained(model_name, device_map=torch_device)
+        processor = AutoImageProcessor.from_pretrained(model_name)
+
+        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+        inputs = processor(image, return_tensors="pt").to(model.device)
+
+        with torch.no_grad():
+            output = model(**inputs)
+
+        # Verify logits shape
+        self.assertEqual(output.last_hidden_state.shape, torch.Size([1, 256, 1024]))
+
+        # Verify logits slice
+        # fmt: off
+        expected_logits = torch.tensor(
+        [[ 0.0510,  0.0806, -0.0990, -0.0154],
+        [ 2.7850, -2.5143, -0.3320,  2.4196],
+        [ 2.8179, -2.4089, -0.2770,  2.3218],
+        [ 2.7641, -2.4114, -0.3684,  2.2998],
+        [ 2.7972, -2.3180, -0.4490,  2.2302],
+        [ 2.8584, -2.5322, -0.2302,  2.4936],
+        [-2.7849,  2.4121,  1.3670, -1.5514]]).to(model.device)
+        # fmt: on
+
+        output_slice = output.last_hidden_state.squeeze(0)[0:7, 0:4]
+        self.assertTrue(torch.allclose(output_slice, expected_logits, atol=1e-3))
+
+    @slow
+    def test_inference_for_native_resolution(self):
+        model_name = "apple/aimv2-large-patch14-native"
+
+        model = Aimv2VisionModel.from_pretrained(model_name, device_map="auto")
+        processor = AutoImageProcessor.from_pretrained(model_name)
+
+        image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+        inputs = processor(image, return_tensors="pt").to(model.device)
+
+        with torch.no_grad():
+            output = model(**inputs)
+
+        # Verify logits shape
+        self.assertEqual(output.last_hidden_state.shape, torch.Size([1, 1530, 1024]))
+
+        # Verify logits slice
+        # fmt: off
+        expected_logits = torch.tensor(
+        [[-1.3342,  0.3720,  0.0963,  0.4159],
+        [-1.5328,  0.4677,  0.0936,  0.4321],
+        [-0.3775, -0.2758, -0.0803, -0.5367],
+        [-1.3877,  0.5561, -1.9064, -1.1766],
+        [-0.5148,  0.0108, -0.4515, -0.6402],
+        [-0.3400, -0.1711, -0.1855, -0.4219],
+        [-1.2877, -0.0585, -0.1646,  0.7420]]).to(model.device)
+        # fmt: on
+
+        output_slice = output.last_hidden_state.squeeze(0)[0:7, 0:4]
+        self.assertTrue(torch.allclose(output_slice, expected_logits, atol=1e-3))
--- a/tests/models/albert/init.py
+++ b/tests/models/albert/init.py
--- a/tests/models/albert/test_modeling_albert.py
+++ b/tests/models/albert/test_modeling_albert.py
@@ -0,0 +1,327 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import AlbertConfig, is_torch_available
+from transformers.models.auto import get_values
+from transformers.testing_utils import require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        MODEL_FOR_PRETRAINING_MAPPING,
+        AlbertForMaskedLM,
+        AlbertForMultipleChoice,
+        AlbertForPreTraining,
+        AlbertForQuestionAnswering,
+        AlbertForSequenceClassification,
+        AlbertForTokenClassification,
+        AlbertModel,
+    )
+
+
+class AlbertModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=True,
+        use_labels=True,
+        vocab_size=32,
+        embedding_size=8,
+        hidden_size=16,
+        num_hidden_layers=2,
+        # this needs to be the same as `num_hidden_layers`!
+        num_hidden_groups=2,
+        num_attention_heads=4,
+        intermediate_size=20,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=8,
+        type_vocab_size=2,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.embedding_size = embedding_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_hidden_groups = num_hidden_groups
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = self.get_config()
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def get_config(self):
+        return AlbertConfig(
+            vocab_size=self.vocab_size,
+            embedding_size=self.embedding_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            initializer_range=self.initializer_range,
+            num_hidden_groups=self.num_hidden_groups,
+            inner_group_num=1,
+        )
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        result = model(input_ids, token_type_ids=token_type_ids)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def create_and_check_for_pretraining(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertForPreTraining(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            labels=token_labels,
+            sentence_order_label=sequence_labels,
+        )
+        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+        self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, config.num_labels))
+
+    def create_and_check_for_masked_lm(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertForMaskedLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    def create_and_check_for_question_answering(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = AlbertForQuestionAnswering(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )
+        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
+
+    def create_and_check_for_sequence_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = AlbertForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+    def create_and_check_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = AlbertForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        result = model(
+            multiple_choice_inputs_ids,
+            attention_mask=multiple_choice_input_mask,
+            token_type_ids=multiple_choice_token_type_ids,
+            labels=choice_labels,
+        )
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class AlbertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            AlbertModel,
+            AlbertForPreTraining,
+            AlbertForMaskedLM,
+            AlbertForMultipleChoice,
+            AlbertForSequenceClassification,
+            AlbertForTokenClassification,
+            AlbertForQuestionAnswering,
+        )
+        if is_torch_available()
+        else ()
+    )
+    pipeline_model_mapping = (
+        {
+            "feature-extraction": AlbertModel,
+            "fill-mask": AlbertForMaskedLM,
+            "text-classification": AlbertForSequenceClassification,
+            "token-classification": AlbertForTokenClassification,
+            "zero-shot": AlbertForSequenceClassification,
+        }
+        if is_torch_available()
+        else {}
+    )
+
+    # special case for ForPreTraining model
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+
+        if return_labels:
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
+                inputs_dict["labels"] = torch.zeros(
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
+                )
+                inputs_dict["sentence_order_label"] = torch.zeros(
+                    self.model_tester.batch_size, dtype=torch.long, device=torch_device
+                )
+        return inputs_dict
+
+    def setUp(self):
+        self.model_tester = AlbertModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=32)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
+
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        model_name = "albert/albert-base-v1"
+        model = AlbertModel.from_pretrained(model_name)
+        self.assertIsNotNone(model)
+
+
+@require_torch
+class AlbertModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference_no_head_absolute_embedding(self):
+        model = AlbertModel.from_pretrained("albert/albert-base-v2")
+        input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
+        attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
+        with torch.no_grad():
+            output = model(input_ids, attention_mask=attention_mask)[0]
+        expected_shape = torch.Size((1, 11, 768))
+        self.assertEqual(output.shape, expected_shape)
+        expected_slice = torch.tensor(
+            [[[-0.6513, 1.5035, -0.2766], [-0.6515, 1.5046, -0.2780], [-0.6512, 1.5049, -0.2784]]]
+        )
+
+        torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4)
--- a/tests/models/albert/test_tokenization_albert.py
+++ b/tests/models/albert/test_tokenization_albert.py
@@ -0,0 +1,56 @@
+# Copyright 2019 Hugging Face inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import AlbertTokenizer
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers
+from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
+
+from ...test_tokenization_common import TokenizerTesterMixin
+
+
+SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
+
+
+@require_sentencepiece
+@require_tokenizers
+class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "albert/albert-base-v1"
+    tokenizer_class = AlbertTokenizer
+
+    # Integration test data - expected outputs for the default input string
+    integration_expected_tokens = ['▁this', '▁is', '▁a', '▁test', '▁', '😊', '▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁false', '.', '▁', '生活的真谛是', '▁hi', '▁hello', '▁hi', '▁hello', '▁hello', '▁', '<', 's', '>', '▁hi', '<', 's', '>', 'there', '▁the', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁hello', '.', '▁but', '▁i', 'rd', '▁and', '▁', 'ป', '▁i', 'rd', '▁', 'ด', '▁hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_token_ids = [48, 25, 21, 1289, 13, 1, 31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 4997, 9, 13, 1, 4148, 10975, 4148, 10975, 10975, 13, 1, 18, 1, 4148, 1, 18, 1, 1887, 14, 249, 3724, 378, 44, 7428, 13665, 45, 10975, 9, 47, 31, 897, 17, 13, 1, 31, 897, 13, 1, 8409, 184, 50, 42, 845]  # fmt: skip
+    integration_expected_decoded_text = "this is a test <unk> i was born in 92000, and this is false. <unk> hi hello hi hello hello <unk>s<unk> hi<unk>s<unk>there the following string should be properly encoded: hello. but ird and <unk> ird <unk> hey how are you doing"
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
+        from_pretrained_id = "albert/albert-base-v1"
+
+        tokenizer = AlbertTokenizer.from_pretrained(from_pretrained_id)
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.save_pretrained(cls.tmpdirname)
+
+        # Build backend for slow tokenizer from the fast tokenizer's SentencePiece model
+        vocab_file = getattr(tokenizer, "vocab_file", None)
+
+        extractor = SentencePieceExtractor(vocab_file)
+        vocab_ids, vocab_scores, merges = extractor.extract()
+        tokenizer_from_vocab = AlbertTokenizer(vocab=vocab_scores, merges=merges)
+        tokenizer_from_vocab.pad_token = tokenizer_from_vocab.eos_token
+
+        cls.tokenizers = [tokenizer, tokenizer_from_vocab]
--- a/tests/models/align/init.py
+++ b/tests/models/align/init.py
--- a/tests/models/align/test_modeling_align.py
+++ b/tests/models/align/test_modeling_align.py
@@ -0,0 +1,549 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch ALIGN model."""
+
+import inspect
+import math
+import tempfile
+import unittest
+
+import requests
+
+from transformers import AlignConfig, AlignProcessor, AlignTextConfig, AlignVisionConfig
+from transformers.testing_utils import (
+    require_torch,
+    require_vision,
+    slow,
+    torch_device,
+)
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    ModelTesterMixin,
+    floats_tensor,
+    ids_tensor,
+    random_attention_mask,
+)
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        AlignModel,
+        AlignTextModel,
+        AlignVisionModel,
+    )
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+class AlignVisionModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        image_size=32,
+        num_channels=3,
+        depth_coefficient=3.1,
+        kernel_sizes=[3, 3, 5],
+        in_channels=[32, 16, 24],
+        out_channels=[16, 24, 30],
+        hidden_dim=64,
+        strides=[1, 1, 2],
+        num_block_repeats=[1, 1, 2],
+        expand_ratios=[1, 6, 6],
+        is_training=True,
+        hidden_act="gelu",
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.num_channels = num_channels
+        self.depth_coefficient = depth_coefficient
+        self.kernel_sizes = kernel_sizes
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_dim = hidden_dim
+        self.strides = strides
+        self.num_block_repeats = num_block_repeats
+        self.expand_ratios = expand_ratios
+        self.is_training = is_training
+        self.hidden_act = hidden_act
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def get_config(self):
+        return AlignVisionConfig(
+            num_channels=self.num_channels,
+            depth_coefficient=self.depth_coefficient,
+            kernel_sizes=self.kernel_sizes,
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            hidden_dim=self.hidden_dim,
+            strides=self.strides,
+            num_block_repeats=self.num_block_repeats,
+            expand_ratios=self.expand_ratios,
+            hidden_act=self.hidden_act,
+        )
+
+    def create_and_check_model(self, config, pixel_values):
+        model = AlignVisionModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+
+        patch_size = self.image_size // 4
+        self.parent.assertEqual(
+            result.last_hidden_state.shape, (self.batch_size, config.hidden_dim, patch_size, patch_size)
+        )
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, config.hidden_dim))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_torch
+class AlignVisionModelTest(ModelTesterMixin, unittest.TestCase):
+    """
+    Here we also overwrite some of the tests of test_modeling_common.py, as ALIGN does not use input_ids, inputs_embeds,
+    attention_mask and seq_length.
+    """
+
+    all_model_classes = (AlignVisionModel,) if is_torch_available() else ()
+
+    test_resize_embeddings = False
+    has_attentions = False
+
+    def setUp(self):
+        self.model_tester = AlignVisionModelTester(self)
+        self.config_tester = ConfigTester(
+            self,
+            config_class=AlignVisionConfig,
+            has_text_modality=False,
+            hidden_size=32,
+            common_properties=["num_channels", "image_size"],
+        )
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @unittest.skip(reason="AlignVisionModel does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="AlignVisionModel does not use inputs_embeds")
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @unittest.skip(reason="AlignVisionModel does not support input and output embeddings")
+    def test_model_get_set_embeddings(self):
+        pass
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+            num_blocks = sum(config.num_block_repeats) * 4
+            self.assertEqual(len(hidden_states), num_blocks)
+
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [self.model_tester.image_size // 2, self.model_tester.image_size // 2],
+            )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    @unittest.skip
+    def test_training(self):
+        pass
+
+    @unittest.skip
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        model_name = "kakaobrain/align-base"
+        model = AlignVisionModel.from_pretrained(model_name)
+        self.assertIsNotNone(model)
+
+
+class AlignTextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=True,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        config = self.get_config()
+
+        return config, input_ids, token_type_ids, input_mask
+
+    def get_config(self):
+        return AlignTextConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, input_ids, token_type_ids, input_mask):
+        model = AlignTextModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+            result = model(input_ids, token_type_ids=token_type_ids)
+            result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class AlignTextModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (AlignTextModel,) if is_torch_available() else ()
+
+    def setUp(self):
+        self.model_tester = AlignTextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=AlignTextConfig, hidden_size=32)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        pass
+
+    @unittest.skip(reason="ALIGN does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="Align does not use inputs_embeds")
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        model_name = "kakaobrain/align-base"
+        model = AlignTextModel.from_pretrained(model_name)
+        self.assertIsNotNone(model)
+
+
+class AlignModelTester:
+    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
+        if text_kwargs is None:
+            text_kwargs = {}
+        if vision_kwargs is None:
+            vision_kwargs = {}
+
+        self.parent = parent
+        self.text_model_tester = AlignTextModelTester(parent, **text_kwargs)
+        self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs)
+        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
+        self.is_training = is_training
+
+    def prepare_config_and_inputs(self):
+        test_config, input_ids, token_type_ids, input_mask = self.text_model_tester.prepare_config_and_inputs()
+        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
+
+        config = self.get_config()
+
+        return config, input_ids, token_type_ids, input_mask, pixel_values
+
+    def get_config(self):
+        return AlignConfig(
+            text_config=self.text_model_tester.get_config().to_dict(),
+            vision_config=self.vision_model_tester.get_config().to_dict(),
+            projection_dim=64,
+        )
+
+    def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values):
+        model = AlignModel(config).to(torch_device).eval()
+        with torch.no_grad():
+            result = model(input_ids, pixel_values, attention_mask, token_type_ids)
+        self.parent.assertEqual(
+            result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
+        )
+        self.parent.assertEqual(
+            result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, token_type_ids, input_mask, pixel_values = config_and_inputs
+        inputs_dict = {
+            "input_ids": input_ids,
+            "token_type_ids": token_type_ids,
+            "attention_mask": input_mask,
+            "pixel_values": pixel_values,
+            "return_loss": True,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (AlignModel,) if is_torch_available() else ()
+    pipeline_model_mapping = {"feature-extraction": AlignModel} if is_torch_available() else {}
+
+    test_resize_embeddings = False
+    test_attention_outputs = False
+    has_attentions = False
+    skip_test_image_features_output_shape = True  # Align uses index -3 for hidden_size instead of -1
+
+    def setUp(self):
+        self.model_tester = AlignModelTester(self)
+        self.config_tester = ConfigTester(
+            self,
+            config_class=AlignConfig,
+            has_text_modality=False,
+            common_properties=["projection_dim", "temperature_init_value"],
+        )
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
+        super().test_batching_equivalence(atol=atol, rtol=rtol)
+
+    @unittest.skip(reason="Start to fail after using torch `cu118`.")
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    @unittest.skip(reason="Hidden_states is tested in individual model tests")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="Align does not use inputs_embeds")
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @unittest.skip(reason="Retain_grad is tested in individual model tests")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @unittest.skip(reason="AlignModel does not have input/output embeddings")
+    def test_model_get_set_embeddings(self):
+        pass
+
+    def test_load_vision_text_config(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Save AlignConfig and check if we can load AlignVisionConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            vision_config = AlignVisionConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
+
+        # Save AlignConfig and check if we can load AlignTextConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            text_config = AlignTextConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
+
+    def _image_features_get_expected_num_attentions(self, model_tester=None):
+        return sum(
+            math.ceil(self.model_tester.vision_model_tester.depth_coefficient * repeat)
+            for repeat in self.model_tester.vision_model_tester.num_block_repeats
+        )
+
+    @slow
+    def test_model_from_pretrained(self):
+        model_name = "kakaobrain/align-base"
+        model = AlignModel.from_pretrained(model_name)
+        self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    return im
+
+
+@require_vision
+@require_torch
+class AlignModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference(self):
+        model_name = "kakaobrain/align-base"
+        model = AlignModel.from_pretrained(model_name).to(torch_device)
+        processor = AlignProcessor.from_pretrained(model_name)
+
+        image = prepare_img()
+        texts = ["a photo of a cat", "a photo of a dog"]
+        inputs = processor(images=image, text=texts, return_tensors="pt").to(torch_device)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # verify the logits
+        self.assertEqual(
+            outputs.logits_per_image.shape,
+            torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
+        )
+        self.assertEqual(
+            outputs.logits_per_text.shape,
+            torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
+        )
+        expected_logits = torch.tensor([[9.7093, 3.4679]], device=torch_device)
+        torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3)
--- a/tests/models/align/test_processing_align.py
+++ b/tests/models/align/test_processing_align.py
@@ -0,0 +1,69 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers.testing_utils import require_vision
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from transformers import AlignProcessor
+
+
+@require_vision
+class AlignProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = AlignProcessor
+
+    @classmethod
+    def _setup_tokenizer(cls):
+        tokenizer_class = cls._get_component_class_from_processor("tokenizer")
+        vocab_tokens = [
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "[PAD]",
+            "[MASK]",
+            "want",
+            "##want",
+            "##ed",
+            "wa",
+            "un",
+            "runn",
+            "##ing",
+            ",",
+            "low",
+            "lowest",
+        ]
+        vocab_file = f"{cls.tmpdirname}/vocab.txt"
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write("\n".join(vocab_tokens))
+
+        tokenizer = tokenizer_class(vocab_file)
+        return tokenizer
+
+    @classmethod
+    def _setup_image_processor(cls):
+        image_processor_class = cls._get_component_class_from_processor("image_processor")
+
+        image_processor = image_processor_class(
+            do_resize=True,
+            size=20,
+            do_normalize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+        )
+        return image_processor
--- a/tests/models/altclip/init.py
+++ b/tests/models/altclip/init.py
--- a/tests/models/altclip/test_modeling_altclip.py
+++ b/tests/models/altclip/test_modeling_altclip.py
@@ -0,0 +1,545 @@
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch AltCLIP model."""
+
+import inspect
+import unittest
+
+import numpy as np
+import requests
+from parameterized import parameterized
+
+from transformers import AltCLIPConfig, AltCLIPProcessor, AltCLIPTextConfig, AltCLIPVisionConfig
+from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
+    ModelTesterMixin,
+    floats_tensor,
+    ids_tensor,
+    random_attention_mask,
+)
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+
+    from transformers import AltCLIPModel, AltCLIPTextModel, AltCLIPVisionModel
+
+if is_vision_available():
+    from PIL import Image
+
+
+class AltCLIPVisionModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        image_size=30,
+        patch_size=2,
+        num_channels=3,
+        is_training=True,
+        hidden_size=32,
+        projection_dim=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        initializer_range=0.02,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
+        num_patches = (image_size // patch_size) ** 2
+        self.seq_length = num_patches + 1
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def get_config(self):
+        return AltCLIPVisionConfig(
+            image_size=self.image_size,
+            patch_size=self.patch_size,
+            num_channels=self.num_channels,
+            hidden_size=self.hidden_size,
+            projection_dim=self.projection_dim,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, pixel_values):
+        model = AltCLIPVisionModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
+        image_size = (self.image_size, self.image_size)
+        patch_size = (self.patch_size, self.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_torch
+class AltCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase):
+    """
+    Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
+    attention_mask and seq_length.
+    """
+
+    all_model_classes = (AltCLIPVisionModel,) if is_torch_available() else ()
+
+    test_resize_embeddings = False
+
+    def setUp(self):
+        self.model_tester = AltCLIPVisionModelTester(self)
+        self.config_tester = ConfigTester(
+            self, config_class=AltCLIPVisionConfig, has_text_modality=False, hidden_size=36
+        )
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @unittest.skip(reason="CLIP does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    def test_model_get_set_embeddings(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        pass
+
+    @unittest.skip(reason="AltCLIPVisionModel use the same cv backbone with CLIP model.")
+    def test_model_from_pretrained(self):
+        pass
+
+
+class AltCLIPTextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        projection_dim=32,
+        project_dim=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        max_position_embeddings=512,
+        initializer_range=0.02,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.projection_dim = projection_dim
+        self.project_dim = project_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        if input_mask is not None:
+            batch_size, seq_length = input_mask.shape
+            rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
+            for batch_idx, start_index in enumerate(rnd_start_indices):
+                input_mask[batch_idx, :start_index] = 1
+                input_mask[batch_idx, start_index:] = 0
+
+        config = self.get_config()
+
+        return config, input_ids, input_mask
+
+    def get_config(self):
+        return AltCLIPTextConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            projection_dim=self.projection_dim,
+            project_dim=self.project_dim,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+            max_position_embeddings=self.max_position_embeddings,
+            initializer_range=self.initializer_range,
+            pad_token_id=1,
+        )
+
+    def create_and_check_model(self, config, input_ids, input_mask):
+        model = AltCLIPTextModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(input_ids, attention_mask=input_mask)
+            result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, input_mask = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (AltCLIPTextModel,) if is_torch_available() else ()
+    # AltCLIPTextModel has large embeddings relative to model size, so we need higher split percentages
+    model_split_percents = [0.5, 0.8, 0.9]
+
+    # TODO (@SunMarc): Fix me
+    @unittest.skip(reason="It's broken.")
+    def test_resize_tokens_embeddings(self):
+        super().test_resize_tokens_embeddings()
+
+    def setUp(self):
+        self.model_tester = AltCLIPTextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=AltCLIPTextConfig, hidden_size=32)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        pass
+
+    def test_model_outputs_equivalence(self):
+        pass
+
+    @unittest.skip(reason="Result of the model is a dict")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="AltCLIP does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        model_name = "BAAI/AltCLIP"
+        model = AltCLIPTextModel.from_pretrained(model_name)
+        self.assertIsNotNone(model)
+
+
+class AltCLIPModelTester:
+    def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
+        if text_kwargs is None:
+            text_kwargs = {}
+        if vision_kwargs is None:
+            vision_kwargs = {}
+
+        self.parent = parent
+        self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs)
+        self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs)
+        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
+        self.is_training = is_training
+
+    def prepare_config_and_inputs(self):
+        text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
+        vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
+
+        config = self.get_config()
+        return config, input_ids, attention_mask, pixel_values
+
+    def get_config(self):
+        return AltCLIPConfig(
+            text_config=self.text_model_tester.get_config().to_dict(),
+            vision_config=self.vision_model_tester.get_config().to_dict(),
+            projection_dim=64,
+        )
+
+    def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
+        model = AltCLIPModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            model(input_ids, pixel_values, attention_mask)
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask, pixel_values = config_and_inputs
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "pixel_values": pixel_values,
+            "return_loss": True,
+        }
+        return config, inputs_dict
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    return im
+
+
+@require_torch
+class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (AltCLIPModel,) if is_torch_available() else ()
+    pipeline_model_mapping = {"feature-extraction": AltCLIPModel} if is_torch_available() else {}
+
+    test_resize_embeddings = False
+    test_attention_outputs = False
+    additional_model_inputs = ["pixel_values"]
+
+    # TODO: Fix the failed tests when this model gets more usage
+    def is_pipeline_test_to_skip(
+        self,
+        pipeline_test_case_name,
+        config_class,
+        model_architecture,
+        tokenizer_name,
+        image_processor_name,
+        feature_extractor_name,
+        processor_name,
+    ):
+        if pipeline_test_case_name == "FeatureExtractionPipelineTests":
+            return True
+
+        return False
+
+    def setUp(self):
+        self.model_tester = AltCLIPModelTester(self)
+        self.config_tester = ConfigTester(
+            self,
+            config_class=AltCLIPConfig,
+            has_text_modality=False,
+            common_properties=["projection_dim", "logit_scale_init_value"],
+        )
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
+    @slow
+    @is_flaky()
+    def test_eager_matches_sdpa_inference(self, *args):
+        # adding only flaky decorator here and call the parent test method
+        return getattr(ModelTesterMixin, self._testMethodName)(self)
+
+    @unittest.skip(reason="Hidden_states is tested in individual model tests")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="Retain_grad is tested in individual model tests")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @unittest.skip(reason="CLIPModel does not have input/output embeddings")
+    def test_model_get_set_embeddings(self):
+        pass
+
+    @slow
+    def test_model_from_pretrained(self):
+        model_name = "BAAI/AltCLIP"
+        model = AltCLIPModel.from_pretrained(model_name)
+        self.assertIsNotNone(model)
+
+
+@require_vision
+@require_torch
+class AltCLIPModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference(self):
+        model_name = "BAAI/AltCLIP"
+        model = AltCLIPModel.from_pretrained(model_name).to(torch_device)
+        processor = AltCLIPProcessor.from_pretrained(model_name)
+
+        image = prepare_img()
+        inputs = processor(text=["一张猫的照片", "一张狗的照片"], images=image, padding=True, return_tensors="pt").to(torch_device)  # fmt: skip
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # verify the logits
+        self.assertEqual(
+            outputs.logits_per_image.shape,
+            torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
+        )
+        self.assertEqual(
+            outputs.logits_per_text.shape,
+            torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
+        )
+
+        probs = outputs.logits_per_image.softmax(dim=1)
+        expected_probs = torch.tensor([[9.9942e-01, 5.7805e-04]], device=torch_device)
+
+        torch.testing.assert_close(probs, expected_probs, rtol=5e-3, atol=5e-3)
+
+    @slow
+    def test_inference_interpolate_pos_encoding(self):
+        # ViT models have an `interpolate_pos_encoding` argument in their forward method,
+        # allowing to interpolate the pre-trained position embeddings in order to use
+        # the model on higher resolutions. The DINO model by Facebook AI leverages this
+        # to visualize self-attention on higher resolution images.
+        model_name = "BAAI/AltCLIP"
+        model = AltCLIPModel.from_pretrained(model_name).to(torch_device)
+
+        image_processor = AltCLIPProcessor.from_pretrained(
+            model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
+        )
+
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
+
+        # interpolate_pos_encodiung false should return value error
+        with self.assertRaises(ValueError, msg="doesn't match model"):
+            with torch.no_grad():
+                model(**inputs, interpolate_pos_encoding=False)
+
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs, interpolate_pos_encoding=True)
+
+        # verify the logits
+        expected_shape = torch.Size((1, 145, 1024))
+        print("nilesh ")
+        print(outputs.vision_model_output.last_hidden_state.shape)
+        print(outputs.vision_model_output.last_hidden_state[0, :3, :3])
+
+        self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [
+                [-0.3577, -0.5977, 0.3555],
+                [0.4544, 0.1660, 0.6583],
+                [1.1715, -0.4870, 0.1645],
+            ]
+        ).to(torch_device)
+
+        torch.testing.assert_close(
+            outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4
+        )
--- a/tests/models/altclip/test_processing_altclip.py
+++ b/tests/models/altclip/test_processing_altclip.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import AltCLIPProcessor
+from transformers.testing_utils import require_vision
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+@require_vision
+class AltClipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = AltCLIPProcessor
+    model_id = "BAAI/AltCLIP"
--- a/tests/models/apertus/init.py
+++ b/tests/models/apertus/init.py
--- a/tests/models/apertus/test_modeling_apertus.py
+++ b/tests/models/apertus/test_modeling_apertus.py
@@ -0,0 +1,66 @@
+# Copyright 2025 The HuggingFace Inc. team and the Swiss AI Initiative. All rights reserved.
+#
+# This code is based on HuggingFace's LLaMA implementation in this library.
+# It has been modified from its original forms to accommodate minor architectural
+# differences compared to LLaMA used by the Swiss AI Initiative that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Apertus model."""
+
+import unittest
+
+from transformers import is_torch_available
+from transformers.testing_utils import (
+    require_torch,
+    require_torch_accelerator,
+    slow,
+)
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+
+
+if is_torch_available():
+    from transformers import (
+        ApertusForCausalLM,
+        ApertusModel,
+    )
+
+
+class ApertusModelTester(CausalLMModelTester):
+    if is_torch_available():
+        base_model_class = ApertusModel
+
+    def __init__(self, parent):
+        super().__init__(parent=parent)
+        # NOTE(3outeille): must be 0.0 for TP backward tests. In train mode, non-zero dropout causes
+        # different RNG states between the non-TP and TP model forward passes (they run sequentially),
+        # leading to different dropout masks and mismatched losses.
+        self.attention_probs_dropout_prob = 0.0
+
+
+@require_torch
+class ApertusModelTest(CausalLMModelTest, unittest.TestCase):
+    model_tester_class = ApertusModelTester
+
+    # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
+    # This is because we are hitting edge cases with the causal_mask buffer
+    model_split_percents = [0.5, 0.7, 0.8]
+
+    # used in `test_torch_compile_for_training`
+    _torch_compile_train_cls = ApertusForCausalLM if is_torch_available() else None
+
+
+@require_torch_accelerator
+@slow
+class ApertusIntegrationTest(unittest.TestCase):
+    pass
--- a/tests/models/arcee/init.py
+++ b/tests/models/arcee/init.py
--- a/tests/models/arcee/test_modeling_arcee.py
+++ b/tests/models/arcee/test_modeling_arcee.py
@@ -0,0 +1,124 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Arcee model."""
+
+import unittest
+
+from pytest import mark
+
+from transformers import AutoTokenizer, is_torch_available
+from transformers.testing_utils import (
+    require_flash_attn,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+)
+
+from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        ArceeConfig,
+        ArceeForCausalLM,
+        ArceeModel,
+    )
+
+
+class ArceeModelTester(CausalLMModelTester):
+    if is_torch_available():
+        base_model_class = ArceeModel
+
+
+@require_torch
+class ArceeModelTest(CausalLMModelTest, unittest.TestCase):
+    model_tester_class = ArceeModelTester
+
+    # Need to use `0.8` instead of `0.9` for `test_cpu_offload`
+    # This is because we are hitting edge cases with the causal_mask buffer
+    model_split_percents = [0.5, 0.7, 0.8]
+
+    # used in `test_torch_compile_for_training`
+    _torch_compile_train_cls = ArceeForCausalLM if is_torch_available() else None
+
+    def test_arcee_mlp_uses_relu_squared(self):
+        """Test that ArceeMLP uses ReLU² activation instead of SiLU."""
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        config.hidden_act = "relu2"  # Ensure we're using relu2 activation
+        model = ArceeModel(config)
+
+        # Check that the MLP layers use the correct activation
+        mlp = model.layers[0].mlp
+        # Test with a simple input
+        x = torch.randn(1, 10, config.hidden_size)
+        up_output = mlp.up_proj(x)
+
+        # Verify ReLU² activation: x * relu(x)
+        expected_activation = up_output * torch.relu(up_output)
+        actual_activation = mlp.act_fn(up_output)
+
+        self.assertTrue(torch.allclose(expected_activation, actual_activation, atol=1e-5))
+
+
+@require_torch_accelerator
+class ArceeIntegrationTest(unittest.TestCase):
+    def tearDown(self):
+        import gc
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @slow
+    def test_model_from_pretrained(self):
+        # This test would be enabled once a pretrained model is available
+        # For now, we just test that the model can be instantiated
+        config = ArceeConfig()
+        model = ArceeForCausalLM(config)
+        self.assertIsInstance(model, ArceeForCausalLM)
+
+    @mark.skip(reason="Model is not currently public - will update test post release")
+    @slow
+    def test_model_generation(self):
+        EXPECTED_TEXT_COMPLETION = (
+            """Once upon a time,In a village there was a farmer who had three sons. The farmer was very old and he"""
+        )
+        prompt = "Once upon a time"
+        tokenizer = AutoTokenizer.from_pretrained("arcee-ai/model-id")
+        model = ArceeForCausalLM.from_pretrained("arcee-ai/model-id", device_map="auto")
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
+
+        generated_ids = model.generate(input_ids, max_new_tokens=20)
+        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
+
+    @mark.skip(reason="Model is not currently public - will update test post release")
+    @slow
+    @require_flash_attn
+    @mark.flash_attn_test
+    def test_model_generation_flash_attn(self):
+        EXPECTED_TEXT_COMPLETION = (
+            " the food, the people, and the overall experience. I would definitely recommend this place to others."
+        )
+        prompt = "This is a nice place. " * 1024 + "I really enjoy the scenery,"
+        tokenizer = AutoTokenizer.from_pretrained("arcee-ai/model-id")
+        model = ArceeForCausalLM.from_pretrained(
+            "arcee-ai/model-id", device_map="auto", attn_implementation="flash_attention_2", dtype="auto"
+        )
+        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
+
+        generated_ids = model.generate(input_ids, max_new_tokens=20)
+        text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        self.assertEqual(EXPECTED_TEXT_COMPLETION, text[len(prompt) :])
--- a/tests/models/aria/init.py
+++ b/tests/models/aria/init.py
--- a/tests/models/aria/test_image_processing_aria.py
+++ b/tests/models/aria/test_image_processing_aria.py
@@ -0,0 +1,304 @@
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+
+from transformers.image_utils import PILImageResampling
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin
+
+
+if is_vision_available():
+    from PIL import Image
+
+if is_torch_available():
+    import torch
+
+
+class AriaImageProcessingTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        num_images=1,
+        min_resolution=30,
+        max_resolution=40,
+        size=None,
+        max_image_size=980,
+        min_image_size=336,
+        split_resolutions=None,
+        split_image=True,
+        do_normalize=True,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5],
+        do_convert_rgb=True,
+        resample=PILImageResampling.BICUBIC,
+    ):
+        self.size = size if size is not None else {"longest_edge": max_resolution}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.num_images = num_images
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.resample = resample
+        self.max_image_size = max_image_size
+        self.min_image_size = min_image_size
+        self.split_resolutions = split_resolutions if split_resolutions is not None else [[980, 980]]
+        self.split_image = split_image
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_image_processor_dict(self):
+        return {
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "max_image_size": self.max_image_size,
+            "min_image_size": self.min_image_size,
+            "split_resolutions": self.split_resolutions,
+            "split_image": self.split_image,
+            "do_convert_rgb": self.do_convert_rgb,
+            "do_normalize": self.do_normalize,
+            "resample": self.resample,
+        }
+
+    def get_expected_values(self, image_inputs, batched=False):
+        """
+        This function computes the expected height and width when providing images to AriaImageProcessor,
+        assuming do_resize is set to True. The expected size in that case the max image size.
+        """
+        return self.max_image_size, self.max_image_size
+
+    def expected_output_image_shape(self, images):
+        height, width = self.get_expected_values(images, batched=True)
+        return self.num_channels, height, width
+
+    def prepare_image_inputs(
+        self,
+        batch_size=None,
+        min_resolution=None,
+        max_resolution=None,
+        num_channels=None,
+        num_images=None,
+        size_divisor=None,
+        equal_resolution=False,
+        numpify=False,
+        torchify=False,
+    ):
+        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
+        or a list of PyTorch tensors if one specifies torchify=True.
+
+        One can specify whether the images are of the same resolution or not.
+        """
+        assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
+
+        batch_size = batch_size if batch_size is not None else self.batch_size
+        min_resolution = min_resolution if min_resolution is not None else self.min_resolution
+        max_resolution = max_resolution if max_resolution is not None else self.max_resolution
+        num_channels = num_channels if num_channels is not None else self.num_channels
+        num_images = num_images if num_images is not None else self.num_images
+
+        images_list = []
+        for i in range(batch_size):
+            images = []
+            for j in range(num_images):
+                if equal_resolution:
+                    width = height = max_resolution
+                else:
+                    # To avoid getting image width/height 0
+                    if size_divisor is not None:
+                        # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
+                        min_resolution = max(size_divisor, min_resolution)
+                    width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
+                images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
+            images_list.append(images)
+
+        if not numpify and not torchify:
+            # PIL expects the channel dimension as last dimension
+            images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
+
+        if torchify:
+            images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
+
+        if numpify:
+            # Numpy images are typically in channels last format
+            images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list]
+
+        return images_list
+
+
+@require_torch
+@require_vision
+class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = AriaImageProcessingTester(self)
+
+    @property
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def test_image_processor_properties(self):
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
+            self.assertTrue(hasattr(image_processing, "max_image_size"))
+            self.assertTrue(hasattr(image_processing, "min_image_size"))
+            self.assertTrue(hasattr(image_processing, "do_normalize"))
+            self.assertTrue(hasattr(image_processing, "image_mean"))
+            self.assertTrue(hasattr(image_processing, "image_std"))
+            self.assertTrue(hasattr(image_processing, "split_image"))
+
+    def test_call_numpy(self):
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random numpy tensors
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+            for sample_images in image_inputs:
+                for image in sample_images:
+                    self.assertIsInstance(image, np.ndarray)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_numpy_4_channels(self):
+        # Aria always processes images as RGB, so it always returns images with 3 channels
+        for image_processing_class in self.image_processing_classes.values():
+            image_processor_dict = self.image_processor_dict
+            image_processing = image_processing_class(**image_processor_dict)
+            # create random numpy tensors
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+
+            for sample_images in image_inputs:
+                for image in sample_images:
+                    self.assertIsInstance(image, np.ndarray)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_pil(self):
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random PIL images
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
+            for images in image_inputs:
+                for image in images:
+                    self.assertIsInstance(image, Image.Image)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            self.assertEqual(
+                tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
+            )
+
+    def test_call_pytorch(self):
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # create random PyTorch tensors
+            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
+
+            for images in image_inputs:
+                for image in images:
+                    self.assertIsInstance(image, torch.Tensor)
+
+            # Test not batched input
+            encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
+            self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
+
+            # Test batched
+            expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+            encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
+            self.assertEqual(
+                tuple(encoded_images.shape),
+                (self.image_processor_tester.batch_size, *expected_output_image_shape),
+            )
+
+    def test_pad_for_patching(self):
+        for backend_name, image_processing_class in self.image_processing_classes.items():
+            numpify = backend_name == "pil"
+            torchify = backend_name == "torchvision"
+            image_processing = image_processing_class(**self.image_processor_dict)
+            # Create odd-sized images
+            image_input = self.image_processor_tester.prepare_image_inputs(
+                batch_size=1,
+                max_resolution=400,
+                num_images=1,
+                equal_resolution=True,
+                numpify=numpify,
+                torchify=torchify,
+            )[0][0]
+            self.assertIn(image_input.shape, [(3, 400, 400), (400, 400, 3)])
+
+            # Both backends use channels-first internally; transpose if numpify returned HWC
+            if numpify:
+                image_input = image_input.transpose(2, 0, 1)
+
+            # Test odd-width
+            image_shape = (400, 601)
+            encoded_images = image_processing._pad_for_patching(image_input, image_shape)
+            self.assertEqual(encoded_images.shape[-2:], image_shape)
+
+            # Test odd-height
+            image_shape = (503, 400)
+            encoded_images = image_processing._pad_for_patching(image_input, image_shape)
+            self.assertEqual(encoded_images.shape[-2:], image_shape)
+
+    def test_get_num_patches_without_images(self):
+        for image_processing_class in self.image_processing_classes.values():
+            image_processing = image_processing_class(**self.image_processor_dict)
+            num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
+            self.assertEqual(num_patches, 1)
+
+            num_patches = image_processing.get_number_of_image_patches(
+                height=300, width=500, images_kwargs={"split_image": True}
+            )
+            self.assertEqual(num_patches, 1)
+
+            num_patches = image_processing.get_number_of_image_patches(
+                height=100, width=100, images_kwargs={"split_image": True, "max_image_size": 200}
+            )
+            self.assertEqual(num_patches, 19)
--- a/tests/models/aria/test_modeling_aria.py
+++ b/tests/models/aria/test_modeling_aria.py
@@ -0,0 +1,522 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Aria model."""
+
+import unittest
+
+import pytest
+import requests
+
+from transformers import (
+    AriaConfig,
+    AriaForConditionalGeneration,
+    AriaModel,
+    AriaTextConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.models.idefics3 import Idefics3VisionConfig
+from transformers.testing_utils import (
+    Expectations,
+    cleanup,
+    require_bitsandbytes,
+    require_torch,
+    require_torch_large_accelerator,
+    require_vision,
+    slow,
+    torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+
+if is_vision_available():
+    from PIL import Image
+
+# Used to be https://aria-vl.github.io/static/images/view.jpg but it was removed, llava-vl has the same image
+IMAGE_OF_VIEW_URL = "https://llava-vl.github.io/static/images/view.jpg"
+
+
+class AriaVisionText2TextModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        num_channels=3,
+        image_size=16,
+        num_image_tokens=4,
+        ignore_index=-100,
+        image_token_index=9,
+        projector_hidden_act="gelu",
+        seq_length=7,
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-1,
+        text_config=AriaTextConfig(
+            seq_length=7,
+            is_training=True,
+            use_input_mask=True,
+            use_token_type_ids=False,
+            use_labels=True,
+            hidden_act="gelu",
+            hidden_dropout_prob=0.1,
+            attention_probs_dropout_prob=0.1,
+            type_vocab_size=16,
+            type_sequence_label_size=2,
+            initializer_range=0.02,
+            num_labels=3,
+            num_choices=4,
+            pad_token_id=1,
+            hidden_size=32,
+            intermediate_size=16,
+            max_position_embeddings=60,
+            model_type="aria_moe_lm",
+            moe_intermediate_size=4,
+            moe_num_experts=3,
+            moe_topk=2,
+            num_attention_heads=2,
+            num_experts_per_tok=3,
+            num_hidden_layers=2,
+            num_key_value_heads=2,
+            rope_theta=5000000,
+            vocab_size=99,
+            eos_token_id=2,
+            head_dim=4,
+        ),
+        is_training=True,
+        vision_config=Idefics3VisionConfig(
+            image_size=16,
+            patch_size=8,
+            num_channels=3,
+            is_training=True,
+            hidden_size=32,
+            projection_dim=4,
+            num_hidden_layers=2,
+            num_attention_heads=2,
+            intermediate_size=4,
+            dropout=0.1,
+            attention_dropout=0.1,
+            initializer_range=0.02,
+        ),
+    ):
+        self.parent = parent
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.pad_token_id = text_config.pad_token_id
+        self.eos_token_id = text_config.eos_token_id
+        self.num_hidden_layers = text_config.num_hidden_layers
+        self.vocab_size = text_config.vocab_size
+        self.hidden_size = text_config.hidden_size
+        self.num_attention_heads = text_config.num_attention_heads
+        self.is_training = is_training
+
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.num_image_tokens = num_image_tokens
+        self.seq_length = seq_length + self.num_image_tokens
+        self.projector_patch_to_query_dict = {
+            vision_config.image_size**2 // vision_config.patch_size**2: vision_config.projection_dim
+        }
+
+    def get_config(self):
+        return AriaConfig(
+            text_config=self.text_config.to_dict(),
+            vision_config=self.vision_config.to_dict(),
+            ignore_index=self.ignore_index,
+            image_token_index=self.image_token_index,
+            projector_hidden_act=self.projector_hidden_act,
+            vision_feature_select_strategy=self.vision_feature_select_strategy,
+            vision_feature_layer=self.vision_feature_layer,
+            eos_token_id=self.eos_token_id,
+            projector_patch_to_query_dict=self.projector_patch_to_query_dict,
+        )
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.vision_config.num_channels,
+                self.vision_config.image_size,
+                self.vision_config.image_size,
+            ]
+        )
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
+        attention_mask = input_ids.ne(1).to(torch_device)
+        input_ids[input_ids == config.image_token_index] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = config.image_token_index
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    """
+    Model tester for `AriaForConditionalGeneration`.
+    """
+
+    all_model_classes = (AriaModel, AriaForConditionalGeneration) if is_torch_available() else ()
+
+    _is_composite = True
+
+    def setUp(self):
+        self.model_tester = AriaVisionText2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=AriaConfig, has_text_modality=False)
+
+    @pytest.mark.xfail(
+        reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm"
+    )
+    def test_training_gradient_checkpointing(self):
+        super().test_training_gradient_checkpointing()
+
+    @pytest.mark.xfail(
+        reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        super().test_training_gradient_checkpointing_use_reentrant_false()
+
+    @pytest.mark.xfail(
+        reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        super().test_training_gradient_checkpointing_use_reentrant_true()
+
+
+SKIP = False
+torch_accelerator_module = getattr(torch, torch_device)
+memory = 23  # skip on T4 / A10
+if hasattr(torch_accelerator_module, "get_device_properties"):
+    if torch_accelerator_module.get_device_properties(0).total_memory / 1024**3 < memory:
+        SKIP = True
+
+
+@unittest.skipIf(SKIP, reason="A10 doesn't have enough GPU memory for this tests")
+@require_torch
+@slow
+class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
+        cleanup(torch_device, gc_collect=True)
+
+    def tearDown(self):
+        cleanup(torch_device, gc_collect=True)
+
+    @require_torch_large_accelerator
+    @require_bitsandbytes
+    def test_small_model_integration_test(self):
+        # Let's make sure we test the preprocessing to replace what is used
+        model = AriaForConditionalGeneration.from_pretrained(
+            "rhymes-ai/Aria",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
+        )
+
+        prompt = "<|img|>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
+        raw_image = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
+        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device, model.dtype)
+
+        non_img_tokens = [
+            109, 3905, 2000, 93415, 4551, 1162, 901, 3894, 970, 2478, 1017, 19312, 2388, 1596, 1809, 970, 5449, 1235,
+            3333, 93483, 109, 61081, 11984, 14800, 93415
+        ]  # fmt: skip
+        EXPECTED_INPUT_IDS = torch.tensor([[9] * 256 + non_img_tokens]).to(inputs["input_ids"].device)
+        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
+
+        output = model.generate(**inputs, max_new_tokens=20)
+        decoded_output = self.processor.decode(output[0], skip_special_tokens=True)
+
+        expected_output = Expectations(
+            {
+                (
+                    "cuda",
+                    None,
+                ): "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,",
+                (
+                    "rocm",
+                    (9, 5),
+                ): "\n USER: What are the things I should be cautious about when I visit this place?\n ASSISTANT: When you visit this place, you should be cautious about the following things:\n\n- The",
+            }
+        ).get_expectation()
+        self.assertEqual(decoded_output, expected_output)
+
+    @require_torch_large_accelerator
+    @require_bitsandbytes
+    def test_small_model_integration_test_llama_single(self):
+        # Let's make sure we test the preprocessing to replace what is used
+        model_id = "rhymes-ai/Aria"
+
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
+        )
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        prompt = "USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? ASSISTANT:"
+        raw_image = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
+        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device, model.dtype)
+
+        output = model.generate(**inputs, max_new_tokens=90, do_sample=False)
+        EXPECTED_DECODED_TEXT = Expectations(
+            {
+                ("cuda", (8, 0)): "USER: \n What are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this beautiful location, it's important to be mindful of a few things to ensure both your safety and the preservation of the environment. Firstly, always be cautious when walking on the wooden pier, as it can be slippery, especially during or after rain. Secondly, be aware of the local wildlife and do not feed or disturb them. Lastly, respect the natural surroundings by not littering and sticking to",
+                ("rocm", (9, 5)): "USER: \n What are the things I should be cautious about when I visit this place? ASSISTANT: \n\nWhen visiting this place, you should be cautious about the following:\n\n1. **Weather Conditions**: The weather can be unpredictable, so it's important to check the forecast and dress in layers. Sudden changes in weather can occur, so be prepared for rain or cold temperatures.\n\n2. **Safety on the Dock**: The dock may be slippery, especially when",
+            }
+        ).get_expectation()  # fmt: off
+
+        decoded_output = processor.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        self.assertEqual(
+            decoded_output,
+            EXPECTED_DECODED_TEXT,
+            f"Expected: {repr(EXPECTED_DECODED_TEXT)}\nActual: {repr(decoded_output)}",
+        )
+
+    @require_torch_large_accelerator
+    @require_bitsandbytes
+    def test_small_model_integration_test_llama_batched(self):
+        # Let's make sure we test the preprocessing to replace what is used
+        model_id = "rhymes-ai/Aria"
+
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
+        )
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        prompts = [
+            "USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:",
+            "USER: <|img|>\nWhat is this? ASSISTANT:",
+        ]
+        image1 = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
+        image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+        inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(
+            model.device, model.dtype
+        )
+
+        output = model.generate(**inputs, max_new_tokens=20)
+
+        EXPECTED_DECODED_TEXT = Expectations(
+            {
+                ("cuda", None): [
+                    "USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you",
+                    "USER:  \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on",
+                ],
+                ("rocm", (9, 5)): [
+                    "USER: \n What are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: \n\nWhen visiting this place, you should be cautious about the weather conditions, as it",
+                    "USER: \n What is this? ASSISTANT: This is a picture of two cats sleeping on a couch. USER: What is the color of",
+                ],
+            }
+        ).get_expectation()
+
+        decoded_output = processor.batch_decode(output, skip_special_tokens=True)
+        self.assertEqual(decoded_output, EXPECTED_DECODED_TEXT)
+
+    @require_torch_large_accelerator
+    @require_bitsandbytes
+    def test_small_model_integration_test_batch(self):
+        # Let's make sure we test the preprocessing to replace what is used
+        model = AriaForConditionalGeneration.from_pretrained(
+            "rhymes-ai/Aria",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
+        )
+        # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!.
+        prompts = [
+            "USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
+            "USER: <|img|>\nWhat is this?\nASSISTANT:",
+        ]
+        image1 = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
+        image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+        inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(
+            model.device, model.dtype
+        )
+
+        output = model.generate(**inputs, max_new_tokens=20)
+
+        EXPECTED_DECODED_TEXT = Expectations({
+            ("cuda", None): [
+                'USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.',
+                'USER:  \nWhat is this?\nASSISTANT: Cats',
+            ],
+            ("rocm", (9, 5)): [
+                'USER: \n What are the things I should be cautious about when I visit this place? What should I bring with me?\n ASSISTANT: \n\nWhen visiting this place, you should be cautious about the following:\n\n-',
+                'USER: \n What is this?\n ASSISTANT: This is a picture of two cats sleeping on a couch. The couch is red, and the cats',
+            ],
+        }).get_expectation()  # fmt: skip
+
+        decoded_output = self.processor.batch_decode(output, skip_special_tokens=True)
+        self.assertEqual(decoded_output, EXPECTED_DECODED_TEXT)
+
+    @require_torch_large_accelerator
+    @require_bitsandbytes
+    def test_small_model_integration_test_llama_batched_regression(self):
+        # Let's make sure we test the preprocessing to replace what is used
+        model_id = "rhymes-ai/Aria"
+
+        # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
+        )
+        processor = AutoProcessor.from_pretrained(model_id, pad_token="<pad>")
+
+        prompts = [
+            "USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
+            "USER: <|img|>\nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: <|img|>\nAnd this?\nASSISTANT:",
+        ]
+        image1 = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
+        image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+        inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True)
+        inputs = inputs.to(model.device, model.dtype)
+
+        output = model.generate(**inputs, max_new_tokens=20)
+
+        EXPECTED_DECODED_TEXT = Expectations({
+            ("cuda", None): ['USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER:  \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER:  \nAnd this?\nASSISTANT: A cat sleeping on a bed.'],
+            ("rocm", (9, 5)): ['USER: \n What are the things I should be cautious about when I visit this place? What should I bring with me?\n ASSISTANT: \n\nWhen visiting this place, you should be cautious about the weather conditions, as it', 'USER: \n What is this?\n ASSISTANT: Two cats lying on a bed!\n USER: \n And this?\n ASSISTANT: A serene lake scene with a wooden dock extending into the water.\n USER: \n']
+        }).get_expectation()  # fmt: skip
+
+        decoded_output = processor.batch_decode(output, skip_special_tokens=True)
+        self.assertEqual(decoded_output, EXPECTED_DECODED_TEXT)
+
+    @require_torch_large_accelerator
+    @require_vision
+    @require_bitsandbytes
+    def test_batched_generation(self):
+        # Skip multihead_attn for 4bit because MHA will read the original weight without dequantize.
+        # See https://github.com/huggingface/transformers/pull/37444#discussion_r2045852538.
+        model = AriaForConditionalGeneration.from_pretrained(
+            "rhymes-ai/Aria",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
+        )
+        processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
+
+        prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
+        prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
+        prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
+        url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
+        url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
+        image1 = Image.open(requests.get(url1, stream=True).raw)
+        image2 = Image.open(requests.get(url2, stream=True).raw)
+
+        # Create inputs
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": prompt1},
+                    {"type": "image"},
+                    {"type": "text", "text": prompt2},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": prompt3},
+                ],
+            },
+        ]
+
+        prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
+        images = [[image1, image2], [image2]]
+        inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(
+            device=model.device, dtype=model.dtype
+        )
+
+        EXPECTED_OUTPUTS = Expectations(
+            {
+                ("cpu", None): [
+                    "<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with",
+                    "<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a grassy hill. The alpaca has",
+                ],
+                ("cuda", None): [
+                    "<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with",
+                    "<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a patch of ground with some dry grass. The",
+                ],
+                ("xpu", 3): [
+                    "<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with",
+                    "<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a patch of ground with some dry grass. The",
+                ],
+                ("rocm", (9, 5)): [
+                    "<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image shows a cute golden retriever puppy sitting on a paved surface with a stick",
+                    '<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young llama standing on a patch of ground with some dry grass and dirt. The'
+                ],
+            }
+        )  # fmt: skip
+        EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
+        generate_ids = model.generate(**inputs, max_new_tokens=20)
+        outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        self.assertListEqual(outputs, EXPECTED_OUTPUT)
+
+    def test_tokenizer_integration(self):
+        model_id = "rhymes-ai/Aria"
+        slow_tokenizer = AutoTokenizer.from_pretrained(
+            model_id, bos_token="<|startoftext|>", eos_token="<|endoftext|>", use_fast=False
+        )
+        slow_tokenizer.add_tokens("<image>", True)
+
+        fast_tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            bos_token="<|startoftext|>",
+            eos_token="<|endoftext|>",
+            from_slow=True,
+            legacy=False,
+        )
+        fast_tokenizer.add_tokens("<image>", True)
+
+        prompt = "<|startoftext|><|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>"
+        EXPECTED_OUTPUT = ['<|startoftext|>', '<', '|', 'im', '_', 'start', '|', '>', 'system', '\n', 'Answer', '▁the', '▁questions', '.<', '|', 'im', '_', 'end', '|', '><', '|', 'im', '_', 'start', '|', '>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<', '|', 'im', '_', 'end', '|', '>']  # fmt: skip
+        self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
+        self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
+
+    @require_torch_large_accelerator
+    @require_bitsandbytes
+    def test_generation_no_images(self):
+        model_id = "rhymes-ai/Aria"
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
+        )
+        processor = AutoProcessor.from_pretrained(model_id)
+        # Prepare inputs with no images
+        inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device)
+
+        # Make sure that `generate` works
+        _ = model.generate(**inputs, max_new_tokens=20)
--- a/tests/models/aria/test_processing_aria.py
+++ b/tests/models/aria/test_processing_aria.py
@@ -0,0 +1,297 @@
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from transformers import AriaProcessor
+from transformers.image_utils import load_image
+from transformers.testing_utils import require_torch, require_vision
+
+from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
+
+
+@require_torch
+@require_vision
+class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    # NOTE: setUpClass, tearDownClass, and getter methods have been removed.
+    # They are now automatically handled by ProcessorTesterMixin.
+    # This test only needs: processor_class = YourProcessor
+    # Optionally: model_id = "some/model" to load from specific pretrained model
+    # Optionally: prepare_processor_dict() for custom processor kwargs.
+
+    processor_class = AriaProcessor
+    model_id = "m-ric/Aria_hf_2"
+
+    @classmethod
+    def _setup_test_attributes(cls, processor):
+        cls.image1 = load_image(
+            url_to_local_path(
+                "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+            )
+        )
+        cls.image2 = load_image(
+            url_to_local_path("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
+        )
+        cls.image3 = load_image(
+            url_to_local_path(
+                "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
+            )
+        )
+        cls.bos_token = "<|im_start|>"
+        cls.eos_token = "<|im_end|>"
+
+        cls.image_token = processor.tokenizer.image_token
+        cls.fake_image_token = "o"
+        cls.global_img_token = "<|img|>"
+
+        cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
+        cls.eos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.eos_token)
+
+        cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
+        cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
+        cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
+        cls.padding_token_id = processor.tokenizer.pad_token_id
+        cls.image_seq_len = 2
+
+    @staticmethod
+    def prepare_processor_dict():
+        return {
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<fim_prefix><|img|><fim_suffix>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+            "size_conversion": {490: 2, 980: 2},
+        }  # fmt: skip
+
+    def test_get_num_vision_tokens(self):
+        "Tests general functionality of the helper used internally in vLLM"
+
+        processor = self.get_processor()
+
+        output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
+        self.assertTrue("num_image_tokens" in output)
+        self.assertEqual(len(output["num_image_tokens"]), 3)
+
+        self.assertTrue("num_image_patches" in output)
+        self.assertEqual(len(output["num_image_patches"]), 3)
+
+    def test_process_interleaved_images_prompts_image_splitting(self):
+        processor = self.get_processor()
+        processor.image_processor.split_image = True
+
+        # Test that a single image is processed correctly
+        inputs = processor(images=self.image1, text="Ok<|img|>", images_kwargs={"split_image": True})
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 3, 980, 980))
+        self.assertEqual(np.array(inputs["pixel_mask"]).shape, (2, 980, 980))
+
+    def test_process_interleaved_images_prompts_no_image_splitting(self):
+        processor = self.get_processor()
+        processor.image_processor.split_image = False
+
+        # Test that a single image is processed correctly
+        inputs = processor(images=self.image1, text="Ok<|img|>")
+        image1_expected_size = (980, 980)
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 3, *image1_expected_size))
+        self.assertEqual(np.array(inputs["pixel_mask"]).shape, (1, *image1_expected_size))
+        # fmt: on
+
+        # Test a single sample with image and text
+        image_str = "<|img|>"
+        text_str = "In this image, we see"
+        text = image_str + text_str
+        inputs = processor(text=text, images=self.image1)
+
+        # fmt: off
+        # The processor expands <|img|> to <|img|><|img|> (image_seq_len=2) before tokenization
+        # So we need to tokenize the full expanded string to match what the processor does
+        expanded_text = self.image_token * self.image_seq_len + text_str
+
+        expected_input_ids = [processor.tokenizer(expanded_text, add_special_tokens=False)["input_ids"]]
+        # self.assertEqual(len(inputs["input_ids"]), len(expected_input_ids))
+
+        self.assertEqual(inputs["input_ids"], expected_input_ids)
+        self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 3, *image1_expected_size))
+        self.assertEqual(np.array(inputs["pixel_mask"]).shape, (1, *image1_expected_size))
+        # fmt: on
+
+        # Test that batch is correctly processed
+        image_str = "<|img|>"
+        text_str_1 = "In this image, we see"
+        text_str_2 = "In this image, we see"
+
+        text = [
+            image_str + text_str_1,
+            image_str + image_str + text_str_2,
+        ]
+        images = [[self.image1], [self.image2, self.image3]]
+
+        inputs = processor(text=text, images=images, padding=True)
+
+        # fmt: off
+        tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
+
+        image_tokens = [self.image_token_id] * self.image_seq_len
+        expected_input_ids_1 = image_tokens + tokenized_sentence_1["input_ids"]
+        expected_input_ids_2 = 2 * image_tokens + tokenized_sentence_2["input_ids"]
+
+        # Pad the first input to match the second input
+        pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
+
+        expected_attention_mask = [ [0] * pad_len + [1] * len(expected_input_ids_1), [1] * (len(expected_input_ids_2))]
+
+        self.assertEqual(
+            inputs["attention_mask"],
+            expected_attention_mask
+        )
+        self.assertEqual(np.array(inputs['pixel_values']).shape, (3, 3, 980, 980))
+        self.assertEqual(np.array(inputs['pixel_mask']).shape, (3, 980, 980))
+        # fmt: on
+
+    def test_non_nested_images_with_batched_text(self):
+        processor = self.get_processor()
+        processor.image_processor.do_image_splitting = False
+
+        image_str = "<|img|>"
+        text_str_1 = "In this image, we see"
+        text_str_2 = "In this image, we see"
+
+        text = [
+            image_str + text_str_1,
+            image_str + image_str + text_str_2,
+        ]
+        images = [self.image1, self.image2, self.image3]
+
+        inputs = processor(text=text, images=images, padding=True)
+
+        self.assertEqual(np.array(inputs["pixel_values"]).shape, (3, 3, 980, 980))
+        self.assertEqual(np.array(inputs["pixel_mask"]).shape, (3, 980, 980))
+
+    def test_apply_chat_template(self):
+        # Message contains content which a mix of lists with images and image urls and string
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What do these images show?"},
+                    {"type": "image"},
+                    {"type": "image"},
+                    "What do these images show?",
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.",
+                    }
+                ],
+            },
+            {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
+        ]
+        processor = self.get_processor()
+        # Make short sequence length to test that the fake tokens are added correctly
+        rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
+        print(rendered)
+
+        expected_rendered = """<|im_start|>user
+What do these images show?<fim_prefix><|img|><fim_suffix><fim_prefix><|img|><fim_suffix><|im_end|>
+<|im_start|>assistant
+The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<|im_end|>
+<|im_start|>user
+And who is that?<|im_end|>
+<|im_start|>assistant
+"""
+        self.assertEqual(rendered, expected_rendered)
+
+    def test_image_chat_template_accepts_processing_kwargs(self):
+        processor = self.get_processor()
+        if processor.chat_template is None:
+            self.skipTest("Processor has no chat template")
+
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is shown in this image?"},
+                    ],
+                },
+            ]
+        ]
+
+        formatted_prompt_tokenized = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            processor_kwargs={
+                "padding": "max_length",
+                "max_length": 50,
+            },
+        )
+        self.assertEqual(len(formatted_prompt_tokenized[0]), 50)
+
+        formatted_prompt_tokenized = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            processor_kwargs={"max_length": 5, "truncation": True},
+        )
+        self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
+
+        # Now test the ability to return dict
+        messages[0][0]["content"].append(
+            {
+                "type": "image",
+                "url": url_to_local_path(
+                    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
+                ),
+            }
+        )
+        out_dict = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            processor_kwargs={"max_image_size": 980},
+        )
+        self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
+
+    def test_special_mm_token_truncation(self):
+        """Tests that special vision tokens do not get truncated when `truncation=True` is set."""
+
+        processor = self.get_processor()
+
+        input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
+        image_input = self.prepare_image_inputs(batch_size=2)
+
+        _ = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            truncation=None,
+            padding=True,
+        )
+
+        with self.assertRaises(ValueError):
+            _ = processor(
+                text=input_str,
+                images=image_input,
+                return_tensors="pt",
+                truncation=True,
+                padding=True,
+                max_length=3,
+            )
--- a/tests/models/audio_spectrogram_transformer/init.py
+++ b/tests/models/audio_spectrogram_transformer/init.py
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other.", "(B) To indicate that language cannot express clearly, satirizing the inversion of black and white in the world"], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645], [5349, 8, 2014, 13216, 429, 4128, 4157, 3158, 9355, 11, 7578, 404, 4849, 279, 46488, 315, 3691, 323, 4158, 304, 279, 1879, 151645, 151671]]}
				`@@ -0,0 +1 @@`
				`{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other."], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645]]}`
				`@@ -0,0 +1 @@`
				`Who was Jim Henson ? \|\|\| Jim Henson was a puppeteer`
				`@@ -0,0 +1 @@`
				{"transcriptions": ["This track is an uplifting Eurodance‑style Trance‑Pop anthem that blends the driving four‑on‑the‑floor pulse of classic Eurodance with the soaring, melodic synth work typical of modern trance‑infused pop. The duration"], "token_ids": [[1986, 3754, 374, 458, 94509, 19461, 98875, 55964, 3528, 1163, 681, 55964, 11598, 55564, 429, 57843, 279, 9842, 3040, 55964, 263, 55964, 1782, 55964, 30449, 27235, 315, 11416, 19461, 98875, 448, 279, 68897, 11, 10581, 52760, 42898, 975, 14260, 315, 6481, 97431, 55964, 13573, 2591, 2420, 13, 220, 576, 8090]]}
				`@@ -0,0 +1 @@`
				{"transcriptions": ["mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"], "scores": [-0.08922013640403748], "token_ids": [[1024, 1024, 1024, 1024, 1024, 1024, 19, 37, 132, 1024, 1024, 264, 128, 1024, 1024, 1024, 132, 1024, 58, 1024, 5, 645, 1024, 1000, 82, 52, 1024, 34, 1024, 5, 19, 68, 1007, 52, 1024, 235, 1024, 388, 1024, 27, 1024, 25, 1024, 56, 1024, 103, 1024, 1024, 727, 112, 1024, 22, 1024, 56, 1006, 1009, 405, 1024, 1024, 217, 1024, 1024, 95, 1003, 1024, 133, 1006, 1024, 1024, 1024, 1024, 1024, 1024, 1024]]}
				`@@ -0,0 +1 @@`
				`{"[MASK]": 0, "[UNK]": 1, "[PAD]": 2, "DUMMY": 3, "DUMMY2": 4, "[MASK2]": 5}`
				`@@ -0,0 +1 @@`
				[{"segmentation": [[333.96, 175.14, 338.26, 134.33, 342.55, 95.67, 348.99, 79.57, 368.32, 80.64, 371.54, 91.38, 364.03, 106.41, 356.51, 145.07, 351.14, 166.55, 350.07, 184.8, 345.77, 185.88, 332.89, 178.36, 332.89, 172.99]], "area": 2120.991099999999, "iscrowd": 0, "image_id": 39769, "bbox": [332.89, 79.57, 38.65, 106.31], "category_id": 75, "id": 1108446}, {"segmentation": [[44.03, 86.01, 112.75, 74.2, 173.96, 77.42, 175.03, 89.23, 170.74, 98.9, 147.11, 102.12, 54.77, 119.3, 53.69, 119.3, 44.03, 113.93, 41.88, 94.6, 41.88, 94.6]], "area": 4052.607, "iscrowd": 0, "image_id": 39769, "bbox": [41.88, 74.2, 133.15, 45.1], "category_id": 75, "id": 1110067}, {"segmentation": [[1.08, 473.53, 633.17, 473.53, 557.66, 376.45, 535.01, 366.74, 489.71, 305.26, 470.29, 318.2, 456.27, 351.64, 413.12, 363.51, 376.45, 358.11, 348.4, 350.56, 363.51, 331.15, 357.03, 288.0, 353.8, 257.8, 344.09, 190.92, 333.3, 177.98, 345.17, 79.82, 284.76, 130.52, 265.35, 151.01, 308.49, 189.84, 317.12, 215.73, 293.39, 243.78, 269.66, 212.49, 235.15, 199.55, 214.65, 193.08, 187.69, 217.89, 159.64, 278.29, 135.91, 313.89, 169.35, 292.31, 203.87, 281.53, 220.04, 292.31, 220.04, 307.42, 175.82, 345.17, 155.33, 360.27, 105.71, 363.51, 85.21, 374.29, 74.43, 366.74, 70.11, 465.98, 42.07, 471.37, 33.44, 457.35, 34.52, 414.2, 29.12, 368.9, 9.71, 291.24, 46.38, 209.26, 99.24, 128.36, 131.6, 107.87, 50.7, 117.57, 40.99, 103.55, 40.99, 85.21, 60.4, 77.66, 141.3, 70.11, 173.66, 72.27, 174.74, 92.76, 204.94, 72.27, 225.44, 62.56, 262.11, 56.09, 292.31, 53.93, 282.61, 81.98, 298.79, 96.0, 310.65, 102.47, 348.4, 74.43, 373.21, 81.98, 430.38, 35.6, 484.31, 23.73, 540.4, 46.38, 593.26, 66.88, 638.56, 80.9, 632.09, 145.62, 581.39, 118.65, 543.64, 130.52, 533.93, 167.19, 512.36, 197.39, 498.34, 218.97, 529.62, 253.48, 549.03, 273.98, 584.63, 276.13, 587.87, 293.39, 566.29, 305.26, 531.78, 298.79, 549.03, 319.28, 576.0, 358.11, 560.9, 376.45, 639.64, 471.37, 639.64, 2.16, 1.08, 0.0]], "area": 176277.55269999994, "iscrowd": 0, "image_id": 39769, "bbox": [1.08, 0.0, 638.56, 473.53], "category_id": 63, "id": 1605237}, {"segmentation": [[1.07, 1.18, 640.0, 3.33, 638.93, 472.59, 4.3, 479.03]], "area": 301552.6694999999, "iscrowd": 0, "image_id": 39769, "bbox": [1.07, 1.18, 638.93, 477.85], "category_id": 65, "id": 1612051}, {"segmentation": [[138.75, 319.38, 148.75, 294.38, 165.0, 246.87, 197.5, 205.63, 247.5, 203.13, 268.75, 216.88, 280.0, 239.38, 293.75, 244.38, 303.75, 241.88, 307.5, 228.13, 318.75, 220.63, 315.0, 200.63, 291.25, 171.88, 265.0, 156.88, 258.75, 148.13, 262.5, 135.63, 282.5, 123.13, 292.5, 115.63, 311.25, 108.13, 313.75, 106.88, 296.25, 93.13, 282.5, 84.38, 292.5, 64.38, 288.75, 60.63, 266.25, 54.38, 232.5, 63.12, 206.25, 70.63, 170.0, 100.63, 136.25, 114.38, 101.25, 138.13, 56.25, 194.38, 27.5, 259.38, 17.5, 299.38, 32.5, 378.13, 31.25, 448.13, 41.25, 469.38, 66.25, 466.88, 70.0, 419.38, 71.25, 391.88, 77.5, 365.63, 113.75, 364.38, 145.0, 360.63, 168.75, 349.38, 191.25, 330.63, 212.5, 319.38, 223.75, 305.63, 206.25, 286.88, 172.5, 288.13]], "area": 53301.618749999994, "iscrowd": 0, "image_id": 39769, "bbox": [17.5, 54.38, 301.25, 415.0], "category_id": 17, "id": 2190839}, {"segmentation": [[543.75, 136.88, 570.0, 114.38, 591.25, 123.13, 616.25, 140.63, 640.0, 143.13, 636.25, 124.37, 605.0, 103.13, 640.0, 103.13, 633.75, 86.88, 587.5, 73.13, 548.75, 49.38, 505.0, 35.63, 462.5, 25.63, 405.0, 48.13, 362.5, 111.88, 347.5, 179.38, 355.0, 220.63, 356.25, 230.63, 365.0, 264.38, 358.75, 266.88, 358.75, 270.63, 356.25, 291.88, 356.25, 325.63, 355.0, 338.13, 350.0, 348.13, 365.0, 354.38, 396.25, 351.88, 423.75, 355.63, 446.25, 350.63, 460.0, 345.63, 462.5, 321.88, 468.75, 306.88, 481.25, 299.38, 516.25, 341.88, 536.25, 368.13, 570.0, 369.38, 578.75, 359.38, 555.0, 330.63, 532.5, 298.13, 563.75, 299.38, 582.5, 298.13, 586.25, 286.88, 578.75, 278.13, 548.75, 269.38, 525.0, 256.88, 505.0, 206.88, 536.25, 161.88, 540.0, 149.38]], "area": 59700.95625, "iscrowd": 0, "image_id": 39769, "bbox": [347.5, 25.63, 292.5, 343.75], "category_id": 17, "id": 2190842}]
				`@@ -0,0 +1 @@`
				[{"id": 8222595, "category_id": 17, "iscrowd": 0, "bbox": [18, 54, 301, 415], "area": 53306}, {"id": 8225432, "category_id": 17, "iscrowd": 0, "bbox": [349, 26, 291, 343], "area": 59627}, {"id": 8798150, "category_id": 63, "iscrowd": 0, "bbox": [1, 0, 639, 474], "area": 174579}, {"id": 14466198, "category_id": 75, "iscrowd": 0, "bbox": [42, 74, 133, 45], "area": 4068}, {"id": 12821912, "category_id": 75, "iscrowd": 0, "bbox": [333, 80, 38, 106], "area": 2118}, {"id": 10898909, "category_id": 93, "iscrowd": 0, "bbox": [0, 0, 640, 480], "area": 2750}]
				`@@ -0,0 +1 @@`
				{"transcriptions": ["assistant\n[{\"Start\":0,\"End\":5.86,\"Speaker\":0,\"Content\":\"Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.\"}]\n", "assistant\n[{\"Start\":0,\"End\":4.82,\"Speaker\":0,\"Content\":\"Nor is Mr. Quilter's manner less interesting than his matter.\"}]\n"], "input_ids": [[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 1356, 55136, 7699, 1946, 1119, 1467, 2550, 304, 4718, 3561, 13, 151645, 198, 151644, 872, 198, 151646, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151647, 198, 1986, 374, 264, 220, 20, 13, 23, 21, 6486, 7699, 11, 4486, 1356, 3114, 432, 448, 1493, 6894, 25, 5145, 882, 11, 3972, 882, 11, 29073, 3034, 11, 8883, 151645, 198], [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 1356, 55136, 7699, 1946, 1119, 1467, 2550, 304, 4718, 3561, 13, 151645, 198, 151644, 872, 198, 151646, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151647, 198, 1986, 374, 264, 220, 19, 13, 23, 17, 6486, 7699, 11, 4486, 1356, 3114, 432, 448, 1493, 6894, 25, 5145, 882, 11, 3972, 882, 11, 29073, 3034, 11, 8883, 151645, 198]], "generated_ids": [[151644, 77091, 198, 58, 4913, 3479, 788, 15, 1335, 3727, 788, 20, 13, 23, 21, 1335, 82036, 788, 15, 1335, 2762, 3252, 12275, 13, 3406, 2044, 374, 279, 38471, 273, 315, 279, 6149, 6846, 11, 323, 582, 525, 15713, 311, 10565, 806, 41482, 1189, 25439, 151645, 198, 151643], [151644, 77091, 198, 58, 4913, 3479, 788, 15, 1335, 3727, 788, 19, 13, 23, 17, 1335, 82036, 788, 15, 1335, 2762, 3252, 32663, 374, 4392, 13, 3406, 2044, 594, 11566, 2686, 7040, 1091, 806, 4925, 1189, 25439, 151645, 198, 151643]]}
				`@@ -0,0 +1 @@`
				{"audio_url": "https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav", "context_info": "About VibeVoice", "transcriptions": ["assistant\n[{\"Start\":0.0,\"End\":7.56,\"Speaker\":0,\"Content\":\"VibeVoice is this novel framework designed for generating expressive, long-form, multi-speaker, conversational audio.\"}]\n"], "input_ids": [[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 429, 1356, 55136, 7699, 1946, 1119, 1467, 2550, 304, 4718, 3561, 13, 151645, 198, 151644, 872, 198, 151646, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151648, 151647, 198, 1986, 374, 264, 220, 22, 13, 20, 21, 6486, 7699, 11, 448, 4960, 3546, 25, 9975, 647, 23549, 51167, 271, 5501, 1356, 3114, 432, 448, 1493, 6894, 25, 5145, 882, 11, 3972, 882, 11, 29073, 3034, 11, 8883, 151645, 198]], "generated_ids": [[151644, 77091, 198, 58, 4913, 3479, 788, 15, 13, 15, 1335, 3727, 788, 22, 13, 20, 21, 1335, 82036, 788, 15, 1335, 2762, 3252, 53, 23549, 51167, 374, 419, 11514, 12626, 6188, 369, 23163, 77123, 11, 1293, 8460, 11, 7299, 52975, 4407, 11, 7517, 1663, 7699, 1189, 25439, 151645, 198, 151643]]}