# transformers/tests/models/lasr/test_modeling_lasr.py

# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Lasr model."""

import tempfile
import unittest

from transformers import is_datasets_available, is_torch_available, pipeline
from transformers.testing_utils import (
    cleanup,
    require_torch,
    require_torch_accelerator,
    slow,
    torch_device,
)

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask


if is_datasets_available():
    from datasets import Audio, load_dataset

if is_torch_available():
    import torch

    from transformers import (
        AutoProcessor,
        LasrCTCConfig,
        LasrEncoder,
        LasrEncoderConfig,
        LasrForCTC,
    )


class LasrEncoderModelTester:
    """Builds tiny `LasrEncoderConfig` objects and random inputs for the Lasr encoder tests.

    The constructor stores the test-suite sizes (batch, sequence length, mel bins)
    and the encoder hyper-parameters, then precomputes the sequence length expected
    after subsampling so the test cases can assert on output shapes.
    """

    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=1024,
        is_training=True,
        hidden_size=64,
        num_hidden_layers=2,
        num_mel_bins=80,
        num_attention_heads=4,
        intermediate_size=256,
        conv_kernel_size=8,
        subsampling_conv_channels=32,
        subsampling_conv_kernel_size=5,
        subsampling_conv_stride=2,
        layerdrop=0.0,
    ):
        # testing suite parameters
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.num_mel_bins = num_mel_bins
        self.is_training = is_training

        # config parameters
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.conv_kernel_size = conv_kernel_size
        self.subsampling_conv_channels = subsampling_conv_channels
        self.subsampling_conv_kernel_size = subsampling_conv_kernel_size
        self.subsampling_conv_stride = subsampling_conv_stride
        self.layerdrop = layerdrop

        # output sequence length after subsampling
        self.output_seq_length = self._get_output_seq_length(self.seq_length)
        self.encoder_seq_length = self.output_seq_length
        self.key_length = self.output_seq_length

    def _get_output_seq_length(self, seq_length, num_layers=2):
        """Return the length left after `num_layers` strided, un-padded convolutions.

        Each layer applies `(length - kernel_size) // stride + 1` using the
        subsampling kernel size and stride stored on this tester.
        """
        kernel_size = self.subsampling_conv_kernel_size
        stride = self.subsampling_conv_stride

        output_length = seq_length
        for _ in range(num_layers):
            output_length = (output_length - kernel_size) // stride + 1

        return output_length

    def prepare_config_and_inputs(self):
        """Return `(config, input_features, attention_mask)` with random contents.

        `input_features` has shape `(batch_size, seq_length, num_mel_bins)` and
        `attention_mask` has shape `(batch_size, seq_length)`.
        """
        input_features = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins])
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])
        config = self.get_config()

        return config, input_features, attention_mask

    def get_config(self):
        """Build a `LasrEncoderConfig` from the hyper-parameters stored on this tester."""
        return LasrEncoderConfig(
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            conv_kernel_size=self.conv_kernel_size,
            subsampling_conv_channels=self.subsampling_conv_channels,
            subsampling_conv_kernel_size=self.subsampling_conv_kernel_size,
            subsampling_conv_stride=self.subsampling_conv_stride,
            num_mel_bins=self.num_mel_bins,
            layerdrop=self.layerdrop,
        )

    def create_and_check_model(self, config, input_features, attention_mask):
        """Run a `LasrEncoder` forward pass and check the shape of `last_hidden_state`."""
        model = LasrEncoder(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(input_features, attention_mask=attention_mask)

        self.parent.assertEqual(
            result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, config.hidden_size)
        )

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form the common model tests consume."""
        config, input_features, attention_mask = self.prepare_config_and_inputs()
        inputs_dict = {
            "input_features": input_features,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict

    def check_ctc_loss(self, config, input_values, *args):
        """Check that `LasrForCTC` returns a float loss for both "sum" and "mean" reduction.

        Only the first three samples of `input_values` are used; they are padded to
        1/4, 1/2 and the full length along the time axis (dim 1). Extra positional
        arguments are accepted and ignored.
        """
        # NOTE(review): relies on `model._get_feat_extract_output_lengths`; confirm that
        # LasrForCTC exposes this helper before wiring this check into a test case.
        model = LasrForCTC(config=config)
        model.to(torch_device)

        # make sure that dropout is disabled
        model.eval()

        # Slicing yields a view, so clone before the in-place padding below to leave
        # the caller's tensor untouched.
        input_values = input_values[:3].clone()
        # The mask spans (batch, time) only, even when the features carry a trailing mel axis.
        attention_mask = torch.ones(input_values.shape[:2], device=torch_device, dtype=torch.long)

        time_steps = input_values.shape[1]
        input_lengths = [time_steps // divisor for divisor in (4, 2, 1)]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)

        # pad input
        for index, length in enumerate(input_lengths):
            input_values[index, length:] = 0.0
            attention_mask[index, length:] = 0

        model.config.ctc_loss_reduction = "sum"
        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        model.config.ctc_loss_reduction = "mean"
        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        self.parent.assertIsInstance(sum_loss, float)
        self.parent.assertIsInstance(mean_loss, float)


@require_torch
class LasrEncoderModelTest(ModelTesterMixin, unittest.TestCase):
    """Runs the shared `ModelTesterMixin` suite against the bare `LasrEncoder`."""

    all_model_classes = (LasrEncoder,) if is_torch_available() else ()

    test_resize_embeddings = False
    test_torch_exportable = True

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = LasrEncoderModelTester(self)
        self.config_tester = ConfigTester(
            self,
            config_class=LasrEncoderConfig,
            has_text_modality=False,
        )

    def test_config(self):
        """Run the common configuration checks on `LasrEncoderConfig`."""
        self.config_tester.run_common_tests()

    def test_model(self):
        """Forward random inputs through the encoder and check the output shape."""
        config, input_features, attention_mask = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(config, input_features, attention_mask)

    @unittest.skip(reason="LasrEncoder does not use inputs_embeds")
    def test_model_get_set_embeddings(self):
        pass


class LasrForCTCModelTester:
    """Builds `LasrCTCConfig` objects and inputs for the `LasrForCTC` tests.

    Encoder-side sizes and inputs are delegated to an internal
    `LasrEncoderModelTester`; this class adds the CTC head parameters
    (`vocab_size`, `pad_token_id`).
    """

    def __init__(self, parent, encoder_kwargs=None, is_training=True, vocab_size=128, pad_token_id=0):
        encoder_kwargs = {} if encoder_kwargs is None else encoder_kwargs

        self.parent = parent
        self.encoder_model_tester = LasrEncoderModelTester(parent, **encoder_kwargs)
        self.is_training = is_training

        # Sizes mirrored from the encoder tester.
        self.batch_size = self.encoder_model_tester.batch_size
        self.output_seq_length = self.encoder_model_tester.output_seq_length
        self.num_hidden_layers = self.encoder_model_tester.num_hidden_layers
        # NOTE(review): seq_length is set to vocab_size rather than an encoder length;
        # this looks unintentional — confirm before relying on it.
        self.seq_length = vocab_size
        self.hidden_size = self.encoder_model_tester.hidden_size

        self.vocab_size = vocab_size
        self.pad_token_id = pad_token_id
        self.encoder_seq_length = self.encoder_model_tester.encoder_seq_length

    def prepare_config_and_inputs(self):
        """Return `(config, input_features, attention_mask)` using the encoder tester's inputs."""
        _, input_features, attention_mask = self.encoder_model_tester.prepare_config_and_inputs()
        config = self.get_config()
        return config, input_features, attention_mask

    def get_config(self):
        """Build a `LasrCTCConfig` wrapping the encoder tester's config."""
        return LasrCTCConfig(
            encoder_config=self.encoder_model_tester.get_config(),
            vocab_size=self.vocab_size,
            pad_token_id=self.pad_token_id,
        )

    def create_and_check_model(self, config, input_features, attention_mask):
        """Run a `LasrForCTC` forward pass and check the shape of the logits."""
        model = LasrForCTC(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(input_features, attention_mask=attention_mask)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.output_seq_length, self.vocab_size))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form the common model tests consume."""
        config, input_features, attention_mask = self.prepare_config_and_inputs()
        inputs_dict = {
            "input_features": input_features,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict

    def test_ctc_loss_inference(self):
        """Run the encoder tester's CTC loss check on freshly prepared inputs.

        This class is not a `unittest.TestCase`, so the method is not collected as a
        test on its own; a test case has to call it explicitly.
        """
        # The tester has no `model_tester` attribute; it prepares the inputs itself.
        config_and_inputs = self.prepare_config_and_inputs()
        self.encoder_model_tester.check_ctc_loss(*config_and_inputs)


@require_torch
class LasrForCTCModelTest(ModelTesterMixin, unittest.TestCase):
    """Runs the shared `ModelTesterMixin` suite against `LasrForCTC`."""

    all_model_classes = (LasrForCTC,) if is_torch_available() else ()
    all_generative_model_classes = ()  # LasrForCTC has a custom generate method
    pipeline_model_mapping = (
        {
            "feature-extraction": LasrEncoder,
            "automatic-speech-recognition": LasrForCTC,
        }
        if is_torch_available()
        else {}
    )

    test_attention_outputs = False

    test_resize_embeddings = False
    test_torch_exportable = True

    _is_composite = True

    def setUp(self):
        """Create the model tester and the config tester used by the tests below."""
        self.model_tester = LasrForCTCModelTester(self)
        self.config_tester = ConfigTester(self, config_class=LasrCTCConfig)

    def test_config(self):
        """Run the common configuration checks on `LasrCTCConfig`."""
        self.config_tester.run_common_tests()

    def test_model(self):
        """Forward random inputs through `LasrForCTC` and check the logits shape."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="LasrEncoder does not use inputs_embeds")
    def test_model_get_set_embeddings(self):
        pass

    # Original function assumes vision+text model, so overwrite since Lasr is audio+text
    # Below is modified from `tests/models/granite_speech/test_modeling_granite_speech.py`
    def test_sdpa_can_dispatch_composite_models(self):
        """Reload a saved model with default and eager attention and inspect the eager copy.

        The eager copy must report `_attn_implementation == "eager"` and must not
        contain any module whose class name marks it as an SDPA attention layer.
        """
        if not self.has_attentions:
            self.skipTest(reason="Model architecture does not support attentions")

        if not self._is_composite:
            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")

        for model_class in self.all_model_classes:
            # Only the config is needed here; the prepared inputs are not used.
            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)

                # Loading with the default attention implementation must succeed.
                model_sdpa = model_class.from_pretrained(tmpdirname)
                model_sdpa = model_sdpa.eval().to(torch_device)

                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
                model_eager = model_eager.eval().to(torch_device)
                self.assertEqual(model_eager.config._attn_implementation, "eager")

                for submodule in model_eager.modules():
                    class_name = submodule.__class__.__name__
                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
                        raise ValueError("The eager model should not have SDPA attention layers")


class LasrForCTCIntegrationTest(unittest.TestCase):
    _dataset = None

    @classmethod
    def setUp(cls):
        """Store the checkpoint name, dtype and processor on the class.

        NOTE(review): this is a classmethod named `setUp`, so unittest still calls it
        before every test and the processor is loaded again each time; `setUpClass`
        may have been intended — confirm.
        """
        checkpoint = "hf-internal-testing/lasr-test"
        cls.checkpoint_name = checkpoint
        cls.dtype = torch.bfloat16
        cls.processor = AutoProcessor.from_pretrained(checkpoint)

    def tearDown(self):
        """Call the testing-utils `cleanup` helper for `torch_device` after each test."""
        # `gc_collect=True` asks the helper to also run garbage collection.
        cleanup(torch_device, gc_collect=True)

    @classmethod
    def _load_dataset(cls):
        """Lazily load the dummy LibriSpeech validation split and cache it on the class.

        The audio column is cast to the sampling rate of the processor's feature
        extractor, so `cls.processor` must already be set. The dataset is cached on the
        class and reused by every later call.
        """
        if cls._dataset is not None:
            return

        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        sampling_rate = cls.processor.feature_extractor.sampling_rate
        # Cache only the fully prepared dataset: if the cast fails, nothing is stored
        # and the next call retries instead of reusing audio at the wrong sampling rate.
        cls._dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    def _load_datasamples(self, num_samples):
        """Return the raw audio arrays of the first `num_samples` examples, ordered by "id"."""
        self._load_dataset()
        ordered = self._dataset.sort("id")
        audio_entries = ordered[:num_samples]["audio"]
        return [entry["array"] for entry in audio_entries]

    @slow
    @require_torch_accelerator
    def test_model_integration(self):
        """Transcribe one sample and compare both the token ids and the decoded text."""
        # fmt: off
        EXPECTED_TOKENS = torch.tensor([
            [315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,4,503,28,28,95,0,0,65,0,0,0,57,57,0,0,7,0,0,14,0,0,0,27,13,13,0,35,0,46,0,0,0,0,16,0,0,7,0,0,192,15,0,0,15,46,0,0,54,100,5,5,0,5,5,71,0,0,0,6,0,0,0,19,19,0,0,0,150,0,142,142,0,0,106,106,100,100,15,15,0,0,0,18,0,0,50,50,121,121,0,30,279,279,0,0,0,63,63,0,0,0,0,188,0,0,0,5,5,27,27,121,0,0,0,9,0,0,0,0,0,0,0,0,0]
        ])
        # fmt: on

        EXPECTED_TRANSCRIPTIONS = [
            "Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."
        ]

        samples = self._load_datasamples(1)
        # `dtype` replaces the deprecated `torch_dtype` keyword and matches the
        # `pipeline(..., dtype=...)` call used elsewhere in this file.
        model = LasrForCTC.from_pretrained(self.checkpoint_name, dtype=self.dtype, device_map=torch_device)
        model.eval()
        model.to(torch_device)

        # -- apply
        inputs = self.processor(samples)
        inputs.to(torch_device, dtype=self.dtype)
        predicted_ids = model.generate(**inputs)
        torch.testing.assert_close(predicted_ids.cpu(), EXPECTED_TOKENS)
        predicted_transcripts = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
        self.assertListEqual(predicted_transcripts, EXPECTED_TRANSCRIPTIONS)

    @slow
    @require_torch_accelerator
    def test_model_integration_batched(self):
        # fmt: off
        EXPECTED_TOKENS = torch.tensor([
            [315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,4,503,28,28,95,0,0,65,0,0,0,57,57,0,0,7,0,0,14,0,0,0,27,13,13,0,35,0,46,0,0,0,0,16,0,0,7,0,0,192,15,0,0,15,46,0,0,54,100,5,5,0,5,5,71,0,0,0,6,0,0,0,19,19,0,0,0,150,0,142,142,0,0,106,106,100,100,15,15,0,0,0,18,0,0,50,50,121,121,0,30,279,279,0,0,0,63,63,0,0,0,0,188,0,0,0,5,5,27,27,121,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            [244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,0,0,0,57,57,0,0,0,0,0,315,0,0,9,9,4,4,503,28,0,95,0,65,0,34,34,5,0,0,0,179,0,0,17,31,0,0,0,0,4,343,343,0,0,0,0,0,24,24,0,0,65,65,65,0,0,228,228,0,22,22,0,0,0,0,304,304,0,0,0,0,63,0,0,0,0,0,0,0,0,0,332,0,0,17,31,31,0,0,0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            [144,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,450,450,0,0,5,5,0,294,294,0,0,0,0,0,0,0,48,48,0,0,0,0,0,102,0,0,0,0,149,0,0,0,0,0,0,47,0,0,228,228,0,198,0,0,0,0,0,136,136,11,11,5,5,56,56,0,0,0,16,16,0,0,7,0,0,0,286,286,26,26,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,64,64,0,0,0,0,0,398,68,68,35,35,21,21,11,11,5,0,0,0,19,0,0,0,4,74,74,11,11,35,0,0,0,0,49,0,10,10,39,0,0,0,0,305,0,13,21,21,22,22,0,0,0,0,0,0,360,360,0,0,0,294,294,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,5,0,178,178,0,95,0,71,71,0,0,0,0,0,290,11,62,17,17,0,0,137,0,0,0,0,0,89,0,99,99,22,22,0,0,0,0,19,0,0,53,0,5,0,0,58,0,0,5,5,147,8,8,5,0,0,0,4,4,13,30,0,0,30,61,61,0,0,0,0,110,0,0,35,0,0,0,58,58,0,101,23,23,41,0,0,0,0,18,0,0,7,7,0,0,192,0,0,82,82,0,0,0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            [144,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,299,0,0,0,0,0,391,0,0,91,91,0,0,0,104,104,28,44,44,8,5,5,0,0,0,0,50,50,222,222,130,130,0,0,0,0,0,0,98,103,103,0,191,191,33,0,227,227,0,354,0,0,163,10,0,0,8,56,56,34,34,5,5,0,0,424,0,0,0,0,0,0,57,57,0,0,0,0,0,58,0,29,29,41,0,0,0,0,0,0,0,0,240,240,33,10,10,52,0,0,0,0,0,0,0,0,0,351,351,0,0,0,134,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,265,0,0,0,212,212,0,0,207,207,0,112,112,0,0,0,24,24,0,0,53,0,0,0,0,0,127,0,0,0,0,0,317,0,0,0,0,0,0,0,16,16,0,0,0,0,0,0,0,0,4,0,74,0,0,0,153,0,20,0,0,0,0,32,0,0,60,11,11,0,30,11,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            [163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,17,17,0,0,272,0,34,34,5,0,0,0,0,59,0,84,84,314,314,5,5,0,0,0,0,0,0,0,142,142,0,0,0,14,14,0,0,97,97,25,8,8,16,16,0,0,38,0,0,0,0,0,0,0,0,0,0,362,0,27,27,0,0,0,240,28,28,0,248,0,5,0,0,19,0,0,93,0,0,0,0,168,0,0,438,0,0,0,0,0,0,0,208,208,36,36,8,8,22,5,5,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,358,358,0,0,5,0,56,0,34,5,5,0,0,0,0,0,0,139,139,324,324,0,0,5,5,73,10,10,0,0,4,0,0,135,20,122,5,5,0,0,0,0,0,142,142,0,0,0,0,80,80,0,0,0,0,0,0,0,0,0,4,0,17,0,0,123,123,0,0,29,29,0,0,0,0,80,0,0,0,14,0,4,0,260,0,0,0,22,0,13,0,0,0,0,0,0,0,167,0,10,10,21,21,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,315,315,0,0,9,9,0,0,141,0,61,197,0,8,0,0,0,191,13,13,0,5,5,65,65,34,34,5,5,0,4,397,397,0,0,0,0,5,0,30,11,11,242,5,5,0,0,0,0,0,0,0,0,4,4,5,5,21,21,23,23,46,46,0,0,0,102,0,0,0,0,171,171,0,0,0,0,0,0,0,390,390,0,0,0,0,24,0,0,7,7,0,0,458,458,0,0,0,0,0,380,380,0,0,0,0,0,48,0,0,0,315,315,0,0,9,9,0,0,0,132,0,26,0,0,52,0,31,0,0,0,0,0,0,0,0,0,0,0,0,294,294,0,12,12,0,18,18,0,0,0,47,100,0,5,70,70,0,0,63,0,0,0,4,4,88,88,0,10,60,60,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,315,0,0,9,9,0,4,260,260,13,70,17,17,0,0,132,132,0,205,0,129,0,0,31,0,0,0,0,0,0,0,0,0,0,0,0,413,0,0,5,5,0,63,63,0,4,4,5,0,73,73,65,0,0,0,0,0,14,0,0,0,0,0,0,54,222,222,31,31,0,0,269,269,0,0,0,0,0,4,4,5,5,100,0,0,27,0,0,0,94,94,0,0,7,0,0,0,383,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,360,0,0,0,0,37,0,0,0,336,336,0,0,5,5,0,6,6,0,0,288,288,0,0,0,14,0,0,0,0,0,0,0,155,0,11,0,0,0,0,233,13,0,0,13,31,31,0,0,24,24,0,0,14,0,0,0,200,61,61,52,52,235,235,0,0,0,51,11,11,60,60,0,0,6,38,38,0,0,0,0,0,0,0,0,0,0,0,0,0,216,10,172,172,8,0,0,179,179,0,0,0,0,0,0,152,0,0,0,0,0,0,0,0,0,0]
        ])
        # fmt: on

        EXPECTED_TRANSCRIPTIONS = [
            "Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.",
            "Nor is Mr. Quilter's manner less interesting than hismanner.\"",
            'He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind."',
            "He has grave doubts whether Sir Frederick Leton's work is really Greek after all, and can discover in it but little of rocky Ithaca,",
            'Lynell\'s pictures are a sort of "Up Guards and Aam" paintings, and Mason\'s exquisite idylls are as national as a Jingo poem. Mr. Burket Foster\'s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth, and Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, "Next man,"',
        ]

        samples = self._load_datasamples(5)
        model = LasrForCTC.from_pretrained(
            self.checkpoint_name,
            torch_dtype=self.dtype,
            device_map=torch_device,
        )
        model.eval()
        model.to(torch_device)

        # -- apply
        inputs = self.processor(samples)
        inputs.to(torch_device, dtype=self.dtype)
        predicted_ids = model.generate(**inputs)
        torch.testing.assert_close(predicted_ids.cpu(), EXPECTED_TOKENS)
        predicted_transcripts = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
        self.assertListEqual(predicted_transcripts, EXPECTED_TRANSCRIPTIONS)

    # TODO: @eustlb, this test is here for now but should eventually be moved to test_pipelines_automatic_speech_recognition.py
    @slow
    @require_torch_accelerator
    def test_model_integration_pipe_with_chunk(self):
        """Transcribe one sample through the ASR pipeline using chunked inference."""
        audio = self._load_datasamples(1)
        asr = pipeline(
            task="automatic-speech-recognition",
            model=self.checkpoint_name,
            dtype=self.dtype,
            device_map=torch_device,
        )
        # The pipeline must have detected a CTC model.
        self.assertEqual(asr.type, "ctc")

        outputs = asr(audio, chunk_length_s=3, stride_length_s=1)
        expected = [
            {"text": "Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."}
        ]
        self.assertListEqual(outputs, expected)