first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/models/vibevoice_asr/__init__.py
+++ b/tests/models/vibevoice_asr/__init__.py
--- a/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py
+++ b/tests/models/vibevoice_asr/test_modeling_vibevoice_asr.py
@@ -0,0 +1,397 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+from parameterized import parameterized
+
+from transformers import (
+    VibeVoiceAsrConfig,
+    VibeVoiceAsrForConditionalGeneration,
+    VibeVoiceAsrModel,
+    is_datasets_available,
+    is_torch_available,
+)
+from transformers.testing_utils import (
+    require_torch,
+    slow,
+    torch_device,
+)
+from transformers.trainer_utils import set_seed
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_datasets_available():
+    from datasets import Audio, load_dataset
+
+if is_torch_available():
+    import torch
+
+
+class VibeVoiceAsrModelTester:
+    """
+    Builds a tiny VibeVoice ASR config and synthetic inputs for testing.
+    """
+
+    def __init__(
+        self,
+        parent,
+        audio_token_id=0,
+        seq_length=25,
+        audio_samples=24000,  # 1 second at 24kHz
+        text_config={
+            "model_type": "qwen2",
+            "intermediate_size": 36,
+            "initializer_range": 0.02,
+            "hidden_size": 32,
+            "max_position_embeddings": 52,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 4,
+            "vocab_size": 99,
+            "pad_token_id": 1,  # Ensure pad token != audio token
+        },
+        acoustic_tokenizer_encoder_config={
+            "model_type": "vibevoice_acoustic_tokenizer_encoder",
+            "hidden_size": 16,
+            "kernel_size": 3,
+            "n_filters": 4,
+            "downsampling_ratios": [2],
+            "depths": [1, 1],
+        },
+        semantic_tokenizer_encoder_config={
+            "model_type": "vibevoice_acoustic_tokenizer_encoder",
+            "channels": 1,
+            "hidden_size": 32,  # 2x acoustic hidden size
+            "kernel_size": 3,
+            "n_filters": 4,
+            "downsampling_ratios": [2],
+            "depths": [1, 1],
+        },
+        is_training=True,
+    ):
+        self.parent = parent
+        self.audio_token_id = audio_token_id
+        self.seq_length = seq_length
+        self.audio_samples = audio_samples
+        self.is_training = is_training
+        self.text_config = text_config
+        self.acoustic_tokenizer_encoder_config = acoustic_tokenizer_encoder_config
+        self.semantic_tokenizer_encoder_config = semantic_tokenizer_encoder_config
+        self.batch_size = 2
+        self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.encoder_seq_length = seq_length
+
+    def get_config(self):
+        return VibeVoiceAsrConfig(
+            acoustic_tokenizer_encoder_config=self.acoustic_tokenizer_encoder_config,
+            semantic_tokenizer_encoder_config=self.semantic_tokenizer_encoder_config,
+            text_config=self.text_config,
+            audio_token_id=self.audio_token_id,
+        )
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.text_config["vocab_size"])
+        attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)
+        input_values = floats_tensor([self.batch_size, 1, self.audio_samples])
+        padding_mask = torch.ones([self.batch_size, self.audio_samples], dtype=torch.bool, device=torch_device)
+        config = self.get_config()
+        return config, input_ids, attention_mask, input_values, padding_mask
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask, input_values, padding_mask = config_and_inputs
+
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "input_values": input_values,
+            "padding_mask": padding_mask,
+        }
+
+        return config, inputs_dict
+
+
+@require_torch
+class VibeVoiceAsrForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (VibeVoiceAsrModel, VibeVoiceAsrForConditionalGeneration) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {"audio-text-to-text": VibeVoiceAsrForConditionalGeneration} if is_torch_available() else {}
+    )
+    _is_composite = True
+    # Acoustic/semantic tokenizers run under torch.no_grad() in get_audio_features,
+    # so their params never receive grads — the mixin's force-unfreeze can't change that.
+    test_all_params_have_gradient = False
+
+    def setUp(self):
+        self.model_tester = VibeVoiceAsrModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=VibeVoiceAsrConfig, has_text_modality=False)
+
+    @unittest.skip(
+        reason="This test does not apply to VibeVoiceAsr since inputs_embeds corresponding to audio tokens are replaced when input features are provided."
+    )
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has no separate base model without a head.")
+    def test_model_base_model_prefix(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr audio components do not use attention.")
+    def test_get_audio_features_attentions(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has unique audio processing with acoustic and semantic tokenizers.")
+    def test_get_audio_features_hidden_states(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_determinism(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_batching_equivalence(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_save_load(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_generate_continue_from_past_key_values(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_model_outputs_equivalence(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_left_padding_compatibility(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_forward_with_logits_to_keep(self):
+        pass
+
+    @unittest.skip(reason="VibeVoiceAsr has slight randomness due to VAE sampling.")
+    def test_generate_methods_with_logits_to_keep(self):
+        pass
+
+    def test_sdpa_can_dispatch_composite_models(self):
+        # VibeVoiceAsr is audio+text composite; but audio components do not use attention
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                # SDPA (default)
+                model_sdpa = model_class.from_pretrained(tmpdirname)
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                language_model_sdpa = model_sdpa.base_model.language_model
+                text_attn = "sdpa" if language_model_sdpa._supports_sdpa else "eager"
+
+                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+                self.assertTrue(language_model_sdpa.config._attn_implementation == text_attn)
+
+                # Eager
+                model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
+                model_eager = model_eager.eval().to(torch_device)
+                self.assertTrue(model_eager.config._attn_implementation == "eager")
+                self.assertTrue(model_eager.base_model.language_model.config._attn_implementation == "eager")
+
+                for _, submodule in model_eager.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        raise ValueError("The eager model should not have SDPA attention layers")
+
+    @parameterized.expand([True, False, None])
+    def test_get_audio_features_output(self, return_dict: bool | None):
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self._audio_features_prepare_config_and_inputs()
+            if return_dict is not None:
+                config.return_dict = return_dict
+
+            model = model_class(config).eval()
+            model = model.to(torch_device)
+
+            torch.manual_seed(0)
+            with torch.no_grad():
+                outputs = model.get_audio_features(**inputs_dict)
+
+            if return_dict in (True, None):
+                last_hidden_state_shape = outputs.last_hidden_state.shape
+                batch_size = inputs_dict["input_values"].shape[0]
+                self.assertEqual(
+                    last_hidden_state_shape[0],
+                    batch_size,
+                    f"batch_size mismatch, full shape: {last_hidden_state_shape}",
+                )
+                audio_config = config.acoustic_tokenizer_encoder_config
+                hidden_size = audio_config.hidden_size
+                self.assertEqual(
+                    last_hidden_state_shape[-1],
+                    hidden_size,
+                    f"hidden_size mismatch, full shape: {last_hidden_state_shape}",
+                )
+
+            else:
+                self.assertIsInstance(outputs, tuple, "get_audio_features() must return a tuple if return_dict=False")
+
+
+@require_torch
+class VibeVoiceAsrForConditionalGenerationIntegrationTest(unittest.TestCase):  # checks outputs against JSON fixtures
+    _dataset = None  # class-level cache filled by _load_dataset()
+
+    @classmethod
+    def setUp(cls):  # NOTE(review): classmethod named setUp; unittest still calls it before each test
+        from transformers import AutoProcessor
+        from transformers.testing_utils import cleanup
+
+        cleanup(torch_device, gc_collect=True)
+        cls.checkpoint = "microsoft/VibeVoice-ASR-HF"
+        cls.processor = AutoProcessor.from_pretrained(cls.checkpoint)  # kept on the class: _load_dataset reads it
+
+    def tearDown(self):
+        from transformers.testing_utils import cleanup
+
+        cleanup(torch_device, gc_collect=True)
+
+    @classmethod
+    def _load_dataset(cls):
+        # Lazy loading of the dataset. It is cached on the class attribute `_dataset`, so it is loaded once per process.
+        if cls._dataset is None:
+            cls._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+            cls._dataset = cls._dataset.cast_column(
+                "audio", Audio(sampling_rate=cls.processor.feature_extractor.sampling_rate)
+            )
+
+    def _load_datasamples(self, num_samples):  # returns the first `num_samples` audio arrays, ordered by "id"
+        self._load_dataset()
+        ds = self._dataset
+        speech_samples = ds.sort("id")[:num_samples]["audio"]
+        return [x["array"] for x in speech_samples]
+
+    @slow
+    def test_single(self):
+        """
+        reproducer: https://gist.github.com/ebezzam/e1200bcecdc29e87dadd9d8423ae7ecb#file-reproducer_vibevoice_asr-py
+        """
+        set_seed(42)
+
+        path = Path(__file__).parent.parent.parent / "fixtures/vibevoice_asr/expected_results_single.json"
+        with open(path, "r", encoding="utf-8") as f:  # fixture path is three directory levels above this file
+            expected_outputs = json.load(f)
+
+        samples = self._load_datasamples(1)
+        conversation = [{"role": "user", "content": [{"type": "audio", "audio": samples[0]}]}]
+
+        model = VibeVoiceAsrForConditionalGeneration.from_pretrained(
+            self.checkpoint, device_map=torch_device, dtype=torch.bfloat16
+        )
+
+        inputs = self.processor.apply_chat_template(conversation, tokenize=True, return_dict=True).to(
+            model.device, dtype=model.dtype
+        )
+        torch.testing.assert_close(inputs["input_ids"].cpu(), torch.tensor(expected_outputs["input_ids"]))
+
+        output = model.generate(**inputs)
+        gen_ids = output[:, inputs["input_ids"].shape[1] :]  # drop the prompt, keep only generated positions
+        torch.testing.assert_close(gen_ids.cpu(), torch.tensor(expected_outputs["generated_ids"]))
+        txt = self.processor.decode(gen_ids, skip_special_tokens=True)
+        self.assertListEqual(txt, expected_outputs["transcriptions"])
+
+    @slow
+    def test_batch(self):
+        """
+        reproducer: https://gist.github.com/ebezzam/e1200bcecdc29e87dadd9d8423ae7ecb#file-reproducer_vibevoice_asr_batch-py
+        """
+        set_seed(42)
+
+        path = Path(__file__).parent.parent.parent / "fixtures/vibevoice_asr/expected_results_batch.json"
+        with open(path, "r", encoding="utf-8") as f:  # fixture path is three directory levels above this file
+            expected_outputs = json.load(f)
+
+        samples = self._load_datasamples(2)
+        conversation = [  # one single-turn conversation per audio sample
+            [{"role": "user", "content": [{"type": "audio", "audio": samples[0]}]}],
+            [{"role": "user", "content": [{"type": "audio", "audio": samples[1]}]}],
+        ]
+
+        model = VibeVoiceAsrForConditionalGeneration.from_pretrained(
+            self.checkpoint, device_map=torch_device, dtype=torch.bfloat16
+        )
+        inputs = self.processor.apply_chat_template(conversation, tokenize=True, return_dict=True).to(
+            model.device, dtype=model.dtype
+        )
+
+        output = model.generate(**inputs)
+        gen_ids = output[:, inputs["input_ids"].shape[1] :]  # drop the prompt, keep only generated positions
+        for i, exp_gen in enumerate(expected_outputs["generated_ids"]):
+            actual_gen = gen_ids[i, : len(exp_gen)]  # compare only the first len(exp_gen) tokens of each row
+            torch.testing.assert_close(actual_gen.cpu(), torch.tensor(exp_gen))
+        txt = self.processor.decode(gen_ids, skip_special_tokens=True)
+        self.assertListEqual(txt, expected_outputs["transcriptions"])
+
+    @slow
+    def test_single_with_context(self):
+        """
+        reproducer: tests/models/vibevoice_asr/reproducer_vibevoice_asr_with_context.py
+        """
+        set_seed(42)
+
+        path = Path(__file__).parent.parent.parent / "fixtures/vibevoice_asr/expected_results_with_context.json"
+        with open(path, "r", encoding="utf-8") as f:  # fixture path is three directory levels above this file
+            raw = json.load(f)
+
+        conversation = [  # a text item followed by an audio item referenced by URL
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "About VibeVoice",
+                    },
+                    {
+                        "type": "audio",
+                        "path": "https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav",
+                    },
+                ],
+            }
+        ]
+
+        model = VibeVoiceAsrForConditionalGeneration.from_pretrained(
+            self.checkpoint, device_map=torch_device, dtype=torch.bfloat16
+        )
+
+        inputs = self.processor.apply_chat_template(conversation, tokenize=True, return_dict=True).to(
+            model.device, dtype=model.dtype
+        )
+        torch.testing.assert_close(inputs["input_ids"].cpu(), torch.tensor(raw["input_ids"]))
+
+        output = model.generate(**inputs)
+        gen_ids = output[:, inputs["input_ids"].shape[1] :]  # drop the prompt, keep only generated positions
+        torch.testing.assert_close(gen_ids.cpu(), torch.tensor(raw["generated_ids"]))
+        txt = self.processor.decode(gen_ids, skip_special_tokens=True)
+        self.assertListEqual(txt, raw["transcriptions"])
--- a/tests/models/vibevoice_asr/test_processing_vibevoice_asr.py
+++ b/tests/models/vibevoice_asr/test_processing_vibevoice_asr.py
@@ -0,0 +1,149 @@
+# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import tempfile
+import unittest
+
+from parameterized import parameterized
+
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    VibeVoiceAcousticTokenizerFeatureExtractor,
+    VibeVoiceAsrProcessor,
+)
+from transformers.testing_utils import require_torch
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+class VibeVoiceAsrProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = VibeVoiceAsrProcessor
+
+    @classmethod
+    @require_torch
+    def setUpClass(cls):
+        cls.checkpoint = "microsoft/VibeVoice-ASR-HF"
+        cls.tmpdirname = tempfile.mkdtemp()
+
+        processor = VibeVoiceAsrProcessor.from_pretrained(cls.checkpoint)
+        processor.save_pretrained(cls.tmpdirname)
+
+    @require_torch
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    @require_torch
+    def get_feature_extractor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).feature_extractor
+
+    @require_torch
+    def get_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
+
+    @require_torch
+    def test_can_load_various_tokenizers(self):
+        processor = VibeVoiceAsrProcessor.from_pretrained(self.checkpoint)
+        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
+        self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
+
+    @require_torch
+    def test_save_load_pretrained_default(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
+        processor = VibeVoiceAsrProcessor.from_pretrained(self.checkpoint)
+        feature_extractor = processor.feature_extractor
+
+        processor = VibeVoiceAsrProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            processor.save_pretrained(tmpdir)
+            reloaded = VibeVoiceAsrProcessor.from_pretrained(tmpdir)
+
+        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())
+        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertIsInstance(reloaded.feature_extractor, VibeVoiceAcousticTokenizerFeatureExtractor)
+
+    @require_torch
+    def test_apply_transcription_request_single(self):
+        processor = AutoProcessor.from_pretrained(self.checkpoint)
+
+        audio_url = "https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav"
+        helper_outputs = processor.apply_transcription_request(audio=audio_url, prompt="About VibeVoice")
+
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "About VibeVoice"},
+                    {
+                        "type": "audio",
+                        "path": "https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/realtime_model/vibevoice_tts_german.wav",
+                    },
+                ],
+            }
+        ]
+        manual_outputs = processor.apply_chat_template(
+            conversation,
+            tokenize=True,
+            return_dict=True,
+        )
+
+        for key in ("input_ids", "attention_mask", "input_values", "padding_mask"):
+            self.assertIn(key, helper_outputs)
+            self.assertTrue(helper_outputs[key].equal(manual_outputs[key]))
+
+    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
+    def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
+        self.skipTest("VibeVoiceAsrProcessor does not support chat templates with text-only inputs.")
+
+    def test_apply_chat_template_assistant_mask(self):
+        self.skipTest("VibeVoiceAsrProcessor does not support chat templates with text-only inputs.")
+
+    @require_torch
+    def test_decode_output_formats(self):
+        import torch
+
+        processor = VibeVoiceAsrProcessor.from_pretrained(self.checkpoint)
+
+        # fmt: off
+        # reproducer: https://gist.github.com/ebezzam/e1200bcecdc29e87dadd9d8423ae7ecb#file-reproducer_generated_ids-py
+        generated_ids = torch.tensor([[151644,  77091,    198,     58,   4913,   3479,    788,     15,   1335,
+           3727,    788,     22,     13,     20,     21,   1335,  82036,    788,
+             15,   1335,   2762,   3252,    693,    586,  40683,    374,    264,
+          11514,  12626,   6188,    369,  23163,  77123,     11,   1293,   8460,
+             11,   7299,  52975,   4407,   7517,   1663,   7699,   1189,  25439,
+         151645,    198, 151643]]
+        )
+        # fmt: on
+
+        # test parsed output
+        dicts = processor.decode(generated_ids, return_format="parsed")
+        self.assertIsInstance(dicts, list)
+        self.assertIsInstance(dicts[0], list)
+        self.assertIsInstance(dicts[0][0], dict)
+        self.assertIn("Content", dicts[0][0])
+        self.assertIn("Start", dicts[0][0])
+        self.assertIn("End", dicts[0][0])
+        self.assertIsInstance(dicts[0][0]["Start"], float)
+        self.assertIsInstance(dicts[0][0]["End"], float)
+
+        # test transcript only
+        transcript = processor.decode(generated_ids, return_format="transcription_only")
+        self.assertIsInstance(transcript, list)
+        self.assertIsInstance(transcript[0], str)