# Copyright 2026 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import shutil
import tempfile
import unittest

from parameterized import parameterized

from transformers import (
    AutoProcessor,
    AutoTokenizer,
    MusicFlamingoProcessor,
    WhisperFeatureExtractor,
)
from transformers.testing_utils import require_librosa, require_torch, require_torchaudio

from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin


class MusicFlamingoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for `MusicFlamingoProcessor`.

    The processor is loaded once per class from `cls.checkpoint` and saved to a
    temporary directory (`cls.tmpdirname`); the `get_*` helpers reload it from
    that directory.
    """

    processor_class = MusicFlamingoProcessor

    @classmethod
    @require_torch
    @require_torchaudio
    def setUpClass(cls):
        """Load the checkpoint's processor and save it to a fresh temp directory."""
        cls.checkpoint = "nvidia/music-flamingo-2601-hf"
        cls.tmpdirname = tempfile.mkdtemp()

        # unittest does not run `tearDownClass` when `setUpClass` raises, so the
        # directory created above has to be removed here if loading/saving fails.
        try:
            processor = MusicFlamingoProcessor.from_pretrained(cls.checkpoint)
            processor.save_pretrained(cls.tmpdirname)
        except BaseException:
            shutil.rmtree(cls.tmpdirname, ignore_errors=True)
            raise

    @require_torch
    @require_torchaudio
    def get_tokenizer(self, **kwargs):
        """Return the `tokenizer` of the processor reloaded from the temp directory.

        `kwargs` are forwarded to `AutoProcessor.from_pretrained`.
        """
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    @require_torch
    @require_torchaudio
    def get_audio_processor(self, **kwargs):
        """Return the `audio_processor` of the processor reloaded from the temp directory.

        `kwargs` are forwarded to `AutoProcessor.from_pretrained`.
        """
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).audio_processor

    @require_torch
    @require_torchaudio
    def get_processor(self, **kwargs):
        """Return the processor reloaded from the temp directory.

        `kwargs` are forwarded to `AutoProcessor.from_pretrained`.
        """
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)

    @classmethod
    def tearDownClass(cls):
        """Remove the temp directory created in `setUpClass`, ignoring errors."""
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)

    @require_torch
    @require_torchaudio
    def test_can_load_various_tokenizers(self):
        """The processor's tokenizer has the same class as the one `AutoTokenizer` loads."""
        processor = MusicFlamingoProcessor.from_pretrained(self.checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)

    @require_torch
    @require_torchaudio
    def test_save_load_pretrained_default(self):
        """A processor built from its parts survives a save/load round trip."""
        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        processor = MusicFlamingoProcessor.from_pretrained(self.checkpoint)
        feature_extractor = processor.feature_extractor

        # Rebuild the processor explicitly from the two components under test.
        processor = MusicFlamingoProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        with tempfile.TemporaryDirectory() as tmpdir:
            processor.save_pretrained(tmpdir)
            reloaded = MusicFlamingoProcessor.from_pretrained(tmpdir)

        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(reloaded.feature_extractor, WhisperFeatureExtractor)

    @require_torch
    @require_torchaudio
    def test_tokenizer_integration(self):
        """Slow and fast tokenizers both split a chat-formatted prompt into the expected tokens."""
        slow_tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, use_fast=False)
        fast_tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, from_slow=True, legacy=False)

        prompt = (
            "<|im_start|>system\nAnswer the questions.<|im_end|>"
            "<|im_start|>user\nWhat is it?<|im_end|>"
            "<|im_start|>assistant\n"
        )
        # NOTE(review): the empty-string entry after the user newline is kept
        # verbatim; it may stand for a special token that is not visible here —
        # confirm against the checkpoint's tokenizer before changing it.
        EXPECTED_OUTPUT = [
            "<|im_start|>",
            "system",
            "Ċ",
            "Answer",
            "Ġthe",
            "Ġquestions",
            ".",
            "<|im_end|>",
            "<|im_start|>",
            "user",
            "Ċ",
            "",
            "What",
            "Ġis",
            "Ġit",
            "?",
            "<|im_end|>",
            "<|im_start|>",
            "assistant",
            "Ċ",
        ]

        self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
        self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)

    @require_torch
    @require_torchaudio
    def test_chat_template(self):
        """The tokenizer's chat template renders a text+audio user turn to the expected prompt."""
        processor = AutoProcessor.from_pretrained(self.checkpoint)

        # NOTE(review): this text is compared verbatim with the template output,
        # so apparent typos ("regardlenss", "music--") presumably mirror the
        # checkpoint's template — confirm before "fixing" them here.
        expected_prompt = (
            "<|im_start|>system\nYou are Music Flamingo, a multimodal assistant for language and music. "
            "On each turn you receive an audio clip which contains music and optional text, "
            "you will receive at least one or both; use your world knowledge and reasoning "
            "to help the user with any task. Interpret the entirety of the content any input music"
            "--regardlenss of whether the user calls it audio, music, or sound.<|im_end|>\n"
            "<|im_start|>user\nWhat is surprising about the relationship between the barking and the music?<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

        conversations = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What is surprising about the relationship between the barking and the music?",
                    },
                    {
                        "type": "audio",
                        "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
                    },
                ],
            }
        ]

        formatted = processor.tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=True)
        self.assertEqual(expected_prompt, formatted)

    @require_torch
    @require_torchaudio
    def test_transcription_helpers_not_supported(self):
        """The loaded processor exposes neither of the two transcription helper attributes."""
        processor = AutoProcessor.from_pretrained(self.checkpoint)
        self.assertFalse(hasattr(processor, "apply_transcription_request"))
        self.assertFalse(hasattr(processor, "_strip_assistant_prefix_and_quotes"))

    # Overrides the parent test so that numpy ("np") cases are skipped here; the
    # parameter list still has to contain as many cases as the parent's.
    @require_librosa
    @parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
    def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
        """Run the shared audio chat-template check, skipping the numpy cases."""
        if return_tensors == "np":
            self.skipTest("MusicFlamingo only supports PyTorch tensors")
        self._test_apply_chat_template(
            "audio", batch_size, return_tensors, "audio_input_name", "feature_extractor", MODALITY_INPUT_DATA["audio"]
        )