first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

@@ -0,0 +1,186 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import random
import unittest
import numpy as np
from transformers import VibeVoiceAcousticTokenizerFeatureExtractor
from transformers.testing_utils import require_torch
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
if is_torch_available():
import torch
global_rng = random.Random()
# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Return a nested list with `shape[0]` rows of `shape[1]` random floats.

    Each value is drawn as `rng.random() * scale`. When `rng` is None the module-level
    `global_rng` is used. `name` is accepted but not used.
    """
    generator = global_rng if rng is None else rng
    # Row-major draw order: every value of one row is generated before the next row starts.
    return [[generator.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
@require_torch
class VibeVoiceAcousticTokenizerFeatureExtractionTester:
    """Stores feature-extractor settings and builds synthetic audio inputs for the tests in this file.

    `prepare_feat_extract_dict` returns the keyword arguments passed to the feature extractor.
    `prepare_inputs_for_common` returns either `batch_size` equal-length sequences of
    `max_seq_length` values (`equal_length=True`) or sequences whose lengths grow from
    `min_seq_length` towards `max_seq_length` in steps of `seq_length_diff`; with `numpify=True`
    each sequence is converted with `np.asarray`.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=1,
        padding_value=0.0,
        sampling_rate=24000,
        normalize_audio=True,
        target_dB_FS=-25,
        eps=1e-6,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive sequence lengths of the variable-length batch. The divisor is
        # clamped to 1 so that `batch_size=1` does not raise ZeroDivisionError.
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // max(self.batch_size - 1, 1)
        self.feature_size = feature_size
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.normalize_audio = normalize_audio
        self.target_dB_FS = target_dB_FS
        self.eps = eps

    def prepare_feat_extract_dict(self):
        """Return the keyword arguments used to build the feature extractor under test."""
        return {
            "feature_size": self.feature_size,
            "padding_value": self.padding_value,
            "sampling_rate": self.sampling_rate,
            "normalize_audio": self.normalize_audio,
            "target_dB_FS": self.target_dB_FS,
            "eps": self.eps,
        }

    # Copied from tests.models.encodec.test_feature_extraction_encodec.EnCodecFeatureExtractionTester.prepare_inputs_for_common
    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        def _flatten(list_of_lists):
            return list(itertools.chain(*list_of_lists))

        if equal_length:
            audio_inputs = floats_list((self.batch_size, self.max_seq_length))
        else:
            # make sure that inputs increase in size
            audio_inputs = [
                _flatten(floats_list((x, self.feature_size)))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]

        if numpify:
            audio_inputs = [np.asarray(x) for x in audio_inputs]

        return audio_inputs
@require_torch
class VibeVoiceAcousticTokenizerFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Unit tests for `VibeVoiceAcousticTokenizerFeatureExtractor`, on top of the shared mixin tests."""

    feature_extraction_class = VibeVoiceAcousticTokenizerFeatureExtractor

    def setUp(self):
        self.feat_extract_tester = VibeVoiceAcousticTokenizerFeatureExtractionTester(self)

    def test_call(self):
        """Check that torch and numpy inputs produce matching `input_values`, unbatched and batched."""
        TOL = 1e-6

        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        sampling_rate = feature_extractor.sampling_rate
        # Three inputs of lengths 800, 1000 and 1200
        audio_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_audio_inputs = [np.asarray(audio_input) for audio_input in audio_inputs]
        torch_audio_inputs = [torch.tensor(audio_input) for audio_input in audio_inputs]

        # Test non-batched input
        encoded_sequences_1 = feature_extractor(torch_audio_inputs[0], sampling_rate=sampling_rate).input_values
        encoded_sequences_2 = feature_extractor(np_audio_inputs[0], sampling_rate=sampling_rate).input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=TOL))

        # Test batched input
        encoded_sequences_1 = feature_extractor(torch_audio_inputs, sampling_rate=sampling_rate).input_values
        encoded_sequences_2 = feature_extractor(np_audio_inputs, sampling_rate=sampling_rate).input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=TOL))

    def _load_datasamples(self, num_samples):
        """Return the waveform arrays of the first `num_samples` dummy LibriSpeech samples, sorted by id."""
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # The dataset stores its decoded audio under the "audio" column; indexing "input_values"
        # (a feature-extractor output name, not a dataset column) fails with a KeyError.
        audio_samples = ds.sort("id")[:num_samples]["audio"]

        return [x["array"] for x in audio_samples]

    def test_normalize_audio(self):
        """Test audio normalization functionality specific to VibeVoice."""
        # Test with normalization enabled (default)
        feature_extractor = VibeVoiceAcousticTokenizerFeatureExtractor(normalize_audio=True, target_dB_FS=-25)

        # Test with very low amplitude audio (should increase amplitude)
        low_amplitude_audio = np.random.randn(1000).astype(np.float32) * 0.01
        result = feature_extractor([low_amplitude_audio])
        normalized_audio = result.input_values.squeeze()
        self.assertGreater(
            torch.abs(normalized_audio).max().item(), torch.abs(torch.tensor(low_amplitude_audio)).max().item()
        )

        # Test with normalization disabled (should be close to original)
        feature_extractor_no_norm = VibeVoiceAcousticTokenizerFeatureExtractor(normalize_audio=False)
        result_no_norm = feature_extractor_no_norm([low_amplitude_audio])
        torch.testing.assert_close(
            result_no_norm.input_values.squeeze(), torch.tensor(low_amplitude_audio), rtol=1e-5, atol=1e-5
        )

    def test_sampling_rate_validation(self):
        """Test that sampling rate validation works correctly."""
        feature_extractor = VibeVoiceAcousticTokenizerFeatureExtractor(sampling_rate=24000)
        input_audio = np.random.randn(1000).astype(np.float32)

        # Matching sampling rate is accepted and yields a tensor
        result = feature_extractor([input_audio], sampling_rate=24000)
        self.assertIsInstance(result.input_values, torch.Tensor)

        # Mismatching sampling rate must be rejected
        with self.assertRaises(ValueError):
            feature_extractor([input_audio], sampling_rate=16000)

    def test_padding_mask_generation(self):
        """Test that padding masks are generated correctly."""
        feature_extractor = VibeVoiceAcousticTokenizerFeatureExtractor()
        audio1 = np.random.randn(100).astype(np.float32)
        audio2 = np.random.randn(200).astype(np.float32)
        result = feature_extractor([audio1, audio2], padding=True, return_attention_mask=True)

        self.assertIn("padding_mask", result)
        self.assertEqual(result.padding_mask.shape, result.input_values.squeeze(1).shape)
        # First sample should have some padding (False values at the end)
        self.assertTrue(torch.any(~result.padding_mask[0]))
        # Second sample should have no padding (all True values)
        self.assertTrue(torch.all(result.padding_mask[1]))

View File

@@ -0,0 +1,344 @@
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import json
import unittest
from pathlib import Path
import numpy as np
from transformers import (
AutoFeatureExtractor,
AutoModel,
VibeVoiceAcousticTokenizerConfig,
VibeVoiceAcousticTokenizerModel,
)
from transformers.audio_utils import load_audio_librosa
from transformers.testing_utils import cleanup, is_torch_available, require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor
if is_torch_available():
import torch
@require_torch
class VibeVoiceAcousticTokenizerModelTester:
    """Builds a small `VibeVoiceAcousticTokenizerConfig` and matching random inputs for the model tests.

    Inputs are requested from `floats_tensor` with dimensions `[batch_size, channels, hidden_size]`,
    so `hidden_size` doubles as the length of the test waveform.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        channels=1,
        hidden_size=32,
        kernel_size=3,
        n_filters=4,
        downsampling_ratios=[2],
        depths=[1, 1],
        is_training=False,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.channels = channels
        self.is_training = is_training
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.n_filters = n_filters
        # Copy the list arguments: the defaults are created once at function definition, so storing
        # them by reference would let a mutation on one tester leak into every later instance.
        self.downsampling_ratios = list(downsampling_ratios)
        self.depths = list(depths)

    def prepare_config_and_inputs(self):
        """Return `(config, inputs_dict)` with random `input_values` and sampling turned off."""
        input_values = floats_tensor([self.batch_size, self.channels, self.hidden_size], scale=1.0)
        config = self.get_config()
        # disable sampling for deterministic tests
        inputs_dict = {"input_values": input_values, "sample": False}
        return config, inputs_dict

    def prepare_config_and_inputs_for_common(self):
        """Return the same `(config, inputs_dict)` pair as `prepare_config_and_inputs`."""
        return self.prepare_config_and_inputs()

    def prepare_config_and_inputs_for_model_class(self, model_class):
        """Return `(config, inputs_dict)`; `model_class` is accepted but does not change the result."""
        return self.prepare_config_and_inputs()

    def get_config(self):
        """Build the config from the sizes stored on this tester."""
        return VibeVoiceAcousticTokenizerConfig(
            channels=self.channels,
            hidden_size=self.hidden_size,
            kernel_size=self.kernel_size,
            n_filters=self.n_filters,
            downsampling_ratios=self.downsampling_ratios,
            depths=self.depths,
        )

    def create_and_check_model_forward(self, config, inputs_dict):
        """Run a forward pass and check the shapes of the returned `latents` and `audio`."""
        model = VibeVoiceAcousticTokenizerModel(config=config).to(torch_device).eval()

        input_values = inputs_dict["input_values"]
        # Only shapes are inspected, so gradients are not needed
        with torch.no_grad():
            result = model(input_values)

        # Calculate expected sequence length after downsampling
        expected_seq_len = self.hidden_size // np.prod(self.downsampling_ratios)
        self.parent.assertEqual(result.latents.shape, (self.batch_size, expected_seq_len, self.hidden_size))
        self.parent.assertEqual(result.audio.shape, (self.batch_size, self.channels, self.hidden_size))
@require_torch
class VibeVoiceAcousticTokenizerModelTest(ModelTesterMixin, unittest.TestCase):
    """Model tests for `VibeVoiceAcousticTokenizerModel`.

    Common tests that rely on embeddings, attentions or hidden states are skipped below, and the
    determinism / output-equivalence tests are overridden to read the model's own output fields.
    """

    all_model_classes = (VibeVoiceAcousticTokenizerModel,) if is_torch_available() else ()
    is_encoder_decoder = False
    test_resize_embeddings = False
    test_head_masking = False
    test_pruning = False
    test_cpu_offload = False
    test_disk_offload_safetensors = False
    test_disk_offload_bin = False

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        """Prepare inputs via the mixin, then drop the attention / hidden-state output flags."""
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
        inputs_dict.pop("output_attentions", None)
        inputs_dict.pop("output_hidden_states", None)
        return inputs_dict

    def setUp(self):
        self.model_tester = VibeVoiceAcousticTokenizerModelTester(self)
        self.config_tester = ConfigTester(
            self,
            config_class=VibeVoiceAcousticTokenizerConfig,
            common_properties=[],
            has_text_modality=False,
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model_forward(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_forward(*config_and_inputs)

    def test_forward_signature(self):
        """Check the leading parameter names of `forward`."""
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["input_values", "padding_cache", "use_cache", "sample"]
            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)

    @unittest.skip("VibeVoiceAcousticTokenizerModel does not have `inputs_embeds` logic")
    def test_inputs_embeds(self):
        pass

    @unittest.skip("VibeVoiceAcousticTokenizerModel does not have `inputs_embeds` logic")
    def test_model_get_set_embeddings(self):
        pass

    @unittest.skip("VibeVoiceAcousticTokenizerModel does not have the usual `attention` logic")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip("VibeVoiceAcousticTokenizerModel does not have the usual `attention` logic")
    def test_attention_outputs(self):
        pass

    @unittest.skip("VibeVoiceAcousticTokenizerModel does not have the usual `hidden_states` logic")
    def test_hidden_states_output(self):
        pass

    @unittest.skip("VibeVoiceAcousticTokenizerModel has no attribute `hf_device_map`")
    def test_model_parallelism(self):
        pass

    def test_determinism(self):
        """Two forward passes on the same inputs must give `latents` within 1e-5 of each other."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def check_determinism(first, second):
            out_1 = first.cpu().numpy()
            out_2 = second.cpu().numpy()
            # NaN entries are excluded from the comparison
            out_1 = out_1[~np.isnan(out_1)]
            out_2 = out_2[~np.isnan(out_2)]
            max_diff = np.amax(np.abs(out_1 - out_2))
            self.assertLessEqual(max_diff, 1e-5)

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                first = model(**self._prepare_for_class(inputs_dict, model_class)).latents
                second = model(**self._prepare_for_class(inputs_dict, model_class)).latents

            if isinstance(first, tuple) and isinstance(second, tuple):
                for tensor1, tensor2 in zip(first, second):
                    check_determinism(tensor1, tensor2)
            else:
                check_determinism(first, second)

    def test_model_outputs_equivalence(self):
        """Outputs with `return_dict=False` must match the `to_tuple()` of the `return_dict=True` outputs."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        def set_nan_tensor_to_zero(t):
            # `t != t` is True exactly at NaN positions; they are zeroed in place
            t[t != t] = 0
            return t

        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs=None):
            # A `None` default avoids sharing one dict object between calls
            additional_kwargs = {} if additional_kwargs is None else additional_kwargs
            with torch.no_grad():
                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()

                def recursive_check(tuple_object, dict_object):
                    if isinstance(tuple_object, (list, tuple)):
                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
                            recursive_check(tuple_iterable_value, dict_iterable_value)
                    elif isinstance(tuple_object, dict):
                        for tuple_iterable_value, dict_iterable_value in zip(
                            tuple_object.values(), dict_object.values()
                        ):
                            recursive_check(tuple_iterable_value, dict_iterable_value)
                    elif tuple_object is None:
                        return
                    else:
                        self.assertTrue(
                            torch.allclose(
                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
                            ),
                            msg=(
                                "Tuple and dict output are not equal. Difference:"
                                f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
                                f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}."
                                f" Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`:"
                                f" {torch.isinf(dict_object).any()}."
                            ),
                        )

                recursive_check(tuple_output, dict_output)

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
            check_equivalence(model, tuple_inputs, dict_inputs)

    def test_encode_method(self):
        """`encode` must return latents of shape `(batch_size, downsampled_length, hidden_size)`."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
        model = VibeVoiceAcousticTokenizerModel(config=config).to(torch_device).eval()

        audio = inputs_dict["input_values"]
        with torch.no_grad():
            output = model.encode(audio)

        self.assertIsNotNone(output.latents)
        expected_seq_len = self.model_tester.hidden_size // np.prod(self.model_tester.downsampling_ratios)
        self.assertEqual(
            output.latents.shape, (self.model_tester.batch_size, expected_seq_len, self.model_tester.hidden_size)
        )

    def test_decode_method(self):
        """Decoding the encoded latents must return audio with the same shape as the input."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
        model = VibeVoiceAcousticTokenizerModel(config=config).to(torch_device).eval()

        audio = inputs_dict["input_values"]
        with torch.no_grad():
            encode_output = model.encode(audio)
            decode_output = model.decode(encode_output.latents)

        self.assertIsNotNone(decode_output.audio)
        self.assertEqual(
            decode_output.audio.shape,
            (self.model_tester.batch_size, self.model_tester.channels, self.model_tester.hidden_size),
        )

    def test_use_cache(self):
        """A forward pass with `use_cache=True` must populate `padding_cache`, `latents` and `audio`."""
        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
        model = VibeVoiceAcousticTokenizerModel(config=config).to(torch_device).eval()

        input_values = inputs_dict["input_values"]
        with torch.no_grad():
            output = model(input_values, use_cache=True)

        self.assertIsNotNone(output.padding_cache)
        self.assertIsNotNone(output.latents)
        self.assertIsNotNone(output.audio)
class VibeVoiceAcousticTokenizerIntegrationTest(unittest.TestCase):
    """Slow test comparing the pretrained checkpoint's encoder/decoder outputs with stored references."""

    def setUp(self):
        self.sampling_rate = 24000
        self.model_checkpoint = "microsoft/VibeVoice-AcousticTokenizer"

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)

    @slow
    @require_torch
    def test_batch_integration(self):
        """
        Encode and decode two reference voices in one batch and compare against the stored outputs.

        Reproducer which generates JSON of expected outputs:
        https://gist.github.com/ebezzam/507dfd544e0a0f12402966503cbc73e6#file-reproducer_tokenizer-py
        NOTE (ebezzam): had to compute expected outputs on CI runners for passing tests
        """
        dtype = torch.bfloat16

        # Read the reference values from the fixtures file
        tests_root = Path(__file__).parent.parent.parent
        results_file = tests_root / "fixtures/vibevoice/expected_acoustic_tokenizer_results.json"
        with open(results_file, "r") as handle:
            reference = json.load(handle)

        # Use the entry keyed by the current device when present, otherwise the "cuda" entry
        if torch_device in reference:
            per_device = reference[torch_device]
        else:
            per_device = reference["cuda"]
        expected_encoder = torch.tensor(per_device["encoder"]).to(dtype)
        expected_decoder = torch.tensor(per_device["decoder"]).to(dtype)

        # Load both voice samples at the test sampling rate
        audio_paths = [
            "https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/voices/en-Carter_man.wav",
            "https://huggingface.co/datasets/bezzam/vibevoice_samples/resolve/main/voices/en-Frank_man.wav",
        ]
        audio_arrays = [load_audio_librosa(url, sampling_rate=self.sampling_rate) for url in audio_paths]
        feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_checkpoint)

        # Load the checkpoint on the test device, in eval mode
        model = AutoModel.from_pretrained(
            self.model_checkpoint,
            dtype=dtype,
            device_map=torch_device,
        ).eval()

        batch = feature_extractor(audio_arrays, sampling_rate=self.sampling_rate)
        batch = batch.to(torch_device, dtype=dtype)
        with torch.no_grad():
            latents = model.encode(batch["input_values"], sample=False).latents
            decoded_audio = model.decode(latents).audio

        # Flatten the latents per sample, then keep only as many trailing-axis values as the references hold
        flat_latents = latents.reshape(latents.shape[0], -1)
        encoder_slice = flat_latents[..., : expected_encoder.shape[-1]].cpu()
        decoder_slice = decoded_audio[..., : expected_decoder.shape[-1]].cpu()
        torch.testing.assert_close(encoder_slice, expected_encoder, rtol=1e-6, atol=1e-6)
        torch.testing.assert_close(decoder_slice, expected_decoder, rtol=1e-6, atol=1e-6)