first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

0
tests/models/__init__.py Normal file
View File

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,200 @@
# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import pytest
from transformers import is_torch_available
from transformers.testing_utils import cleanup, require_torch, require_torch_accelerator, slow, torch_device
if is_torch_available():
import torch
from transformers import AfmoeForCausalLM, AfmoeModel, AutoTokenizer
from transformers.conversion_mapping import get_model_conversion_mapping
from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
class AfmoeModelTester(CausalLMModelTester):
if is_torch_available():
base_model_class = AfmoeModel
def __init__(
self,
parent,
batch_size=4,
seq_length=12,
is_training=True,
use_input_mask=True,
use_token_type_ids=False,
use_labels=True,
vocab_size=64,
hidden_size=32,
intermediate_size=16,
moe_intermediate_size=16,
num_hidden_layers=2,
num_dense_layers=1,
num_attention_heads=16,
num_key_value_heads=16,
head_dim=128,
hidden_act="silu",
max_position_embeddings=128,
initializer_range=0.02,
rms_norm_eps=1e-5,
use_cache=False,
rope_theta=10000.0,
rope_parameters=None,
num_experts=4,
num_experts_per_tok=2,
num_shared_experts=2,
route_norm=True,
route_scale=1.0,
global_attn_every_n_layers=2,
sliding_window=128,
attention_dropout=0.0,
):
super().__init__(
parent=parent,
batch_size=batch_size,
seq_length=seq_length,
is_training=is_training,
use_input_mask=use_input_mask,
use_token_type_ids=use_token_type_ids,
use_labels=use_labels,
vocab_size=vocab_size,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
)
self.use_cache = use_cache
self.head_dim = head_dim
self.rms_norm_eps = rms_norm_eps
self.rope_theta = rope_theta
self.moe_intermediate_size = moe_intermediate_size
self.num_dense_layers = num_dense_layers
self.num_experts = num_experts
self.num_experts_per_tok = num_experts_per_tok
self.num_shared_experts = num_shared_experts
self.route_norm = route_norm
self.route_scale = route_scale
self.global_attn_every_n_layers = global_attn_every_n_layers
self.sliding_window = sliding_window
self.attention_dropout = attention_dropout
@require_torch
class AfmoeModelTest(CausalLMModelTest, unittest.TestCase):
model_tester_class = AfmoeModelTester
all_model_classes = (AfmoeModel, AfmoeForCausalLM) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": AfmoeModel, "text-generation": AfmoeForCausalLM} if is_torch_available() else {}
)
@unittest.skip("Afmoe applies key/query norm which doesn't work with packing")
def test_eager_padding_matches_padding_free_with_position_ids(self):
pass
@unittest.skip("Afmoe applies key/query norm which doesn't work with packing")
def test_sdpa_padding_matches_padding_free_with_position_ids(self):
pass
@unittest.skip("Afmoe applies key/query norm which doesn't work with packing")
def test_model_rope_scaling_frequencies(self):
pass
@unittest.skip("Afmoe has moe, output can be different")
def test_model_outputs_equivalence(self, **kwargs):
pass
def test_router_logits_without_aux_loss(self):
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.num_dense_layers = 0
config.output_router_logits = True
input_ids = input_dict["input_ids"]
attention_mask = input_ids.ne(1).to(torch_device)
model = AfmoeForCausalLM(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=attention_mask)
self.assertIsNotNone(result.router_logits)
self.assertEqual(result.router_logits[0].shape[-1], config.num_experts)
self.assertIsNone(result.aux_loss)
def test_moe_legacy_conversion_mapping_registered(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = AfmoeModel(config)
weight_mapping = get_model_conversion_mapping(model)
found_fused_expert_converter = any(
"mlp.experts.*.gate_proj.weight" in mapping.source_patterns
and "mlp.experts.gate_up_proj" in mapping.target_patterns
for mapping in weight_mapping
)
self.assertTrue(found_fused_expert_converter)
@require_torch_accelerator
@slow
class AfmoeIntegrationTest(unittest.TestCase):
def tearDown(self):
# See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed.
cleanup(torch_device, gc_collect=False)
@slow
@require_torch_accelerator
@pytest.mark.torch_compile_test
def test_compile_static_cache(self):
num_tokens_to_generate = 24
prompts = [
"Simply put, the theory of relativity states that ",
"My favorite all time favorite condiment is ketchup.",
]
checkpoint = "arcee-ai/trinity-nano-preview"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
model = AfmoeForCausalLM.from_pretrained(checkpoint, device_map=torch_device, dtype=torch.bfloat16)
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=num_tokens_to_generate, do_sample=False)
dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
generated_ids = model.generate(
**inputs,
max_new_tokens=num_tokens_to_generate,
do_sample=False,
cache_implementation="static",
)
static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(dynamic_text, static_text)
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
generated_ids = model.generate(
**inputs,
max_new_tokens=num_tokens_to_generate,
do_sample=False,
cache_implementation="static",
)
static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(dynamic_text, static_compiled_text)

View File

View File

@@ -0,0 +1,569 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch AIMv2 model."""
import inspect
import tempfile
import unittest
import numpy as np
import requests
from parameterized import parameterized
from transformers import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig
from transformers.testing_utils import (
is_flaky,
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.utils import (
is_torch_available,
is_vision_available,
)
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
ModelTesterMixin,
_test_eager_matches_sdpa_inference,
floats_tensor,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from torch import nn
from transformers import (
Aimv2Model,
Aimv2TextModel,
Aimv2VisionModel,
)
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor, AutoProcessor
class Aimv2VisionModelTester:
def __init__(
self,
parent,
batch_size=12,
image_size=30,
patch_size=2,
num_channels=3,
is_training=False,
hidden_size=32,
projection_dim=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.is_training = is_training
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
num_patches = (image_size // patch_size) ** 2
self.seq_length = num_patches
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values
def get_config(self):
return Aimv2VisionConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
)
def create_and_check_model(self, config, pixel_values):
model = Aimv2VisionModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(pixel_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
class Aimv2ModelTesterMixin(ModelTesterMixin):
"""
Subclass of ModelTesterMixin with methods specific to testing Aimv2 models.
The SDPA equivalence test is overridden here because Aimv2 models may have test/vision/text+vision inputs,
different output logits, and are not supposed to be used or tested with padding_side="left".
"""
def test_sdpa_can_dispatch_composite_models(self):
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
# Load the model with SDPA
model_sdpa = model_class.from_pretrained(tmpdirname)
# Load model with eager attention
model_eager = model_class.from_pretrained(
tmpdirname,
attn_implementation="eager",
)
model_eager = model_eager.eval().to(torch_device)
if hasattr(model_sdpa, "vision_model"):
self.assertTrue(model_sdpa.vision_model.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
if hasattr(model_sdpa, "text_model"):
self.assertTrue(model_sdpa.text_model.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.text_model.config._attn_implementation == "eager")
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.config._attn_implementation == "eager")
@require_torch
class Aimv2VisionModelTest(Aimv2ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as Aimv2 does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (Aimv2VisionModel,) if is_torch_available() else ()
test_resize_embeddings = False
def setUp(self):
self.model_tester = Aimv2VisionModelTester(self)
self.config_tester = ConfigTester(
self, config_class=Aimv2VisionConfig, has_text_modality=False, hidden_size=32
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="Aimv2 does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_get_set_embeddings(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
class Aimv2TextModelTester:
def __init__(
self,
parent,
batch_size=12,
seq_length=7,
is_training=False,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
projection_dim=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=25,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.max_position_embeddings = max_position_embeddings
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
if input_mask is not None:
batch_size, seq_length = input_mask.shape
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
for batch_idx, start_index in enumerate(rnd_start_indices):
input_mask[batch_idx, :start_index] = 1
input_mask[batch_idx, start_index:] = 0
config = self.get_config()
return config, input_ids, input_mask
def get_config(self):
return Aimv2TextConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
max_position_embeddings=self.max_position_embeddings,
)
def create_and_check_model(self, config, input_ids, input_mask):
model = Aimv2TextModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, input_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class Aimv2TextModelTest(Aimv2ModelTesterMixin, unittest.TestCase):
all_model_classes = (Aimv2TextModel,) if is_torch_available() else ()
test_resize_embeddings = False
def setUp(self):
self.model_tester = Aimv2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=Aimv2TextConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="Aimv2 does not use inputs_embeds")
def test_inputs_embeds(self):
pass
class Aimv2ModelTester:
def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=False):
if text_kwargs is None:
text_kwargs = {}
if vision_kwargs is None:
vision_kwargs = {}
self.parent = parent
self.text_model_tester = Aimv2TextModelTester(parent, **text_kwargs)
self.vision_model_tester = Aimv2VisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training
def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
config = self.get_config()
return config, input_ids, attention_mask, pixel_values
def get_config(self):
return Aimv2Config(
text_config=self.text_model_tester.get_config(),
vision_config=self.vision_model_tester.get_config(),
projection_dim=64,
)
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
model = Aimv2Model(config).to(torch_device).eval()
with torch.no_grad():
result = model(input_ids, pixel_values, attention_mask)
self.parent.assertEqual(
result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
)
self.parent.assertEqual(
result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask, pixel_values = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"pixel_values": pixel_values,
}
return config, inputs_dict
@require_torch
class Aimv2ModelTest(Aimv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
additional_model_inputs = ["pixel_values"]
all_model_classes = (Aimv2Model,) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": Aimv2Model, "image-feature-extraction": Aimv2VisionModel}
if is_torch_available()
else {}
)
test_resize_embeddings = False
test_attention_outputs = False
_is_composite = True
def setUp(self):
self.model_tester = Aimv2ModelTester(self)
common_properties = ["projection_dim", "logit_scale_init_value"]
self.config_tester = ConfigTester(
self, config_class=Aimv2Config, has_text_modality=False, common_properties=common_properties
)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
print(config_and_inputs)
self.model_tester.create_and_check_model(*config_and_inputs)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="Hidden_states is tested in individual model tests")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
@unittest.skip(reason="Aimv2Model does not have input/output embeddings")
def test_model_get_set_embeddings(self):
pass
@unittest.skip("Size mismatch on CUDA")
def test_multi_gpu_data_parallel_forward(self):
pass
def test_load_vision_text_config(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Save Aimv2Config and check if we can load Aimv2VisionConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
vision_config = Aimv2VisionConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
# Save Aimv2Config and check if we can load Aimv2TextConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
text_config = Aimv2TextConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
@parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
@is_flaky(
max_attempts=2,
description="sdpa gets nan values in some places while eager is fine. Except those places, the values are close",
)
def test_eager_matches_sdpa_inference(
self,
name,
dtype,
padding_side,
use_attention_mask,
output_attentions,
enable_kernels,
):
"We need to relax a bit the `atols` for fp32 here due to the altup projections"
atols = {
("cpu", False, torch.float32): 1e-6,
("cpu", False, torch.float16): 5e-3,
("cpu", False, torch.bfloat16): 3e-2, # this was relaxed
("cpu", True, torch.float32): 1e-6,
("cpu", True, torch.float16): 5e-3,
("cpu", True, torch.bfloat16): 3e-2, # this was relaxed
("cuda", False, torch.float32): 1e-6,
("cuda", False, torch.bfloat16): 3e-2, # this was relaxed
("cuda", False, torch.float16): 5e-3,
("cuda", True, torch.float32): 1e-6,
("cuda", True, torch.bfloat16): 3e-2, # this was relaxed
("cuda", True, torch.float16): 5e-3,
}
_test_eager_matches_sdpa_inference(
self, name, dtype, padding_side, use_attention_mask, output_attentions, enable_kernels, atols=atols
)
@require_vision
@require_torch
class Aimv2ModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "apple/aimv2-large-patch14-224-lit"
model = Aimv2Model.from_pretrained(model_name, device_map=torch_device)
processor = AutoProcessor.from_pretrained(model_name)
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(
text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
).to(model.device)
# Forward pass
with torch.no_grad():
outputs = model(**inputs)
# Verify the logits
self.assertEqual(
outputs.logits_per_image.shape,
torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
)
self.assertEqual(
outputs.logits_per_text.shape,
torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
)
# handle device
expected_logits = torch.tensor([[33.3550, 26.4255]]).to(model.device)
torch.testing.assert_close(outputs.logits_per_image, expected_logits, atol=1e-3, rtol=1e-3)
@require_vision
@require_torch
class Aimv2VisionModelIntegrationTests(unittest.TestCase):
@slow
def test_inference(self):
model_name = "apple/aimv2-large-patch14-224"
model = Aimv2VisionModel.from_pretrained(model_name, device_map=torch_device)
processor = AutoImageProcessor.from_pretrained(model_name)
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(image, return_tensors="pt").to(model.device)
with torch.no_grad():
output = model(**inputs)
# Verify logits shape
self.assertEqual(output.last_hidden_state.shape, torch.Size([1, 256, 1024]))
# Verify logits slice
# fmt: off
expected_logits = torch.tensor(
[[ 0.0510, 0.0806, -0.0990, -0.0154],
[ 2.7850, -2.5143, -0.3320, 2.4196],
[ 2.8179, -2.4089, -0.2770, 2.3218],
[ 2.7641, -2.4114, -0.3684, 2.2998],
[ 2.7972, -2.3180, -0.4490, 2.2302],
[ 2.8584, -2.5322, -0.2302, 2.4936],
[-2.7849, 2.4121, 1.3670, -1.5514]]).to(model.device)
# fmt: on
output_slice = output.last_hidden_state.squeeze(0)[0:7, 0:4]
self.assertTrue(torch.allclose(output_slice, expected_logits, atol=1e-3))
@slow
def test_inference_for_native_resolution(self):
model_name = "apple/aimv2-large-patch14-native"
model = Aimv2VisionModel.from_pretrained(model_name, device_map="auto")
processor = AutoImageProcessor.from_pretrained(model_name)
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(image, return_tensors="pt").to(model.device)
with torch.no_grad():
output = model(**inputs)
# Verify logits shape
self.assertEqual(output.last_hidden_state.shape, torch.Size([1, 1530, 1024]))
# Verify logits slice
# fmt: off
expected_logits = torch.tensor(
[[-1.3342, 0.3720, 0.0963, 0.4159],
[-1.5328, 0.4677, 0.0936, 0.4321],
[-0.3775, -0.2758, -0.0803, -0.5367],
[-1.3877, 0.5561, -1.9064, -1.1766],
[-0.5148, 0.0108, -0.4515, -0.6402],
[-0.3400, -0.1711, -0.1855, -0.4219],
[-1.2877, -0.0585, -0.1646, 0.7420]]).to(model.device)
# fmt: on
output_slice = output.last_hidden_state.squeeze(0)[0:7, 0:4]
self.assertTrue(torch.allclose(output_slice, expected_logits, atol=1e-3))

View File

View File

@@ -0,0 +1,327 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import AlbertConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
MODEL_FOR_PRETRAINING_MAPPING,
AlbertForMaskedLM,
AlbertForMultipleChoice,
AlbertForPreTraining,
AlbertForQuestionAnswering,
AlbertForSequenceClassification,
AlbertForTokenClassification,
AlbertModel,
)
class AlbertModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=32,
embedding_size=8,
hidden_size=16,
num_hidden_layers=2,
# this needs to be the same as `num_hidden_layers`!
num_hidden_groups=2,
num_attention_heads=4,
intermediate_size=20,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=8,
type_vocab_size=2,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_hidden_groups = num_hidden_groups
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return AlbertConfig(
vocab_size=self.vocab_size,
embedding_size=self.embedding_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
num_hidden_groups=self.num_hidden_groups,
inner_group_num=1,
)
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = AlbertModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = AlbertForPreTraining(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
sentence_order_label=sequence_labels,
)
self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, config.num_labels))
def create_and_check_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = AlbertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = AlbertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = AlbertForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_choices = self.num_choices
model = AlbertForMultipleChoice(config=config)
model.to(torch_device)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class AlbertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
AlbertModel,
AlbertForPreTraining,
AlbertForMaskedLM,
AlbertForMultipleChoice,
AlbertForSequenceClassification,
AlbertForTokenClassification,
AlbertForQuestionAnswering,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{
"feature-extraction": AlbertModel,
"fill-mask": AlbertForMaskedLM,
"text-classification": AlbertForSequenceClassification,
"token-classification": AlbertForTokenClassification,
"zero-shot": AlbertForSequenceClassification,
}
if is_torch_available()
else {}
)
# special case for ForPreTraining model
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
if return_labels:
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
)
inputs_dict["sentence_order_label"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
return inputs_dict
def setUp(self):
self.model_tester = AlbertModelTester(self)
self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_pretraining(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model_name = "albert/albert-base-v1"
model = AlbertModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_torch
class AlbertModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_no_head_absolute_embedding(self):
model = AlbertModel.from_pretrained("albert/albert-base-v2")
input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
with torch.no_grad():
output = model(input_ids, attention_mask=attention_mask)[0]
expected_shape = torch.Size((1, 11, 768))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[-0.6513, 1.5035, -0.2766], [-0.6515, 1.5046, -0.2780], [-0.6512, 1.5049, -0.2784]]]
)
torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4)

View File

@@ -0,0 +1,56 @@
# Copyright 2019 Hugging Face inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import AlbertTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers
from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
from ...test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
@require_sentencepiece
@require_tokenizers
class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "albert/albert-base-v1"
tokenizer_class = AlbertTokenizer
# Integration test data - expected outputs for the default input string
integration_expected_tokens = ['▁this', '▁is', '▁a', '▁test', '', '😊', '▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁false', '.', '', '生活的真谛是', '▁hi', '▁hello', '▁hi', '▁hello', '▁hello', '', '<', 's', '>', '▁hi', '<', 's', '>', 'there', '▁the', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁hello', '.', '▁but', '▁i', 'rd', '▁and', '', '', '▁i', 'rd', '', '', '▁hey', '▁how', '▁are', '▁you', '▁doing'] # fmt: skip
integration_expected_token_ids = [48, 25, 21, 1289, 13, 1, 31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 4997, 9, 13, 1, 4148, 10975, 4148, 10975, 10975, 13, 1, 18, 1, 4148, 1, 18, 1, 1887, 14, 249, 3724, 378, 44, 7428, 13665, 45, 10975, 9, 47, 31, 897, 17, 13, 1, 31, 897, 13, 1, 8409, 184, 50, 42, 845] # fmt: skip
integration_expected_decoded_text = "this is a test <unk> i was born in 92000, and this is false. <unk> hi hello hi hello hello <unk>s<unk> hi<unk>s<unk>there the following string should be properly encoded: hello. but ird and <unk> ird <unk> hey how are you doing"
@classmethod
def setUpClass(cls):
super().setUpClass()
from_pretrained_id = "albert/albert-base-v1"
tokenizer = AlbertTokenizer.from_pretrained(from_pretrained_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(cls.tmpdirname)
# Build backend for slow tokenizer from the fast tokenizer's SentencePiece model
vocab_file = getattr(tokenizer, "vocab_file", None)
extractor = SentencePieceExtractor(vocab_file)
vocab_ids, vocab_scores, merges = extractor.extract()
tokenizer_from_vocab = AlbertTokenizer(vocab=vocab_scores, merges=merges)
tokenizer_from_vocab.pad_token = tokenizer_from_vocab.eos_token
cls.tokenizers = [tokenizer, tokenizer_from_vocab]

View File

View File

@@ -0,0 +1,549 @@
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch ALIGN model."""
import inspect
import math
import tempfile
import unittest
import requests
from transformers import AlignConfig, AlignProcessor, AlignTextConfig, AlignVisionConfig
from transformers.testing_utils import (
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
AlignModel,
AlignTextModel,
AlignVisionModel,
)
if is_vision_available():
from PIL import Image
class AlignVisionModelTester:
def __init__(
self,
parent,
batch_size=12,
image_size=32,
num_channels=3,
depth_coefficient=3.1,
kernel_sizes=[3, 3, 5],
in_channels=[32, 16, 24],
out_channels=[16, 24, 30],
hidden_dim=64,
strides=[1, 1, 2],
num_block_repeats=[1, 1, 2],
expand_ratios=[1, 6, 6],
is_training=True,
hidden_act="gelu",
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.num_channels = num_channels
self.depth_coefficient = depth_coefficient
self.kernel_sizes = kernel_sizes
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_dim = hidden_dim
self.strides = strides
self.num_block_repeats = num_block_repeats
self.expand_ratios = expand_ratios
self.is_training = is_training
self.hidden_act = hidden_act
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values
def get_config(self):
return AlignVisionConfig(
num_channels=self.num_channels,
depth_coefficient=self.depth_coefficient,
kernel_sizes=self.kernel_sizes,
in_channels=self.in_channels,
out_channels=self.out_channels,
hidden_dim=self.hidden_dim,
strides=self.strides,
num_block_repeats=self.num_block_repeats,
expand_ratios=self.expand_ratios,
hidden_act=self.hidden_act,
)
def create_and_check_model(self, config, pixel_values):
model = AlignVisionModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(pixel_values)
patch_size = self.image_size // 4
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size, config.hidden_dim, patch_size, patch_size)
)
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, config.hidden_dim))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class AlignVisionModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as ALIGN does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (AlignVisionModel,) if is_torch_available() else ()
test_resize_embeddings = False
has_attentions = False
def setUp(self):
self.model_tester = AlignVisionModelTester(self)
self.config_tester = ConfigTester(
self,
config_class=AlignVisionConfig,
has_text_modality=False,
hidden_size=32,
common_properties=["num_channels", "image_size"],
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="AlignVisionModel does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="AlignVisionModel does not use inputs_embeds")
def test_inputs_embeds_matches_input_ids(self):
pass
@unittest.skip(reason="AlignVisionModel does not support input and output embeddings")
def test_model_get_set_embeddings(self):
pass
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
num_blocks = sum(config.num_block_repeats) * 4
self.assertEqual(len(hidden_states), num_blocks)
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[self.model_tester.image_size // 2, self.model_tester.image_size // 2],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
@unittest.skip
def test_training(self):
pass
@unittest.skip
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant(self):
pass
@unittest.skip(
reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@slow
def test_model_from_pretrained(self):
model_name = "kakaobrain/align-base"
model = AlignVisionModel.from_pretrained(model_name)
self.assertIsNotNone(model)
class AlignTextModelTester:
def __init__(
self,
parent,
batch_size=12,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask
def get_config(self):
return AlignTextConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, input_ids, token_type_ids, input_mask):
model = AlignTextModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class AlignTextModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (AlignTextModel,) if is_torch_available() else ()
def setUp(self):
self.model_tester = AlignTextModelTester(self)
self.config_tester = ConfigTester(self, config_class=AlignTextConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="This module does not support standalone training")
def test_training(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing_use_reentrant_true(self):
pass
@unittest.skip(reason="ALIGN does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Align does not use inputs_embeds")
def test_inputs_embeds_matches_input_ids(self):
pass
@slow
def test_model_from_pretrained(self):
model_name = "kakaobrain/align-base"
model = AlignTextModel.from_pretrained(model_name)
self.assertIsNotNone(model)
class AlignModelTester:
def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
if text_kwargs is None:
text_kwargs = {}
if vision_kwargs is None:
vision_kwargs = {}
self.parent = parent
self.text_model_tester = AlignTextModelTester(parent, **text_kwargs)
self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training
def prepare_config_and_inputs(self):
test_config, input_ids, token_type_ids, input_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, pixel_values
def get_config(self):
return AlignConfig(
text_config=self.text_model_tester.get_config().to_dict(),
vision_config=self.vision_model_tester.get_config().to_dict(),
projection_dim=64,
)
def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values):
model = AlignModel(config).to(torch_device).eval()
with torch.no_grad():
result = model(input_ids, pixel_values, attention_mask, token_type_ids)
self.parent.assertEqual(
result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
)
self.parent.assertEqual(
result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, token_type_ids, input_mask, pixel_values = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"token_type_ids": token_type_ids,
"attention_mask": input_mask,
"pixel_values": pixel_values,
"return_loss": True,
}
return config, inputs_dict
@require_torch
class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (AlignModel,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": AlignModel} if is_torch_available() else {}
test_resize_embeddings = False
test_attention_outputs = False
has_attentions = False
skip_test_image_features_output_shape = True # Align uses index -3 for hidden_size instead of -1
def setUp(self):
self.model_tester = AlignModelTester(self)
self.config_tester = ConfigTester(
self,
config_class=AlignConfig,
has_text_modality=False,
common_properties=["projection_dim", "temperature_init_value"],
)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_config(self):
self.config_tester.run_common_tests()
def test_batching_equivalence(self, atol=3e-4, rtol=3e-4):
super().test_batching_equivalence(atol=atol, rtol=rtol)
@unittest.skip(reason="Start to fail after using torch `cu118`.")
def test_multi_gpu_data_parallel_forward(self):
pass
@unittest.skip(reason="Hidden_states is tested in individual model tests")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="Inputs_embeds is tested in individual model tests")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Align does not use inputs_embeds")
def test_inputs_embeds_matches_input_ids(self):
pass
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
@unittest.skip(reason="AlignModel does not have input/output embeddings")
def test_model_get_set_embeddings(self):
pass
def test_load_vision_text_config(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Save AlignConfig and check if we can load AlignVisionConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
vision_config = AlignVisionConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
# Save AlignConfig and check if we can load AlignTextConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
text_config = AlignTextConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
def _image_features_get_expected_num_attentions(self, model_tester=None):
return sum(
math.ceil(self.model_tester.vision_model_tester.depth_coefficient * repeat)
for repeat in self.model_tester.vision_model_tester.num_block_repeats
)
@slow
def test_model_from_pretrained(self):
model_name = "kakaobrain/align-base"
model = AlignModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@require_vision
@require_torch
class AlignModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "kakaobrain/align-base"
model = AlignModel.from_pretrained(model_name).to(torch_device)
processor = AlignProcessor.from_pretrained(model_name)
image = prepare_img()
texts = ["a photo of a cat", "a photo of a dog"]
inputs = processor(images=image, text=texts, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
self.assertEqual(
outputs.logits_per_image.shape,
torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
)
self.assertEqual(
outputs.logits_per_text.shape,
torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
)
expected_logits = torch.tensor([[9.7093, 3.4679]], device=torch_device)
torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3)

View File

@@ -0,0 +1,69 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import AlignProcessor
@require_vision
class AlignProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = AlignProcessor
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"want",
"##want",
"##ed",
"wa",
"un",
"runn",
"##ing",
",",
"low",
"lowest",
]
vocab_file = f"{cls.tmpdirname}/vocab.txt"
with open(vocab_file, "w", encoding="utf-8") as f:
f.write("\n".join(vocab_tokens))
tokenizer = tokenizer_class(vocab_file)
return tokenizer
@classmethod
def _setup_image_processor(cls):
image_processor_class = cls._get_component_class_from_processor("image_processor")
image_processor = image_processor_class(
do_resize=True,
size=20,
do_normalize=True,
image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[0.26862954, 0.26130258, 0.27577711],
)
return image_processor

View File

View File

@@ -0,0 +1,545 @@
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch AltCLIP model."""
import inspect
import unittest
import numpy as np
import requests
from parameterized import parameterized
from transformers import AltCLIPConfig, AltCLIPProcessor, AltCLIPTextConfig, AltCLIPVisionConfig
from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
ModelTesterMixin,
floats_tensor,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
import torch.nn as nn
from transformers import AltCLIPModel, AltCLIPTextModel, AltCLIPVisionModel
if is_vision_available():
from PIL import Image
class AltCLIPVisionModelTester:
def __init__(
self,
parent,
batch_size=12,
image_size=30,
patch_size=2,
num_channels=3,
is_training=True,
hidden_size=32,
projection_dim=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.is_training = is_training
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.scope = scope
# in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
num_patches = (image_size // patch_size) ** 2
self.seq_length = num_patches + 1
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values
def get_config(self):
return AltCLIPVisionConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, pixel_values):
model = AltCLIPVisionModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(pixel_values)
# expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
image_size = (self.image_size, self.image_size)
patch_size = (self.patch_size, self.patch_size)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class AltCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (AltCLIPVisionModel,) if is_torch_available() else ()
test_resize_embeddings = False
def setUp(self):
self.model_tester = AltCLIPVisionModelTester(self)
self.config_tester = ConfigTester(
self, config_class=AltCLIPVisionConfig, has_text_modality=False, hidden_size=36
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="CLIP does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_get_set_embeddings(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="This module does not support standalone training")
def test_training(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing_use_reentrant_true(self):
pass
@unittest.skip(reason="AltCLIPVisionModel use the same cv backbone with CLIP model.")
def test_model_from_pretrained(self):
pass
class AltCLIPTextModelTester:
def __init__(
self,
parent,
batch_size=12,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
projection_dim=32,
project_dim=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=512,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.project_dim = project_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
if input_mask is not None:
batch_size, seq_length = input_mask.shape
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
for batch_idx, start_index in enumerate(rnd_start_indices):
input_mask[batch_idx, :start_index] = 1
input_mask[batch_idx, start_index:] = 0
config = self.get_config()
return config, input_ids, input_mask
def get_config(self):
return AltCLIPTextConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
project_dim=self.project_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
pad_token_id=1,
)
def create_and_check_model(self, config, input_ids, input_mask):
model = AltCLIPTextModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, input_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (AltCLIPTextModel,) if is_torch_available() else ()
# AltCLIPTextModel has large embeddings relative to model size, so we need higher split percentages
model_split_percents = [0.5, 0.8, 0.9]
# TODO (@SunMarc): Fix me
@unittest.skip(reason="It's broken.")
def test_resize_tokens_embeddings(self):
super().test_resize_tokens_embeddings()
def setUp(self):
self.model_tester = AltCLIPTextModelTester(self)
self.config_tester = ConfigTester(self, config_class=AltCLIPTextConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="This module does not support standalone training")
def test_training(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip(reason="This module does not support standalone training")
def test_training_gradient_checkpointing_use_reentrant_true(self):
pass
def test_model_outputs_equivalence(self):
pass
@unittest.skip(reason="Result of the model is a dict")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="AltCLIP does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@slow
def test_model_from_pretrained(self):
model_name = "BAAI/AltCLIP"
model = AltCLIPTextModel.from_pretrained(model_name)
self.assertIsNotNone(model)
class AltCLIPModelTester:
def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
if text_kwargs is None:
text_kwargs = {}
if vision_kwargs is None:
vision_kwargs = {}
self.parent = parent
self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs)
self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs)
self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
self.is_training = is_training
def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
config = self.get_config()
return config, input_ids, attention_mask, pixel_values
def get_config(self):
return AltCLIPConfig(
text_config=self.text_model_tester.get_config().to_dict(),
vision_config=self.vision_model_tester.get_config().to_dict(),
projection_dim=64,
)
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
model = AltCLIPModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
model(input_ids, pixel_values, attention_mask)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask, pixel_values = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"pixel_values": pixel_values,
"return_loss": True,
}
return config, inputs_dict
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@require_torch
class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (AltCLIPModel,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": AltCLIPModel} if is_torch_available() else {}
test_resize_embeddings = False
test_attention_outputs = False
additional_model_inputs = ["pixel_values"]
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self,
pipeline_test_case_name,
config_class,
model_architecture,
tokenizer_name,
image_processor_name,
feature_extractor_name,
processor_name,
):
if pipeline_test_case_name == "FeatureExtractionPipelineTests":
return True
return False
def setUp(self):
self.model_tester = AltCLIPModelTester(self)
self.config_tester = ConfigTester(
self,
config_class=AltCLIPConfig,
has_text_modality=False,
common_properties=["projection_dim", "logit_scale_init_value"],
)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_config(self):
self.config_tester.run_common_tests()
@parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
@slow
@is_flaky()
def test_eager_matches_sdpa_inference(self, *args):
# adding only flaky decorator here and call the parent test method
return getattr(ModelTesterMixin, self._testMethodName)(self)
@unittest.skip(reason="Hidden_states is tested in individual model tests")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="Inputs_embeds is tested in individual model tests")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
@unittest.skip(reason="CLIPModel does not have input/output embeddings")
def test_model_get_set_embeddings(self):
pass
@slow
def test_model_from_pretrained(self):
model_name = "BAAI/AltCLIP"
model = AltCLIPModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_vision
@require_torch
class AltCLIPModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "BAAI/AltCLIP"
model = AltCLIPModel.from_pretrained(model_name).to(torch_device)
processor = AltCLIPProcessor.from_pretrained(model_name)
image = prepare_img()
inputs = processor(text=["一张猫的照片", "一张狗的照片"], images=image, padding=True, return_tensors="pt").to(torch_device) # fmt: skip
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
self.assertEqual(
outputs.logits_per_image.shape,
torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
)
self.assertEqual(
outputs.logits_per_text.shape,
torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
)
probs = outputs.logits_per_image.softmax(dim=1)
expected_probs = torch.tensor([[9.9942e-01, 5.7805e-04]], device=torch_device)
torch.testing.assert_close(probs, expected_probs, rtol=5e-3, atol=5e-3)
@slow
def test_inference_interpolate_pos_encoding(self):
# ViT models have an `interpolate_pos_encoding` argument in their forward method,
# allowing to interpolate the pre-trained position embeddings in order to use
# the model on higher resolutions. The DINO model by Facebook AI leverages this
# to visualize self-attention on higher resolution images.
model_name = "BAAI/AltCLIP"
model = AltCLIPModel.from_pretrained(model_name).to(torch_device)
image_processor = AltCLIPProcessor.from_pretrained(
model_name, size={"shortest_edge": 180}, crop_size={"height": 180, "width": 180}
)
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
# interpolate_pos_encodiung false should return value error
with self.assertRaises(ValueError, msg="doesn't match model"):
with torch.no_grad():
model(**inputs, interpolate_pos_encoding=False)
# forward pass
with torch.no_grad():
outputs = model(**inputs, interpolate_pos_encoding=True)
# verify the logits
expected_shape = torch.Size((1, 145, 1024))
print("nilesh ")
print(outputs.vision_model_output.last_hidden_state.shape)
print(outputs.vision_model_output.last_hidden_state[0, :3, :3])
self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
expected_slice = torch.tensor(
[
[-0.3577, -0.5977, 0.3555],
[0.4544, 0.1660, 0.6583],
[1.1715, -0.4870, 0.1645],
]
).to(torch_device)
torch.testing.assert_close(
outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4
)

View File

@@ -0,0 +1,27 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import AltCLIPProcessor
from transformers.testing_utils import require_vision
from ...test_processing_common import ProcessorTesterMixin
@require_vision
class AltClipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = AltCLIPProcessor
model_id = "BAAI/AltCLIP"

View File

View File

@@ -0,0 +1,66 @@
# Copyright 2025 The HuggingFace Inc. team and the Swiss AI Initiative. All rights reserved.
#
# This code is based on HuggingFace's LLaMA implementation in this library.
# It has been modified from its original forms to accommodate minor architectural
# differences compared to LLaMA used by the Swiss AI Initiative that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Apertus model."""
import unittest
from transformers import is_torch_available
from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
slow,
)
from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
if is_torch_available():
from transformers import (
ApertusForCausalLM,
ApertusModel,
)
class ApertusModelTester(CausalLMModelTester):
if is_torch_available():
base_model_class = ApertusModel
def __init__(self, parent):
super().__init__(parent=parent)
# NOTE(3outeille): must be 0.0 for TP backward tests. In train mode, non-zero dropout causes
# different RNG states between the non-TP and TP model forward passes (they run sequentially),
# leading to different dropout masks and mismatched losses.
self.attention_probs_dropout_prob = 0.0
@require_torch
class ApertusModelTest(CausalLMModelTest, unittest.TestCase):
model_tester_class = ApertusModelTester
# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
model_split_percents = [0.5, 0.7, 0.8]
# used in `test_torch_compile_for_training`
_torch_compile_train_cls = ApertusForCausalLM if is_torch_available() else None
@require_torch_accelerator
@slow
class ApertusIntegrationTest(unittest.TestCase):
pass

View File

View File

@@ -0,0 +1,124 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Arcee model."""
import unittest
from pytest import mark
from transformers import AutoTokenizer, is_torch_available
from transformers.testing_utils import (
require_flash_attn,
require_torch,
require_torch_accelerator,
slow,
)
from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
if is_torch_available():
import torch
from transformers import (
ArceeConfig,
ArceeForCausalLM,
ArceeModel,
)
class ArceeModelTester(CausalLMModelTester):
if is_torch_available():
base_model_class = ArceeModel
@require_torch
class ArceeModelTest(CausalLMModelTest, unittest.TestCase):
model_tester_class = ArceeModelTester
# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
model_split_percents = [0.5, 0.7, 0.8]
# used in `test_torch_compile_for_training`
_torch_compile_train_cls = ArceeForCausalLM if is_torch_available() else None
def test_arcee_mlp_uses_relu_squared(self):
"""Test that ArceeMLP uses ReLU² activation instead of SiLU."""
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
config.hidden_act = "relu2" # Ensure we're using relu2 activation
model = ArceeModel(config)
# Check that the MLP layers use the correct activation
mlp = model.layers[0].mlp
# Test with a simple input
x = torch.randn(1, 10, config.hidden_size)
up_output = mlp.up_proj(x)
# Verify ReLU² activation: x * relu(x)
expected_activation = up_output * torch.relu(up_output)
actual_activation = mlp.act_fn(up_output)
self.assertTrue(torch.allclose(expected_activation, actual_activation, atol=1e-5))
@require_torch_accelerator
class ArceeIntegrationTest(unittest.TestCase):
def tearDown(self):
import gc
gc.collect()
torch.cuda.empty_cache()
@slow
def test_model_from_pretrained(self):
# This test would be enabled once a pretrained model is available
# For now, we just test that the model can be instantiated
config = ArceeConfig()
model = ArceeForCausalLM(config)
self.assertIsInstance(model, ArceeForCausalLM)
@mark.skip(reason="Model is not currently public - will update test post release")
@slow
def test_model_generation(self):
EXPECTED_TEXT_COMPLETION = (
"""Once upon a time,In a village there was a farmer who had three sons. The farmer was very old and he"""
)
prompt = "Once upon a time"
tokenizer = AutoTokenizer.from_pretrained("arcee-ai/model-id")
model = ArceeForCausalLM.from_pretrained("arcee-ai/model-id", device_map="auto")
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
generated_ids = model.generate(input_ids, max_new_tokens=20)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@mark.skip(reason="Model is not currently public - will update test post release")
@slow
@require_flash_attn
@mark.flash_attn_test
def test_model_generation_flash_attn(self):
EXPECTED_TEXT_COMPLETION = (
" the food, the people, and the overall experience. I would definitely recommend this place to others."
)
prompt = "This is a nice place. " * 1024 + "I really enjoy the scenery,"
tokenizer = AutoTokenizer.from_pretrained("arcee-ai/model-id")
model = ArceeForCausalLM.from_pretrained(
"arcee-ai/model-id", device_map="auto", attn_implementation="flash_attention_2", dtype="auto"
)
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
generated_ids = model.generate(input_ids, max_new_tokens=20)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text[len(prompt) :])

View File

View File

@@ -0,0 +1,304 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.image_utils import PILImageResampling
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin
if is_vision_available():
from PIL import Image
if is_torch_available():
import torch
class AriaImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_images=1,
min_resolution=30,
max_resolution=40,
size=None,
max_image_size=980,
min_image_size=336,
split_resolutions=None,
split_image=True,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_convert_rgb=True,
resample=PILImageResampling.BICUBIC,
):
self.size = size if size is not None else {"longest_edge": max_resolution}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.num_images = num_images
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.resample = resample
self.max_image_size = max_image_size
self.min_image_size = min_image_size
self.split_resolutions = split_resolutions if split_resolutions is not None else [[980, 980]]
self.split_image = split_image
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"image_mean": self.image_mean,
"image_std": self.image_std,
"max_image_size": self.max_image_size,
"min_image_size": self.min_image_size,
"split_resolutions": self.split_resolutions,
"split_image": self.split_image,
"do_convert_rgb": self.do_convert_rgb,
"do_normalize": self.do_normalize,
"resample": self.resample,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to AriaImageProcessor,
assuming do_resize is set to True. The expected size in that case the max image size.
"""
return self.max_image_size, self.max_image_size
def expected_output_image_shape(self, images):
height, width = self.get_expected_values(images, batched=True)
return self.num_channels, height, width
def prepare_image_inputs(
self,
batch_size=None,
min_resolution=None,
max_resolution=None,
num_channels=None,
num_images=None,
size_divisor=None,
equal_resolution=False,
numpify=False,
torchify=False,
):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
One can specify whether the images are of the same resolution or not.
"""
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
batch_size = batch_size if batch_size is not None else self.batch_size
min_resolution = min_resolution if min_resolution is not None else self.min_resolution
max_resolution = max_resolution if max_resolution is not None else self.max_resolution
num_channels = num_channels if num_channels is not None else self.num_channels
num_images = num_images if num_images is not None else self.num_images
images_list = []
for i in range(batch_size):
images = []
for j in range(num_images):
if equal_resolution:
width = height = max_resolution
else:
# To avoid getting image width/height 0
if size_divisor is not None:
# If `size_divisor` is defined, the image needs to have width/size >= `size_divisor`
min_resolution = max(size_divisor, min_resolution)
width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8))
images_list.append(images)
if not numpify and not torchify:
# PIL expects the channel dimension as last dimension
images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list]
if torchify:
images_list = [[torch.from_numpy(image) for image in images] for images in images_list]
if numpify:
# Numpy images are typically in channels last format
images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list]
return images_list
@require_torch
@require_vision
class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.image_processor_tester = AriaImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "max_image_size"))
self.assertTrue(hasattr(image_processing, "min_image_size"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "split_image"))
def test_call_numpy(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
for image in sample_images:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_numpy_4_channels(self):
# Aria always processes images as RGB, so it always returns images with 3 channels
for image_processing_class in self.image_processing_classes.values():
image_processor_dict = self.image_processor_dict
image_processing = image_processing_class(**image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for sample_images in image_inputs:
for image in sample_images:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pil(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
def test_call_pytorch(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
for images in image_inputs:
for image in images:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
def test_pad_for_patching(self):
for backend_name, image_processing_class in self.image_processing_classes.items():
numpify = backend_name == "pil"
torchify = backend_name == "torchvision"
image_processing = image_processing_class(**self.image_processor_dict)
# Create odd-sized images
image_input = self.image_processor_tester.prepare_image_inputs(
batch_size=1,
max_resolution=400,
num_images=1,
equal_resolution=True,
numpify=numpify,
torchify=torchify,
)[0][0]
self.assertIn(image_input.shape, [(3, 400, 400), (400, 400, 3)])
# Both backends use channels-first internally; transpose if numpify returned HWC
if numpify:
image_input = image_input.transpose(2, 0, 1)
# Test odd-width
image_shape = (400, 601)
encoded_images = image_processing._pad_for_patching(image_input, image_shape)
self.assertEqual(encoded_images.shape[-2:], image_shape)
# Test odd-height
image_shape = (503, 400)
encoded_images = image_processing._pad_for_patching(image_input, image_shape)
self.assertEqual(encoded_images.shape[-2:], image_shape)
def test_get_num_patches_without_images(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
self.assertEqual(num_patches, 1)
num_patches = image_processing.get_number_of_image_patches(
height=300, width=500, images_kwargs={"split_image": True}
)
self.assertEqual(num_patches, 1)
num_patches = image_processing.get_number_of_image_patches(
height=100, width=100, images_kwargs={"split_image": True, "max_image_size": 200}
)
self.assertEqual(num_patches, 19)

View File

@@ -0,0 +1,522 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Aria model."""
import unittest
import pytest
import requests
from transformers import (
AriaConfig,
AriaForConditionalGeneration,
AriaModel,
AriaTextConfig,
AutoProcessor,
AutoTokenizer,
BitsAndBytesConfig,
is_torch_available,
is_vision_available,
)
from transformers.models.idefics3 import Idefics3VisionConfig
from transformers.testing_utils import (
Expectations,
cleanup,
require_bitsandbytes,
require_torch,
require_torch_large_accelerator,
require_vision,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
# Used to be https://aria-vl.github.io/static/images/view.jpg but it was removed, llava-vl has the same image
IMAGE_OF_VIEW_URL = "https://llava-vl.github.io/static/images/view.jpg"
class AriaVisionText2TextModelTester:
def __init__(
self,
parent,
batch_size=13,
num_channels=3,
image_size=16,
num_image_tokens=4,
ignore_index=-100,
image_token_index=9,
projector_hidden_act="gelu",
seq_length=7,
vision_feature_select_strategy="default",
vision_feature_layer=-1,
text_config=AriaTextConfig(
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=False,
use_labels=True,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
pad_token_id=1,
hidden_size=32,
intermediate_size=16,
max_position_embeddings=60,
model_type="aria_moe_lm",
moe_intermediate_size=4,
moe_num_experts=3,
moe_topk=2,
num_attention_heads=2,
num_experts_per_tok=3,
num_hidden_layers=2,
num_key_value_heads=2,
rope_theta=5000000,
vocab_size=99,
eos_token_id=2,
head_dim=4,
),
is_training=True,
vision_config=Idefics3VisionConfig(
image_size=16,
patch_size=8,
num_channels=3,
is_training=True,
hidden_size=32,
projection_dim=4,
num_hidden_layers=2,
num_attention_heads=2,
intermediate_size=4,
dropout=0.1,
attention_dropout=0.1,
initializer_range=0.02,
),
):
self.parent = parent
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
self.text_config = text_config
self.vision_config = vision_config
self.pad_token_id = text_config.pad_token_id
self.eos_token_id = text_config.eos_token_id
self.num_hidden_layers = text_config.num_hidden_layers
self.vocab_size = text_config.vocab_size
self.hidden_size = text_config.hidden_size
self.num_attention_heads = text_config.num_attention_heads
self.is_training = is_training
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.num_image_tokens = num_image_tokens
self.seq_length = seq_length + self.num_image_tokens
self.projector_patch_to_query_dict = {
vision_config.image_size**2 // vision_config.patch_size**2: vision_config.projection_dim
}
def get_config(self):
return AriaConfig(
text_config=self.text_config.to_dict(),
vision_config=self.vision_config.to_dict(),
ignore_index=self.ignore_index,
image_token_index=self.image_token_index,
projector_hidden_act=self.projector_hidden_act,
vision_feature_select_strategy=self.vision_feature_select_strategy,
vision_feature_layer=self.vision_feature_layer,
eos_token_id=self.eos_token_id,
projector_patch_to_query_dict=self.projector_patch_to_query_dict,
)
def prepare_config_and_inputs(self):
pixel_values = floats_tensor(
[
self.batch_size,
self.vision_config.num_channels,
self.vision_config.image_size,
self.vision_config.image_size,
]
)
config = self.get_config()
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
attention_mask = input_ids.ne(1).to(torch_device)
input_ids[input_ids == config.image_token_index] = self.pad_token_id
input_ids[:, : self.num_image_tokens] = config.image_token_index
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""
Model tester for `AriaForConditionalGeneration`.
"""
all_model_classes = (AriaModel, AriaForConditionalGeneration) if is_torch_available() else ()
_is_composite = True
def setUp(self):
self.model_tester = AriaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=AriaConfig, has_text_modality=False)
@pytest.mark.xfail(
reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm"
)
def test_training_gradient_checkpointing(self):
super().test_training_gradient_checkpointing()
@pytest.mark.xfail(
reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm"
)
def test_training_gradient_checkpointing_use_reentrant_false(self):
super().test_training_gradient_checkpointing_use_reentrant_false()
@pytest.mark.xfail(
reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm"
)
def test_training_gradient_checkpointing_use_reentrant_true(self):
super().test_training_gradient_checkpointing_use_reentrant_true()
SKIP = False
torch_accelerator_module = getattr(torch, torch_device)
memory = 23 # skip on T4 / A10
if hasattr(torch_accelerator_module, "get_device_properties"):
if torch_accelerator_module.get_device_properties(0).total_memory / 1024**3 < memory:
SKIP = True
@unittest.skipIf(SKIP, reason="A10 doesn't have enough GPU memory for this tests")
@require_torch
@slow
class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
cleanup(torch_device, gc_collect=True)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@require_torch_large_accelerator
@require_bitsandbytes
def test_small_model_integration_test(self):
# Let's make sure we test the preprocessing to replace what is used
model = AriaForConditionalGeneration.from_pretrained(
"rhymes-ai/Aria",
quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
)
prompt = "<|img|>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
raw_image = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device, model.dtype)
non_img_tokens = [
109, 3905, 2000, 93415, 4551, 1162, 901, 3894, 970, 2478, 1017, 19312, 2388, 1596, 1809, 970, 5449, 1235,
3333, 93483, 109, 61081, 11984, 14800, 93415
] # fmt: skip
EXPECTED_INPUT_IDS = torch.tensor([[9] * 256 + non_img_tokens]).to(inputs["input_ids"].device)
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
output = model.generate(**inputs, max_new_tokens=20)
decoded_output = self.processor.decode(output[0], skip_special_tokens=True)
expected_output = Expectations(
{
(
"cuda",
None,
): "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,",
(
"rocm",
(9, 5),
): "\n USER: What are the things I should be cautious about when I visit this place?\n ASSISTANT: When you visit this place, you should be cautious about the following things:\n\n- The",
}
).get_expectation()
self.assertEqual(decoded_output, expected_output)
@require_torch_large_accelerator
@require_bitsandbytes
def test_small_model_integration_test_llama_single(self):
# Let's make sure we test the preprocessing to replace what is used
model_id = "rhymes-ai/Aria"
model = AriaForConditionalGeneration.from_pretrained(
model_id,
quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
)
processor = AutoProcessor.from_pretrained(model_id)
prompt = "USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? ASSISTANT:"
raw_image = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(model.device, model.dtype)
output = model.generate(**inputs, max_new_tokens=90, do_sample=False)
EXPECTED_DECODED_TEXT = Expectations(
{
("cuda", (8, 0)): "USER: \n What are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this beautiful location, it's important to be mindful of a few things to ensure both your safety and the preservation of the environment. Firstly, always be cautious when walking on the wooden pier, as it can be slippery, especially during or after rain. Secondly, be aware of the local wildlife and do not feed or disturb them. Lastly, respect the natural surroundings by not littering and sticking to",
("rocm", (9, 5)): "USER: \n What are the things I should be cautious about when I visit this place? ASSISTANT: \n\nWhen visiting this place, you should be cautious about the following:\n\n1. **Weather Conditions**: The weather can be unpredictable, so it's important to check the forecast and dress in layers. Sudden changes in weather can occur, so be prepared for rain or cold temperatures.\n\n2. **Safety on the Dock**: The dock may be slippery, especially when",
}
).get_expectation() # fmt: off
decoded_output = processor.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
self.assertEqual(
decoded_output,
EXPECTED_DECODED_TEXT,
f"Expected: {repr(EXPECTED_DECODED_TEXT)}\nActual: {repr(decoded_output)}",
)
@require_torch_large_accelerator
@require_bitsandbytes
def test_small_model_integration_test_llama_batched(self):
# Let's make sure we test the preprocessing to replace what is used
model_id = "rhymes-ai/Aria"
model = AriaForConditionalGeneration.from_pretrained(
model_id,
quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
)
processor = AutoProcessor.from_pretrained(model_id)
prompts = [
"USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:",
"USER: <|img|>\nWhat is this? ASSISTANT:",
]
image1 = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(
model.device, model.dtype
)
output = model.generate(**inputs, max_new_tokens=20)
EXPECTED_DECODED_TEXT = Expectations(
{
("cuda", None): [
"USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you",
"USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on",
],
("rocm", (9, 5)): [
"USER: \n What are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: \n\nWhen visiting this place, you should be cautious about the weather conditions, as it",
"USER: \n What is this? ASSISTANT: This is a picture of two cats sleeping on a couch. USER: What is the color of",
],
}
).get_expectation()
decoded_output = processor.batch_decode(output, skip_special_tokens=True)
self.assertEqual(decoded_output, EXPECTED_DECODED_TEXT)
@require_torch_large_accelerator
@require_bitsandbytes
def test_small_model_integration_test_batch(self):
# Let's make sure we test the preprocessing to replace what is used
model = AriaForConditionalGeneration.from_pretrained(
"rhymes-ai/Aria",
quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
)
# The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!.
prompts = [
"USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
"USER: <|img|>\nWhat is this?\nASSISTANT:",
]
image1 = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(
model.device, model.dtype
)
output = model.generate(**inputs, max_new_tokens=20)
EXPECTED_DECODED_TEXT = Expectations({
("cuda", None): [
'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.',
'USER: \nWhat is this?\nASSISTANT: Cats',
],
("rocm", (9, 5)): [
'USER: \n What are the things I should be cautious about when I visit this place? What should I bring with me?\n ASSISTANT: \n\nWhen visiting this place, you should be cautious about the following:\n\n-',
'USER: \n What is this?\n ASSISTANT: This is a picture of two cats sleeping on a couch. The couch is red, and the cats',
],
}).get_expectation() # fmt: skip
decoded_output = self.processor.batch_decode(output, skip_special_tokens=True)
self.assertEqual(decoded_output, EXPECTED_DECODED_TEXT)
@require_torch_large_accelerator
@require_bitsandbytes
def test_small_model_integration_test_llama_batched_regression(self):
# Let's make sure we test the preprocessing to replace what is used
model_id = "rhymes-ai/Aria"
# Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before)
model = AriaForConditionalGeneration.from_pretrained(
model_id,
quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
)
processor = AutoProcessor.from_pretrained(model_id, pad_token="<pad>")
prompts = [
"USER: <|img|>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
"USER: <|img|>\nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: <|img|>\nAnd this?\nASSISTANT:",
]
image1 = Image.open(requests.get(IMAGE_OF_VIEW_URL, stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True)
inputs = inputs.to(model.device, model.dtype)
output = model.generate(**inputs, max_new_tokens=20)
EXPECTED_DECODED_TEXT = Expectations({
("cuda", None): ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'],
("rocm", (9, 5)): ['USER: \n What are the things I should be cautious about when I visit this place? What should I bring with me?\n ASSISTANT: \n\nWhen visiting this place, you should be cautious about the weather conditions, as it', 'USER: \n What is this?\n ASSISTANT: Two cats lying on a bed!\n USER: \n And this?\n ASSISTANT: A serene lake scene with a wooden dock extending into the water.\n USER: \n']
}).get_expectation() # fmt: skip
decoded_output = processor.batch_decode(output, skip_special_tokens=True)
self.assertEqual(decoded_output, EXPECTED_DECODED_TEXT)
@require_torch_large_accelerator
@require_vision
@require_bitsandbytes
def test_batched_generation(self):
# Skip multihead_attn for 4bit because MHA will read the original weight without dequantize.
# See https://github.com/huggingface/transformers/pull/37444#discussion_r2045852538.
model = AriaForConditionalGeneration.from_pretrained(
"rhymes-ai/Aria",
quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
)
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
prompt1 = "<image>\n<image>\nUSER: What's the difference of two images?\nASSISTANT:"
prompt2 = "<image>\nUSER: Describe the image.\nASSISTANT:"
prompt3 = "<image>\nUSER: Describe the image.\nASSISTANT:"
url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
image1 = Image.open(requests.get(url1, stream=True).raw)
image2 = Image.open(requests.get(url2, stream=True).raw)
# Create inputs
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": prompt1},
{"type": "image"},
{"type": "text", "text": prompt2},
],
},
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": prompt3},
],
},
]
prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
images = [[image1, image2], [image2]]
inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(
device=model.device, dtype=model.dtype
)
EXPECTED_OUTPUTS = Expectations(
{
("cpu", None): [
"<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with",
"<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a grassy hill. The alpaca has",
],
("cuda", None): [
"<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with",
"<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a patch of ground with some dry grass. The",
],
("xpu", 3): [
"<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image features a cute, light-colored puppy sitting on a paved surface with",
"<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young alpaca standing on a patch of ground with some dry grass. The",
],
("rocm", (9, 5)): [
"<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n <image>\n USER: What's the difference of two images?\n ASSISTANT:<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The first image shows a cute golden retriever puppy sitting on a paved surface with a stick",
'<|im_start|>user\n<fim_prefix><fim_suffix> <image>\n USER: Describe the image.\n ASSISTANT:<|im_end|>\n <|im_start|>assistant\n The image shows a young llama standing on a patch of ground with some dry grass and dirt. The'
],
}
) # fmt: skip
EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
generate_ids = model.generate(**inputs, max_new_tokens=20)
outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
self.assertListEqual(outputs, EXPECTED_OUTPUT)
def test_tokenizer_integration(self):
model_id = "rhymes-ai/Aria"
slow_tokenizer = AutoTokenizer.from_pretrained(
model_id, bos_token="<|startoftext|>", eos_token="<|endoftext|>", use_fast=False
)
slow_tokenizer.add_tokens("<image>", True)
fast_tokenizer = AutoTokenizer.from_pretrained(
model_id,
bos_token="<|startoftext|>",
eos_token="<|endoftext|>",
from_slow=True,
legacy=False,
)
fast_tokenizer.add_tokens("<image>", True)
prompt = "<|startoftext|><|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>"
EXPECTED_OUTPUT = ['<|startoftext|>', '<', '|', 'im', '_', 'start', '|', '>', 'system', '\n', 'Answer', '▁the', '▁questions', '.<', '|', 'im', '_', 'end', '|', '><', '|', 'im', '_', 'start', '|', '>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<', '|', 'im', '_', 'end', '|', '>'] # fmt: skip
self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
@require_torch_large_accelerator
@require_bitsandbytes
def test_generation_no_images(self):
model_id = "rhymes-ai/Aria"
model = AriaForConditionalGeneration.from_pretrained(
model_id,
quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]),
)
processor = AutoProcessor.from_pretrained(model_id)
# Prepare inputs with no images
inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device)
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)

View File

@@ -0,0 +1,297 @@
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers import AriaProcessor
from transformers.image_utils import load_image
from transformers.testing_utils import require_torch, require_vision
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
@require_torch
@require_vision
class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# NOTE: setUpClass, tearDownClass, and getter methods have been removed.
# They are now automatically handled by ProcessorTesterMixin.
# This test only needs: processor_class = YourProcessor
# Optionally: model_id = "some/model" to load from specific pretrained model
# Optionally: prepare_processor_dict() for custom processor kwargs.
processor_class = AriaProcessor
model_id = "m-ric/Aria_hf_2"
@classmethod
def _setup_test_attributes(cls, processor):
cls.image1 = load_image(
url_to_local_path(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
)
)
cls.image2 = load_image(
url_to_local_path("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
)
cls.image3 = load_image(
url_to_local_path(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
)
)
cls.bos_token = "<|im_start|>"
cls.eos_token = "<|im_end|>"
cls.image_token = processor.tokenizer.image_token
cls.fake_image_token = "o"
cls.global_img_token = "<|img|>"
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
cls.eos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.eos_token)
cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
cls.padding_token_id = processor.tokenizer.pad_token_id
cls.image_seq_len = 2
@staticmethod
def prepare_processor_dict():
return {
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<fim_prefix><|img|><fim_suffix>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
"size_conversion": {490: 2, 980: 2},
} # fmt: skip
def test_get_num_vision_tokens(self):
"Tests general functionality of the helper used internally in vLLM"
processor = self.get_processor()
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
self.assertTrue("num_image_tokens" in output)
self.assertEqual(len(output["num_image_tokens"]), 3)
self.assertTrue("num_image_patches" in output)
self.assertEqual(len(output["num_image_patches"]), 3)
def test_process_interleaved_images_prompts_image_splitting(self):
processor = self.get_processor()
processor.image_processor.split_image = True
# Test that a single image is processed correctly
inputs = processor(images=self.image1, text="Ok<|img|>", images_kwargs={"split_image": True})
self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 3, 980, 980))
self.assertEqual(np.array(inputs["pixel_mask"]).shape, (2, 980, 980))
def test_process_interleaved_images_prompts_no_image_splitting(self):
processor = self.get_processor()
processor.image_processor.split_image = False
# Test that a single image is processed correctly
inputs = processor(images=self.image1, text="Ok<|img|>")
image1_expected_size = (980, 980)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 3, *image1_expected_size))
self.assertEqual(np.array(inputs["pixel_mask"]).shape, (1, *image1_expected_size))
# fmt: on
# Test a single sample with image and text
image_str = "<|img|>"
text_str = "In this image, we see"
text = image_str + text_str
inputs = processor(text=text, images=self.image1)
# fmt: off
# The processor expands <|img|> to <|img|><|img|> (image_seq_len=2) before tokenization
# So we need to tokenize the full expanded string to match what the processor does
expanded_text = self.image_token * self.image_seq_len + text_str
expected_input_ids = [processor.tokenizer(expanded_text, add_special_tokens=False)["input_ids"]]
# self.assertEqual(len(inputs["input_ids"]), len(expected_input_ids))
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 3, *image1_expected_size))
self.assertEqual(np.array(inputs["pixel_mask"]).shape, (1, *image1_expected_size))
# fmt: on
# Test that batch is correctly processed
image_str = "<|img|>"
text_str_1 = "In this image, we see"
text_str_2 = "In this image, we see"
text = [
image_str + text_str_1,
image_str + image_str + text_str_2,
]
images = [[self.image1], [self.image2, self.image3]]
inputs = processor(text=text, images=images, padding=True)
# fmt: off
tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
image_tokens = [self.image_token_id] * self.image_seq_len
expected_input_ids_1 = image_tokens + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = 2 * image_tokens + tokenized_sentence_2["input_ids"]
# Pad the first input to match the second input
pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
expected_attention_mask = [ [0] * pad_len + [1] * len(expected_input_ids_1), [1] * (len(expected_input_ids_2))]
self.assertEqual(
inputs["attention_mask"],
expected_attention_mask
)
self.assertEqual(np.array(inputs['pixel_values']).shape, (3, 3, 980, 980))
self.assertEqual(np.array(inputs['pixel_mask']).shape, (3, 980, 980))
# fmt: on
def test_non_nested_images_with_batched_text(self):
processor = self.get_processor()
processor.image_processor.do_image_splitting = False
image_str = "<|img|>"
text_str_1 = "In this image, we see"
text_str_2 = "In this image, we see"
text = [
image_str + text_str_1,
image_str + image_str + text_str_2,
]
images = [self.image1, self.image2, self.image3]
inputs = processor(text=text, images=images, padding=True)
self.assertEqual(np.array(inputs["pixel_values"]).shape, (3, 3, 980, 980))
self.assertEqual(np.array(inputs["pixel_mask"]).shape, (3, 980, 980))
def test_apply_chat_template(self):
# Message contains content which a mix of lists with images and image urls and string
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "What do these images show?"},
{"type": "image"},
{"type": "image"},
"What do these images show?",
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.",
}
],
},
{"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
]
processor = self.get_processor()
# Make short sequence length to test that the fake tokens are added correctly
rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
print(rendered)
expected_rendered = """<|im_start|>user
What do these images show?<fim_prefix><|img|><fim_suffix><fim_prefix><|img|><fim_suffix><|im_end|>
<|im_start|>assistant
The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<|im_end|>
<|im_start|>user
And who is that?<|im_end|>
<|im_start|>assistant
"""
self.assertEqual(rendered, expected_rendered)
def test_image_chat_template_accepts_processing_kwargs(self):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
messages = [
[
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
],
},
]
]
formatted_prompt_tokenized = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
processor_kwargs={
"padding": "max_length",
"max_length": 50,
},
)
self.assertEqual(len(formatted_prompt_tokenized[0]), 50)
formatted_prompt_tokenized = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
processor_kwargs={"max_length": 5, "truncation": True},
)
self.assertEqual(len(formatted_prompt_tokenized[0]), 5)
# Now test the ability to return dict
messages[0][0]["content"].append(
{
"type": "image",
"url": url_to_local_path(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg"
),
}
)
out_dict = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
processor_kwargs={"max_image_size": 980},
)
self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
def test_special_mm_token_truncation(self):
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
processor = self.get_processor()
input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
image_input = self.prepare_image_inputs(batch_size=2)
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=None,
padding=True,
)
with self.assertRaises(ValueError):
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=True,
padding=True,
max_length=3,
)

View File

@@ -0,0 +1,221 @@
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import random
import tempfile
import unittest
import numpy as np
from transformers import ASTFeatureExtractor
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
global_rng = random.Random()
if is_torch_available():
import torch
# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
class ASTFeatureExtractionTester:
def __init__(
self,
parent,
batch_size=7,
min_seq_length=400,
max_seq_length=2000,
feature_size=1,
padding_value=0.0,
sampling_rate=16000,
return_attention_mask=True,
do_normalize=True,
):
self.parent = parent
self.batch_size = batch_size
self.min_seq_length = min_seq_length
self.max_seq_length = max_seq_length
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
self.feature_size = feature_size
self.padding_value = padding_value
self.sampling_rate = sampling_rate
self.return_attention_mask = return_attention_mask
self.do_normalize = do_normalize
def prepare_feat_extract_dict(self):
return {
"feature_size": self.feature_size,
"padding_value": self.padding_value,
"sampling_rate": self.sampling_rate,
"return_attention_mask": self.return_attention_mask,
"do_normalize": self.do_normalize,
}
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
def _flatten(list_of_lists):
return list(itertools.chain(*list_of_lists))
if equal_length:
speech_inputs = floats_list((self.batch_size, self.max_seq_length))
else:
# make sure that inputs increase in size
speech_inputs = [
_flatten(floats_list((x, self.feature_size)))
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
]
if numpify:
speech_inputs = [np.asarray(x) for x in speech_inputs]
return speech_inputs
@require_torch
@require_torchaudio
class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = ASTFeatureExtractor
def setUp(self):
self.feat_extract_tester = ASTFeatureExtractionTester(self)
def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# Test not batched input
encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values
encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
# Test batched
encoded_sequences_1 = feat_extract(speech_inputs, padding=True, return_tensors="np").input_values
encoded_sequences_2 = feat_extract(np_speech_inputs, padding=True, return_tensors="np").input_values
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
# Test 2-D numpy arrays are batched.
speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
np_speech_inputs = np.asarray(speech_inputs)
encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values
encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
@require_torch
def test_double_precision_pad(self):
import torch
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
np_speech_inputs = np.random.rand(100).astype(np.float64)
py_speech_inputs = np_speech_inputs.tolist()
for inputs in [py_speech_inputs, np_speech_inputs]:
np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np")
self.assertTrue(np_processed.input_values.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_values.dtype == torch.float32)
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id")[:num_samples]["audio"]
return [x["array"] for x in speech_samples]
@require_torch
def test_integration(self):
# fmt: off
EXPECTED_INPUT_VALUES = torch.tensor(
[-0.9894, -1.2776, -0.9066, -1.2776, -0.9349, -1.2609, -1.0386, -1.2776,
-1.1561, -1.2776, -1.2052, -1.2723, -1.2190, -1.2132, -1.2776, -1.1133,
-1.1953, -1.1343, -1.1584, -1.2203, -1.1770, -1.2474, -1.2381, -1.1936,
-0.9270, -0.8317, -0.8049, -0.7706, -0.7565, -0.7869]
)
# fmt: on
input_speech = self._load_datasamples(1)
feature_extractor = ASTFeatureExtractor()
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
self.assertEqual(input_values.shape, (1, 1024, 128))
torch.testing.assert_close(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, rtol=1e-4, atol=1e-4)
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
self.assertDictEqual(dict_first, dict_second)
def test_feat_extract_to_json_file(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
feat_extract_first.to_json_file(json_file_path)
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
self.assertEqual(dict_first, dict_second)
# exact same tests than before, except that we simulate that torchaudio is not available
@require_torch
@unittest.mock.patch(
"transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer.is_speech_available",
lambda: False,
)
class ASTFeatureExtractionWithoutTorchaudioTest(ASTFeatureExtractionTest):
def test_using_audio_utils(self):
# Tests that it uses audio_utils instead of torchaudio
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
self.assertTrue(hasattr(feat_extract, "window"))
self.assertTrue(hasattr(feat_extract, "mel_filters"))
from transformers.models.audio_spectrogram_transformer.feature_extraction_audio_spectrogram_transformer import (
is_speech_available,
)
self.assertFalse(is_speech_available())

View File

@@ -0,0 +1,267 @@
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Audio Spectrogram Transformer (AST) model."""
import inspect
import unittest
from functools import cached_property
from huggingface_hub import hf_hub_download
from transformers import ASTConfig
from transformers.testing_utils import require_torch, require_torchaudio, slow, torch_device
from transformers.utils import is_torch_available, is_torchaudio_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from torch import nn
from transformers import ASTForAudioClassification, ASTModel
if is_torchaudio_available():
import torchaudio
from transformers import ASTFeatureExtractor
class ASTModelTester:
def __init__(
self,
parent,
batch_size=13,
patch_size=2,
max_length=24,
num_mel_bins=16,
is_training=True,
use_labels=True,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
type_sequence_label_size=10,
initializer_range=0.02,
scope=None,
frequency_stride=2,
time_stride=2,
attn_implementation="eager",
):
self.parent = parent
self.batch_size = batch_size
self.patch_size = patch_size
self.max_length = max_length
self.num_mel_bins = num_mel_bins
self.is_training = is_training
self.use_labels = use_labels
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.scope = scope
self.frequency_stride = frequency_stride
self.time_stride = time_stride
self.attn_implementation = attn_implementation
# in AST, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens)
frequency_out_dimension = (self.num_mel_bins - self.patch_size) // self.frequency_stride + 1
time_out_dimension = (self.max_length - self.patch_size) // self.time_stride + 1
num_patches = frequency_out_dimension * time_out_dimension
self.seq_length = num_patches + 2
def prepare_config_and_inputs(self):
input_values = floats_tensor([self.batch_size, self.max_length, self.num_mel_bins])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = self.get_config()
return config, input_values, labels
def get_config(self):
return ASTConfig(
patch_size=self.patch_size,
max_length=self.max_length,
num_mel_bins=self.num_mel_bins,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
is_decoder=False,
initializer_range=self.initializer_range,
frequency_stride=self.frequency_stride,
time_stride=self.time_stride,
attn_implementation=self.attn_implementation,
)
def create_and_check_model(self, config, input_values, labels):
model = ASTModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_values,
labels,
) = config_and_inputs
inputs_dict = {"input_values": input_values}
return config, inputs_dict
@require_torch
class ASTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as AST does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (
(
ASTModel,
ASTForAudioClassification,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{"audio-classification": ASTForAudioClassification, "feature-extraction": ASTModel}
if is_torch_available()
else {}
)
test_resize_embeddings = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self,
pipeline_test_case_name,
config_class,
model_architecture,
tokenizer_name,
image_processor_name,
feature_extractor_name,
processor_name,
):
if pipeline_test_case_name == "AudioClassificationPipelineTests":
return True
return False
def setUp(self):
self.model_tester = ASTModelTester(self)
self.config_tester = ConfigTester(self, config_class=ASTConfig, has_text_modality=False, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="AST does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_get_set_embeddings(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["input_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
model = ASTModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on some audio from AudioSet
def prepare_audio():
filepath = hf_hub_download(
repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset"
)
audio, sampling_rate = torchaudio.load(filepath)
return audio, sampling_rate
@require_torch
@require_torchaudio
class ASTModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return (
ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
if is_torchaudio_available()
else None
)
@slow
def test_inference_audio_classification(self):
feature_extractor = self.default_feature_extractor
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(torch_device)
feature_extractor = self.default_feature_extractor
audio, sampling_rate = prepare_audio()
audio = audio.squeeze().numpy()
inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 527))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]).to(torch_device)
torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)

View File

View File

@@ -0,0 +1,215 @@
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch AudioFlamingo3 model."""
import json
import unittest
from pathlib import Path
from transformers import (
AudioFlamingo3Config,
AudioFlamingo3EncoderConfig,
AudioFlamingo3ForConditionalGeneration,
AudioFlamingo3Model,
AutoProcessor,
Qwen2Config,
is_torch_available,
)
from transformers.testing_utils import (
cleanup,
require_torch,
slow,
torch_device,
)
from ...alm_tester import ALMModelTest, ALMModelTester
if is_torch_available():
import torch
class AudioFlamingo3ModelTester(ALMModelTester):
config_class = AudioFlamingo3Config
base_model_class = AudioFlamingo3Model
conditional_generation_class = AudioFlamingo3ForConditionalGeneration
text_config_class = Qwen2Config
audio_config_class = AudioFlamingo3EncoderConfig
audio_mask_key = "input_features_mask"
def __init__(self, parent, **kwargs):
# feat_seq_length → (L-1)//2+1 after conv2 → (·-2)//2+1 after avg_pool, so
# feat_seq_length=60 gives 15 audio embed tokens (fits inside seq_length=32 + BOS + text).
kwargs.setdefault("feat_seq_length", 60)
# Encoder adds a learned positional embedding of size max_source_positions to post-conv2 features,
# so it must equal (feat_seq_length - 1) // 2 + 1.
kwargs.setdefault("max_source_positions", (kwargs["feat_seq_length"] - 1) // 2 + 1)
super().__init__(parent, **kwargs)
def get_audio_embeds_mask(self, audio_mask):
# Mirrors AudioFlamingo3Encoder._get_feat_extract_output_lengths:
# conv2 (k=3,s=2,p=1) then avg_pool (k=2,s=2).
input_lengths = audio_mask.sum(-1)
input_lengths = (input_lengths - 1) // 2 + 1
output_lengths = (input_lengths - 2) // 2 + 1
max_len = int(output_lengths.max().item())
positions = torch.arange(max_len, device=audio_mask.device)[None, :]
return (positions < output_lengths[:, None]).long()
@require_torch
class AudioFlamingo3ForConditionalGenerationModelTest(ALMModelTest, unittest.TestCase):
"""
Model tester for `AudioFlamingo3ForConditionalGeneration`.
"""
model_tester_class = AudioFlamingo3ModelTester
# TODO: @eustlb, this is incorrect
pipeline_model_mapping = (
{
"text-to-speech": AudioFlamingo3ForConditionalGeneration,
"audio-text-to-text": AudioFlamingo3ForConditionalGeneration,
}
if is_torch_available()
else {}
)
@unittest.skip(
reason="This test does not apply to AudioFlamingo3 since inputs_embeds corresponding to audio tokens "
"are replaced when input features are provided."
)
def test_inputs_embeds_matches_input_ids(self):
pass
@require_torch
class AudioFlamingo3ForConditionalGenerationIntegrationTest(unittest.TestCase):
"""
Slow tests against the public checkpoint to validate processor-model alignment and in-place fusion.
"""
@classmethod
def setUp(cls):
cleanup(torch_device, gc_collect=True)
cls.checkpoint = "nvidia/audio-flamingo-3-hf"
cls.processor = AutoProcessor.from_pretrained(cls.checkpoint)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@slow
def test_fixture_single_matches(self):
"""
reproducer (creates JSON directly in repo): https://gist.github.com/ebezzam/c979f0f1a2b9223fa137faf1c02022d4#file-reproducer-py
"""
path = Path(__file__).parent.parent.parent / "fixtures/audioflamingo3/expected_results_single.json"
with open(path, "r", encoding="utf-8") as f:
raw = json.load(f)
exp_ids = torch.tensor(raw["token_ids"])
exp_txt = raw["transcriptions"]
conversation = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is surprising about the relationship between the barking and the music?",
},
{
"type": "audio",
"path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
},
],
}
]
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
self.checkpoint, device_map=torch_device, dtype=torch.bfloat16
).eval()
batch = self.processor.apply_chat_template(
conversation, tokenize=True, add_generation_prompt=True, return_dict=True
).to(model.device, dtype=model.dtype)
seq = model.generate(**batch)
inp_len = batch["input_ids"].shape[1]
gen_ids = seq[:, inp_len:] if seq.shape[1] >= inp_len else seq
torch.testing.assert_close(gen_ids.cpu(), exp_ids)
txt = self.processor.decode(gen_ids, skip_special_tokens=True)
self.assertListEqual(txt, exp_txt)
@slow
def test_fixture_batched_matches(self):
"""
reproducer (creates JSON directly in repo): https://gist.github.com/ebezzam/c979f0f1a2b9223fa137faf1c02022d4#file-reproducer-py
"""
path = Path(__file__).parent.parent.parent / "fixtures/audioflamingo3/expected_results_batched.json"
with open(path, "r", encoding="utf-8") as f:
raw = json.load(f)
exp_ids = torch.tensor(raw["token_ids"])
exp_txt = raw["transcriptions"]
conversations = [
[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is surprising about the relationship between the barking and the music?",
},
{
"type": "audio",
"path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
},
],
}
],
[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Why is the philosopher's name mentioned in the lyrics? "
"(A) To express a sense of nostalgia "
"(B) To indicate that language cannot express clearly, satirizing the inversion of black and white in the world "
"(C) To add depth and complexity to the lyrics "
"(D) To showcase the wisdom and influence of the philosopher",
},
{
"type": "audio",
"path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
},
],
}
],
]
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
self.checkpoint, device_map=torch_device, dtype=torch.bfloat16
).eval()
batch = self.processor.apply_chat_template(
conversations, tokenize=True, add_generation_prompt=True, return_dict=True
).to(model.device, dtype=model.dtype)
seq = model.generate(**batch)
inp_len = batch["input_ids"].shape[1]
gen_ids = seq[:, inp_len:] if seq.shape[1] >= inp_len else seq
torch.testing.assert_close(gen_ids.cpu(), exp_ids)
txt = self.processor.decode(gen_ids, skip_special_tokens=True)
self.assertListEqual(txt, exp_txt)

View File

@@ -0,0 +1,191 @@
# Copyright 2025 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
from parameterized import parameterized
from transformers import (
AudioFlamingo3Processor,
AutoProcessor,
AutoTokenizer,
WhisperFeatureExtractor,
)
from transformers.testing_utils import require_librosa, require_torch, require_torchaudio
from ...test_processing_common import MODALITY_INPUT_DATA, ProcessorTesterMixin
class AudioFlamingo3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = AudioFlamingo3Processor
@classmethod
@require_torch
@require_torchaudio
def setUpClass(cls):
cls.checkpoint = "nvidia/audio-flamingo-3-hf"
cls.tmpdirname = tempfile.mkdtemp()
processor = AudioFlamingo3Processor.from_pretrained(cls.checkpoint)
processor.save_pretrained(cls.tmpdirname)
@require_torch
@require_torchaudio
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@require_torch
@require_torchaudio
def get_audio_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).audio_processor
@require_torch
@require_torchaudio
def get_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
@require_torch
@require_torchaudio
def test_can_load_various_tokenizers(self):
processor = AudioFlamingo3Processor.from_pretrained(self.checkpoint)
tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
@require_torch
@require_torchaudio
def test_save_load_pretrained_default(self):
tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
processor = AudioFlamingo3Processor.from_pretrained(self.checkpoint)
feature_extractor = processor.feature_extractor
processor = AudioFlamingo3Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
with tempfile.TemporaryDirectory() as tmpdir:
processor.save_pretrained(tmpdir)
reloaded = AudioFlamingo3Processor.from_pretrained(tmpdir)
self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(reloaded.feature_extractor, WhisperFeatureExtractor)
@require_torch
@require_torchaudio
def test_tokenizer_integration(self):
slow_tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, use_fast=False)
fast_tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, from_slow=True, legacy=False)
prompt = (
"<|im_start|>system\nAnswer the questions.<|im_end|>"
"<|im_start|>user\n<sound>What is it?<|im_end|>"
"<|im_start|>assistant\n"
)
EXPECTED_OUTPUT = [
"<|im_start|>",
"system",
"Ċ",
"Answer",
"Ġthe",
"Ġquestions",
".",
"<|im_end|>",
"<|im_start|>",
"user",
"Ċ",
"<sound>",
"What",
"Ġis",
"Ġit",
"?",
"<|im_end|>",
"<|im_start|>",
"assistant",
"Ċ",
]
self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)
@require_torch
@require_torchaudio
def test_chat_template(self):
processor = AutoProcessor.from_pretrained(self.checkpoint)
expected_prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<sound>What is surprising about the relationship between the barking and the music?<|im_end|>\n"
"<|im_start|>assistant\n"
)
conversations = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is surprising about the relationship between the barking and the music?",
},
{
"type": "audio",
"path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
},
],
}
]
formatted = processor.tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=True)
self.assertEqual(expected_prompt, formatted)
@require_torch
@require_torchaudio
def test_apply_transcription_request_single(self):
processor = AutoProcessor.from_pretrained(self.checkpoint)
audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav"
helper_outputs = processor.apply_transcription_request(audio=audio_url)
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "Transcribe the input speech."},
{"type": "audio", "audio": audio_url},
],
}
]
manual_outputs = processor.apply_chat_template(
conversation,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
)
for key in ("input_ids", "attention_mask", "input_features", "input_features_mask"):
self.assertIn(key, helper_outputs)
self.assertTrue(helper_outputs[key].equal(manual_outputs[key]))
# Overwrite to remove skip numpy inputs (still need to keep as many cases as parent)
@require_librosa
@parameterized.expand([(1, "np"), (1, "pt"), (2, "np"), (2, "pt")])
def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
if return_tensors == "np":
self.skipTest("AudioFlamingo3 only supports PyTorch tensors")
self._test_apply_chat_template(
"audio", batch_size, return_tensors, "audio_input_name", "feature_extractor", MODALITY_INPUT_DATA["audio"]
)

View File

View File

@@ -0,0 +1,163 @@
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
import transformers.models.auto
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
from transformers.models.bert.configuration_bert import BertConfig
from transformers.models.roberta.configuration_roberta import RobertaConfig
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
SAMPLE_ROBERTA_CONFIG = get_tests_dir("fixtures/dummy-config.json")
class AutoConfigTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_module_spec(self):
self.assertIsNotNone(transformers.models.auto.__spec__)
self.assertIsNotNone(importlib.util.find_spec("transformers.models.auto"))
def test_config_from_model_shortcut(self):
config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
self.assertIsInstance(config, BertConfig)
def test_config_model_type_from_local_file(self):
config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG)
self.assertIsInstance(config, RobertaConfig)
def test_config_model_type_from_model_identifier(self):
config = AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
self.assertIsInstance(config, RobertaConfig)
def test_config_for_model_str(self):
config = AutoConfig.for_model("roberta")
self.assertIsInstance(config, RobertaConfig)
def test_new_config_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
# Wrong model type will raise an error
with self.assertRaises(ValueError):
AutoConfig.register("model", CustomConfig)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoConfig.register("bert", BertConfig)
# Now that the config is registered, it can be used as any other config with the auto-API
config = CustomConfig()
with tempfile.TemporaryDirectory() as tmp_dir:
config.save_pretrained(tmp_dir)
new_config = AutoConfig.from_pretrained(tmp_dir)
self.assertIsInstance(new_config, CustomConfig)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
):
_ = AutoConfig.from_pretrained("bert-base")
def test_revision_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
):
_ = AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_from_pretrained_dynamic_config(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(config.__class__.__name__, "NewModelConfig")
# Test the dynamic module is loaded only once.
reloaded_config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertIs(config.__class__, reloaded_config.__class__)
# Test config can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
config.save_pretrained(tmp_dir)
reloaded_config = AutoConfig.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "configuration.py"))) # Assert we saved config code
# Assert we're pointing at local code and not another remote repo
self.assertEqual(reloaded_config.auto_map["AutoConfig"], "configuration.NewModelConfig")
self.assertEqual(reloaded_config.__class__.__name__, "NewModelConfig")
def test_from_pretrained_dynamic_config_conflict(self):
class NewModelConfigLocal(BertConfig):
model_type = "new-model"
def __init__(self, **kwargs):
super().__init__(**kwargs)
try:
AutoConfig.register("new-model", NewModelConfigLocal)
# If remote code is not set, the default is to use local
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model")
self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")
# If remote code is disabled, we load the local one.
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewModelConfigLocal.__module__ = "transformers.models.new_model.configuration_new_model"
config = AutoConfig.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(config.__class__.__name__, "NewModelConfig")
finally:
if "new-model" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["new-model"]
def test_config_missing_model_type(self):
with tempfile.TemporaryDirectory() as tmp_dir:
config_dict = {
"hidden_size": 768,
"num_attention_heads": 12,
"num_hidden_layers": 12,
}
config_path = os.path.join(tmp_dir, "config.json")
with open(config_path, "w") as f:
json.dump(config_dict, f)
with self.assertRaisesRegex(ValueError, "Should have a `model_type` key"):
AutoConfig.from_pretrained(tmp_dir)

View File

@@ -0,0 +1,196 @@
# Copyright 2021 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
AutoConfig,
AutoFeatureExtractor,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402
SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
SAMPLE_FEATURE_EXTRACTION_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json")
SAMPLE_CONFIG = get_tests_dir("fixtures/dummy-config.json")
class AutoFeatureExtractorTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_feature_extractor_from_model_shortcut(self):
config = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
self.assertIsInstance(config, Wav2Vec2FeatureExtractor)
def test_feature_extractor_from_local_directory_from_key(self):
config = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
self.assertIsInstance(config, Wav2Vec2FeatureExtractor)
def test_feature_extractor_from_local_directory_from_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model_config = Wav2Vec2Config()
# remove feature_extractor_type to make sure config.json alone is enough to load feature processor locally
config_dict = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR).to_dict()
config_dict.pop("feature_extractor_type")
config = Wav2Vec2FeatureExtractor(**config_dict)
# save in new folder
model_config.save_pretrained(tmpdirname)
config.save_pretrained(tmpdirname)
config = AutoFeatureExtractor.from_pretrained(tmpdirname)
# make sure private variable is not incorrectly saved
dict_as_saved = json.loads(config.to_json_string())
self.assertTrue("_processor_class" not in dict_as_saved)
self.assertIsInstance(config, Wav2Vec2FeatureExtractor)
def test_feature_extractor_from_local_file(self):
config = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG)
self.assertIsInstance(config, Wav2Vec2FeatureExtractor)
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
):
_ = AutoFeatureExtractor.from_pretrained("bert-base")
def test_revision_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
):
_ = AutoFeatureExtractor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_feature_extractor_not_found(self):
with self.assertRaisesRegex(
EnvironmentError,
"Can't load feature extractor for 'hf-internal-testing/config-no-model'.",
):
_ = AutoFeatureExtractor.from_pretrained("hf-internal-testing/config-no-model")
def test_from_pretrained_dynamic_feature_extractor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor"
)
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=False
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
# Test the dynamic module is loaded only once.
reloaded_feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True
)
self.assertIs(feature_extractor.__class__, reloaded_feature_extractor.__class__)
# Test feature extractor can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
feature_extractor.save_pretrained(tmp_dir)
reloaded_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "feature_extractor.py"))) # Assert we saved code
self.assertEqual(
reloaded_feature_extractor.auto_map["AutoFeatureExtractor"], "feature_extractor.NewFeatureExtractor"
)
self.assertEqual(reloaded_feature_extractor.__class__.__name__, "NewFeatureExtractor")
def test_new_feature_extractor_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoFeatureExtractor.register(Wav2Vec2Config, Wav2Vec2FeatureExtractor)
# Now that the config is registered, it can be used as any other config with the auto-API
feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
with tempfile.TemporaryDirectory() as tmp_dir:
feature_extractor.save_pretrained(tmp_dir)
new_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir)
self.assertIsInstance(new_feature_extractor, CustomFeatureExtractor)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_feature_extractor_conflict(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
is_local = True
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
# If remote code is not set, the default is to use local
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor"
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
self.assertTrue(feature_extractor.is_local)
# If remote code is disabled, we load the local one.
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=False
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
self.assertTrue(feature_extractor.is_local)
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
self.assertTrue(feature_extractor.is_local)
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewFeatureExtractor.__module__ = "transformers.models.custom.configuration_custom"
feature_extractor = AutoFeatureExtractor.from_pretrained(
"hf-internal-testing/test_dynamic_feature_extractor", trust_remote_code=True
)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
self.assertTrue(not hasattr(feature_extractor, "is_local"))
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]

View File

@@ -0,0 +1,371 @@
# Copyright 2021 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
IMAGE_PROCESSOR_MAPPING,
AutoConfig,
AutoImageProcessor,
CLIPConfig,
CLIPImageProcessor,
ViTImageProcessor,
ViTImageProcessorPil,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torchvision, require_vision
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_image_processing import CustomImageProcessor # noqa E402
class AutoImageProcessorTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
@require_torchvision
def test_image_processor_from_model_shortcut(self):
config = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_directory_from_key(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
config = AutoImageProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_directory_from_feature_extractor_key(self):
# Ensure we can load the image processor from the feature extractor config
# Though we don't have any `CLIPFeatureExtractor` class, we can't be sure that
# there are no models in the hub serialized with `processor_type=CLIPFeatureExtractor`
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"feature_extractor_type": "CLIPFeatureExtractor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
config = AutoImageProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_new_filename(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
config = AutoImageProcessor.from_pretrained(tmpdirname)
# Now loading fast image processor by default
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_directory_from_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model_config = CLIPConfig()
# Create a dummy config file with image_processor_type
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
# remove image_processor_type to make sure config.json alone is enough to load image processor locally
config_dict = AutoImageProcessor.from_pretrained(tmpdirname).to_dict()
config_dict.pop("image_processor_type")
config = CLIPImageProcessor(**config_dict)
# save in new folder
model_config.save_pretrained(tmpdirname)
config.save_pretrained(tmpdirname)
config = AutoImageProcessor.from_pretrained(tmpdirname)
# make sure private variable is not incorrectly saved
dict_as_saved = json.loads(config.to_json_string())
self.assertTrue("_processor_class" not in dict_as_saved)
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_file(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
config = AutoImageProcessor.from_pretrained(processor_tmpfile)
self.assertIsInstance(config, CLIPImageProcessor)
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, "clip-base is not a local folder and is not a valid model identifier"
):
_ = AutoImageProcessor.from_pretrained("clip-base")
def test_revision_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
):
_ = AutoImageProcessor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_image_processor_not_found(self):
with self.assertRaisesRegex(
EnvironmentError,
"Can't load image processor for 'hf-internal-testing/config-no-model'.",
):
_ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
@require_vision
@require_torchvision
def test_use_fast_selection(self):
checkpoint = "hf-internal-testing/tiny-random-vit"
# Fast image processor is selected by default
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
self.assertIsInstance(image_processor, ViTImageProcessor)
# Fast image processor is selected when use_fast=True
image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=True)
self.assertIsInstance(image_processor, ViTImageProcessor)
# Slow image processor is selected when use_fast=False
image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=False)
self.assertIsInstance(image_processor, ViTImageProcessorPil)
def test_from_pretrained_dynamic_image_processor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False
)
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
# Test the dynamic module is loaded only once.
reloaded_image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertIs(image_processor.__class__, reloaded_image_processor.__class__)
# Test image processor can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
image_processor.save_pretrained(tmp_dir)
reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "image_processor.py"))) # Assert we saved custom code
self.assertEqual(
reloaded_image_processor.auto_map["AutoImageProcessor"], "image_processor.NewImageProcessor"
)
self.assertEqual(reloaded_image_processor.__class__.__name__, "NewImageProcessor")
# Test the dynamic module is reloaded if we force it.
reloaded_image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True, force_download=True
)
self.assertIsNot(image_processor.__class__, reloaded_image_processor.__class__)
def test_new_image_processor_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
AutoImageProcessor.register(CustomConfig, CustomImageProcessor)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoImageProcessor.register(CLIPConfig, CLIPImageProcessor)
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"feature_extractor_type": "CLIPFeatureExtractor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
image_processor = CustomImageProcessor.from_pretrained(tmpdirname)
# Now that the config is registered, it can be used as any other config with the auto-API
with tempfile.TemporaryDirectory() as tmp_dir:
image_processor.save_pretrained(tmp_dir)
new_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
self.assertIsInstance(new_image_processor, CustomImageProcessor)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_image_processor_conflict(self):
class NewImageProcessor(CLIPImageProcessor):
is_local = True
try:
AutoConfig.register("custom", CustomConfig)
AutoImageProcessor.register(CustomConfig, NewImageProcessor)
# If remote code is not set, the default is to use local
image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor")
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote code is disabled, we load the local one.
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewImageProcessor.__module__ = "transformers.models.custom.configuration_custom"
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(not hasattr(image_processor, "is_local"))
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
@require_vision
def test_backend_kwarg_pil(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "ViTImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="pil")
self.assertIsInstance(image_processor, ViTImageProcessorPil)
@require_torchvision
def test_backend_kwarg_torchvision(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "ViTImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="torchvision")
self.assertIsInstance(image_processor, ViTImageProcessor)
@require_torchvision
def test_default_to_pil_backend_for_lanczos_processors(self):
# Even when torchvision is available, processors that rely on Lanczos interpolation
# (listed in DEFAULT_TO_PIL_BACKEND_IMAGE_PROCESSORS) must default to the PIL backend
# when backend='auto'.
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "FlavaImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname)
self.assertEqual(type(image_processor).__name__, "FlavaImageProcessorPil")
@require_torchvision
def test_explicit_backend_overrides_lanczos_default(self):
# An explicit backend="torchvision" must bypass the DEFAULT_TO_PIL_BACKEND_IMAGE_PROCESSORS
# override; only the auto-resolved backend is affected by the list.
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "FlavaImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="torchvision")
self.assertEqual(type(image_processor).__name__, "FlavaImageProcessor")
@require_torchvision
def test_legacy_fast_class_name_in_config(self):
# Checkpoints saved before the rename used names like "ViTImageProcessorFast".
# The *Fast suffix must be stripped and the correct backend variant returned.
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "ViTImageProcessorFast"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="torchvision")
self.assertIsInstance(image_processor, ViTImageProcessor)
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="pil")
self.assertIsInstance(image_processor, ViTImageProcessorPil)
@require_vision
def test_register_with_image_processor_classes_dict(self):
# New image_processor_classes={} dict API for register().
try:
AutoImageProcessor.register(CustomConfig, image_processor_classes={"pil": CustomImageProcessor})
with tempfile.TemporaryDirectory() as tmp_dir:
json.dump(
{"image_processor_type": "CustomImageProcessor"},
open(Path(tmp_dir) / "preprocessor_config.json", "w"),
)
image_processor = AutoImageProcessor.from_pretrained(tmp_dir, backend="pil")
self.assertIsInstance(image_processor, CustomImageProcessor)
finally:
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
@require_vision
def test_register_legacy_slow_fast_params(self):
# slow_image_processor_class= and fast_image_processor_class= are deprecated but
# must still work; they map to "pil" and "torchvision" backends respectively.
try:
AutoImageProcessor.register(CustomConfig, slow_image_processor_class=CustomImageProcessor)
with tempfile.TemporaryDirectory() as tmp_dir:
json.dump(
{"image_processor_type": "CustomImageProcessor"},
open(Path(tmp_dir) / "preprocessor_config.json", "w"),
)
image_processor = AutoImageProcessor.from_pretrained(tmp_dir, backend="pil")
self.assertIsInstance(image_processor, CustomImageProcessor)
finally:
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]

View File

@@ -0,0 +1,599 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import os
import sys
import tempfile
import unittest
from collections import OrderedDict
from pathlib import Path
import pytest
import transformers
from transformers import BertConfig, GPT2Model, is_torch_available
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
from transformers.testing_utils import (
DUMMY_UNKNOWN_IDENTIFIER,
RequestCounter,
require_peft,
require_torch,
slow,
)
from transformers.utils import ADAPTER_CONFIG_NAME
from ..bert.test_modeling_bert import BertModelTester
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
if is_torch_available():
import torch
from test_module.custom_modeling import CustomModel
from transformers import (
AutoBackbone,
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForMaskedLM,
AutoModelForPreTraining,
AutoModelForQuestionAnswering,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoModelForTableQuestionAnswering,
AutoModelForTokenClassification,
BertForMaskedLM,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
BertModel,
FunnelBaseModel,
FunnelModel,
GenerationMixin,
GPT2Config,
GPT2LMHeadModel,
ResNetBackbone,
T5Config,
T5ForConditionalGeneration,
TapasConfig,
TapasForQuestionAnswering,
TimmBackbone,
)
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
MODEL_FOR_PRETRAINING_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_MAPPING,
)
@require_torch
class AutoModelTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
@slow
def test_model_from_pretrained(self):
model_name = "google-bert/bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModel.from_pretrained(model_name)
model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertModel)
self.assertEqual(len(loading_info["missing_keys"]), 0)
# When using PyTorch checkpoint, the expected value is `8`. With `safetensors` checkpoint (if it is
# installed), the expected value becomes `7`.
EXPECTED_NUM_OF_UNEXPECTED_KEYS = 7
self.assertEqual(len(loading_info["unexpected_keys"]), EXPECTED_NUM_OF_UNEXPECTED_KEYS)
self.assertEqual(len(loading_info["mismatched_keys"]), 0)
self.assertEqual(len(loading_info["error_msgs"]), 0)
@slow
def test_model_for_pretraining_from_pretrained(self):
model_name = "google-bert/bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelForPreTraining.from_pretrained(model_name)
model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForPreTraining)
# Only one value should not be initialized and in the missing keys.
for value in loading_info.values():
self.assertEqual(len(value), 0)
@slow
def test_model_for_causal_lm(self):
model_name = "openai-community/gpt2"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, GPT2Config)
model = AutoModelForCausalLM.from_pretrained(model_name)
model, loading_info = AutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, GPT2LMHeadModel)
@slow
def test_model_for_masked_lm(self):
model_name = "google-bert/bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelForMaskedLM.from_pretrained(model_name)
model, loading_info = AutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForMaskedLM)
@slow
def test_model_for_encoder_decoder_lm(self):
model_name = "google-t5/t5-base"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, T5Config)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model, loading_info = AutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, T5ForConditionalGeneration)
@slow
def test_sequence_classification_model_from_pretrained(self):
model_name = "google-bert/bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForSequenceClassification)
@slow
def test_question_answering_model_from_pretrained(self):
model_name = "google-bert/bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForQuestionAnswering)
@slow
def test_table_question_answering_model_from_pretrained(self):
model_name = "google/tapas-base"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, TapasConfig)
model = AutoModelForTableQuestionAnswering.from_pretrained(model_name)
model, loading_info = AutoModelForTableQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TapasForQuestionAnswering)
@slow
def test_token_classification_model_from_pretrained(self):
model_name = "google-bert/bert-base-uncased"
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model, loading_info = AutoModelForTokenClassification.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForTokenClassification)
@slow
def test_auto_backbone_timm_model_from_pretrained(self):
# Configs can't be loaded for timm models
model = AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True)
with pytest.raises(ValueError):
# We can't pass output_loading_info=True as we're loading from timm
AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TimmBackbone)
# Check kwargs are correctly passed to the backbone
model = AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True, out_indices=(-2, -1))
self.assertEqual(model.out_indices, [-2, -1])
# Check out_features cannot be passed to Timm backbones
with self.assertRaises(ValueError):
_ = AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True, out_features=["stage1"])
@slow
def test_auto_backbone_from_pretrained(self):
model = AutoBackbone.from_pretrained("microsoft/resnet-18")
model, loading_info = AutoBackbone.from_pretrained("microsoft/resnet-18", output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, ResNetBackbone)
# Check kwargs are correctly passed to the backbone
model = AutoBackbone.from_pretrained("microsoft/resnet-18", out_indices=[-2, -1])
self.assertEqual(model.out_indices, [-2, -1])
self.assertEqual(model.out_features, ["stage3", "stage4"])
model = AutoBackbone.from_pretrained("microsoft/resnet-18", out_features=["stage2", "stage4"])
self.assertEqual(model.out_indices, [2, 4])
self.assertEqual(model.out_features, ["stage2", "stage4"])
def test_from_pretrained_with_tuple_values(self):
# For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
model = AutoModel.from_pretrained("sgugger/funnel-random-tiny")
self.assertIsInstance(model, FunnelModel)
config = copy.deepcopy(model.config)
config.architectures = ["FunnelBaseModel"]
model = AutoModel.from_config(config)
self.assertIsInstance(model, FunnelBaseModel)
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
model = AutoModel.from_pretrained(tmp_dir)
self.assertIsInstance(model, FunnelBaseModel)
def test_from_pretrained_dynamic_model_local(self):
try:
AutoConfig.register("custom", CustomConfig)
AutoModel.register(CustomConfig, CustomModel)
config = CustomConfig(hidden_size=32)
model = CustomModel(config)
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
new_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
for p1, p2 in zip(model.parameters(), new_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in MODEL_MAPPING._extra_content:
del MODEL_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_model_distant(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(model.__class__.__name__, "NewModel")
# Test the dynamic module is loaded only once.
reloaded_model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertIs(model.__class__, reloaded_model.__class__)
# Test model can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
reloaded_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertEqual(reloaded_model.__class__.__name__, "NewModel")
for p1, p2 in zip(model.parameters(), reloaded_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
# Test the dynamic module is reloaded if we force it.
reloaded_model = AutoModel.from_pretrained(
"hf-internal-testing/test_dynamic_model", trust_remote_code=True, force_download=True
)
self.assertIsNot(model.__class__, reloaded_model.__class__)
# This one uses a relative import to a util file, this checks it is downloaded and used properly.
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model_with_util", trust_remote_code=True)
self.assertEqual(model.__class__.__name__, "NewModel")
# Test the dynamic module is loaded only once.
reloaded_model = AutoModel.from_pretrained(
"hf-internal-testing/test_dynamic_model_with_util", trust_remote_code=True
)
self.assertIs(model.__class__, reloaded_model.__class__)
# Test model can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
reloaded_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertEqual(reloaded_model.__class__.__name__, "NewModel")
for p1, p2 in zip(model.parameters(), reloaded_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
# Test the dynamic module is reloaded if we force it.
reloaded_model = AutoModel.from_pretrained(
"hf-internal-testing/test_dynamic_model_with_util", trust_remote_code=True, force_download=True
)
self.assertIsNot(model.__class__, reloaded_model.__class__)
def test_from_pretrained_dynamic_model_distant_with_ref(self):
model = AutoModel.from_pretrained("hf-internal-testing/ref_to_test_dynamic_model", trust_remote_code=True)
self.assertEqual(model.__class__.__name__, "NewModel")
# Test model can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
reloaded_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertEqual(reloaded_model.__class__.__name__, "NewModel")
for p1, p2 in zip(model.parameters(), reloaded_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
# This one uses a relative import to a util file, this checks it is downloaded and used properly.
model = AutoModel.from_pretrained(
"hf-internal-testing/ref_to_test_dynamic_model_with_util", trust_remote_code=True
)
self.assertEqual(model.__class__.__name__, "NewModel")
# Test model can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
reloaded_model = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertEqual(reloaded_model.__class__.__name__, "NewModel")
for p1, p2 in zip(model.parameters(), reloaded_model.parameters()):
self.assertTrue(torch.equal(p1, p2))
def test_from_pretrained_dynamic_model_with_period(self):
# We used to have issues where repos with "." in the name would cause issues because the Python
# import machinery would treat that as a directory separator, so we test that case
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model_v1.0")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model_v1.0", trust_remote_code=False)
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model_v1.0", trust_remote_code=True)
self.assertEqual(model.__class__.__name__, "NewModel")
# Test that it works with a custom cache dir too
with tempfile.TemporaryDirectory() as tmp_dir:
with unittest.mock.patch.dict(os.environ, {"HF_XET_CACHE": tmp_dir}):
model = AutoModel.from_pretrained(
"hf-internal-testing/test_dynamic_model_v1.0", trust_remote_code=True, cache_dir=tmp_dir
)
self.assertEqual(model.__class__.__name__, "NewModel")
def test_new_model_registration(self):
AutoConfig.register("custom", CustomConfig)
auto_classes = [
AutoModel,
AutoModelForCausalLM,
AutoModelForMaskedLM,
AutoModelForPreTraining,
AutoModelForQuestionAnswering,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
]
try:
for auto_class in auto_classes:
with self.subTest(auto_class.__name__):
# Wrong config class will raise an error
with self.assertRaises(ValueError):
auto_class.register(BertConfig, CustomModel)
auto_class.register(CustomConfig, CustomModel)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
auto_class.register(BertConfig, BertModel)
# Now that the config is registered, it can be used as any other config with the auto-API
tiny_config = BertModelTester(self).get_config()
config = CustomConfig(**tiny_config.to_dict())
model = auto_class.from_config(config)
self.assertIsInstance(model, CustomModel)
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_pretrained(tmp_dir)
new_model = auto_class.from_pretrained(tmp_dir)
# The model is a CustomModel but from the new dynamically imported class.
self.assertIsInstance(new_model, CustomModel)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
for mapping in (
MODEL_MAPPING,
MODEL_FOR_PRETRAINING_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
):
if CustomConfig in mapping._extra_content:
del mapping._extra_content[CustomConfig]
def test_from_pretrained_dynamic_model_conflict(self):
class NewModelConfigLocal(BertConfig):
model_type = "new-model"
def __init__(self, **kwargs):
super().__init__(**kwargs)
class NewModel(BertModel):
config_class = NewModelConfigLocal
try:
AutoConfig.register("new-model", NewModelConfigLocal)
AutoModel.register(NewModelConfigLocal, NewModel)
# If remote code is not set, the default is to use local
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model")
self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal")
# If remote code is disabled, we load the local one.
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=False)
self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal")
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal")
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewModelConfigLocal.__module__ = "transformers.models.new_model.configuration_new_model"
NewModel.__module__ = "transformers.models.new_model.modeling_new_model"
model = AutoModel.from_pretrained("hf-internal-testing/test_dynamic_model", trust_remote_code=True)
self.assertEqual(model.config.__class__.__name__, "NewModelConfig")
finally:
if "new-model" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["new-model"]
if NewModelConfigLocal in MODEL_MAPPING._extra_content:
del MODEL_MAPPING._extra_content[NewModelConfigLocal]
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
):
_ = AutoModel.from_pretrained("bert-base")
def test_revision_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
):
_ = AutoModel.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
@unittest.skip("Failing on main")
def test_cached_model_has_minimum_calls_to_head(self):
# Make sure we have cached the model.
_ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert")
with RequestCounter() as counter:
_ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert")
self.assertEqual(counter["GET"], 0)
self.assertEqual(counter["HEAD"], 1)
self.assertEqual(counter.total_calls, 1)
# With a sharded checkpoint
_ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded")
with RequestCounter() as counter:
_ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded")
self.assertEqual(counter["GET"], 0)
self.assertEqual(counter["HEAD"], 1)
self.assertEqual(counter.total_calls, 1)
def test_attr_not_existing(self):
from transformers.models.auto.auto_factory import _LazyAutoMapping
_CONFIG_MAPPING_NAMES = OrderedDict([("bert", "BertConfig")])
_MODEL_MAPPING_NAMES = OrderedDict([("bert", "GhostModel")])
_MODEL_MAPPING = _LazyAutoMapping(_CONFIG_MAPPING_NAMES, _MODEL_MAPPING_NAMES)
with pytest.raises(ValueError, match=r"Could not find GhostModel neither in .* nor in .*!"):
_MODEL_MAPPING[BertConfig]
_MODEL_MAPPING_NAMES = OrderedDict([("bert", "BertModel")])
_MODEL_MAPPING = _LazyAutoMapping(_CONFIG_MAPPING_NAMES, _MODEL_MAPPING_NAMES)
self.assertEqual(_MODEL_MAPPING[BertConfig], BertModel)
_MODEL_MAPPING_NAMES = OrderedDict([("bert", "GPT2Model")])
_MODEL_MAPPING = _LazyAutoMapping(_CONFIG_MAPPING_NAMES, _MODEL_MAPPING_NAMES)
self.assertEqual(_MODEL_MAPPING[BertConfig], GPT2Model)
def test_custom_model_patched_generation_inheritance(self):
"""
Tests that our inheritance patching for generate-compatible models works as expected. Without this feature,
old Hub models lose the ability to call `generate`.
"""
model = AutoModelForCausalLM.from_pretrained(
"hf-internal-testing/test_dynamic_model_generation", trust_remote_code=True
)
self.assertTrue(model.__class__.__name__ == "NewModelForCausalLM")
# It inherits from GenerationMixin. This means it can `generate`. Because `PreTrainedModel` is scheduled to
# stop inheriting from `GenerationMixin` in v4.50, this check will fail if patching is not present.
self.assertTrue(isinstance(model, GenerationMixin))
# More precisely, it directly inherits from GenerationMixin. This check would fail prior to v4.45 (inheritance
# patching was added in v4.45)
self.assertTrue("GenerationMixin" in str(model.__class__.__bases__))
@unittest.skip("@Cyril: add the post_init() on the hub repo")
def test_model_with_dotted_name_and_relative_imports(self):
"""
Test for issue #40496: AutoModel.from_pretrained() doesn't work for models with '.' in their name
when there's a relative import.
Without the fix, this raises: ModuleNotFoundError:
No module named 'transformers_modules.hf-internal-testing.remote_code_model_with_dots_v1'
"""
model_id = "hf-internal-testing/remote_code_model_with_dots_v1.0"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
self.assertIsNotNone(model)
@require_peft
def test_adapter_path_not_overwritten_for_complete_model(self):
"""
Test for issue #43746: Only overwrite the pretrained_model_name_or_path if needed with adapter.
This test ensures that when a model has an adapter config and the pretrained_model_name_or_path
points to a model directory with both a base model and an embedded adapter, the path should NOT
be overwritten with the hub model name embedded in the adapter's config.
The bug was that the path was being unconditionally overwritten, which would cause
incorrect behavior when loading models with adapters that are embedded within the
same directory as the base model.
"""
peft_test_model = "peft-internal-testing/tiny-OPTForCausalLM-lora"
transformers_test_model = "hf-internal-testing/tiny-random-OPTForCausalLM"
# Create a temporary directory with a complete adapter model structure
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir = Path(tmp_dir)
# Save the model and adapter locally
config = AutoConfig.from_pretrained(transformers_test_model)
model = AutoModel.from_pretrained(transformers_test_model)
adapter_model = AutoModel.from_pretrained(peft_test_model)
config.save_pretrained(tmp_dir)
model.save_pretrained(tmp_dir)
adapter_model.save_pretrained(tmp_dir)
# Overwrite the base_model_name_or_path to an invalid value that
# would cause the load to fail later
adapter_config_path = tmp_dir / ADAPTER_CONFIG_NAME
with open(adapter_config_path, "r") as handle:
adapter_config = json.load(handle)
adapter_config["base_model_name_or_path"] = "some/model/that/does/not/exist"
with open(adapter_config_path, "w") as handle:
json.dump(adapter_config, handle)
# Load from the saved path and make sure it actually loads despite
# the invalid adapter config path
AutoModel.from_pretrained(tmp_dir)

View File

@@ -0,0 +1,673 @@
# Copyright 2021 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
from shutil import copyfile
from huggingface_hub import snapshot_download, upload_folder
import transformers
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
PROCESSOR_MAPPING,
TOKENIZER_MAPPING,
AutoConfig,
AutoFeatureExtractor,
AutoProcessor,
AutoTokenizer,
BaseVideoProcessor,
BertTokenizer,
CLIPImageProcessor,
FeatureExtractionMixin,
ImageProcessingMixin,
LlamaTokenizer,
LlavaOnevisionVideoProcessor,
LlavaProcessor,
ProcessorMixin,
SiglipImageProcessor,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
)
from transformers.models.auto.feature_extraction_auto import get_feature_extractor_config
from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.tokenization_auto import REGISTERED_TOKENIZER_CLASSES
from transformers.models.auto.video_processing_auto import get_video_processor_config
from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
from transformers.tokenization_python import TOKENIZER_CONFIG_FILE
from transformers.utils import (
FEATURE_EXTRACTOR_NAME,
PROCESSOR_NAME,
)
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402
from test_module.custom_processing import CustomProcessor # noqa E402
from test_module.custom_tokenization import CustomTokenizer # noqa E402
SAMPLE_PROCESSOR_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json")
SAMPLE_VOCAB_LLAMA = get_tests_dir("fixtures/test_sentencepiece.model")
SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
SAMPLE_CONFIG = get_tests_dir("fixtures/config.json")
SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures")
class AutoFeatureExtractorTest(unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_processor_from_model_shortcut(self):
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_local_directory_from_repo(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model_config = Wav2Vec2Config()
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
# save in new folder
model_config.save_pretrained(tmpdirname)
processor.save_pretrained(tmpdirname)
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_local_subfolder_from_repo(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
processor.save_pretrained(f"{tmpdirname}/processor_subfolder")
processor = Wav2Vec2Processor.from_pretrained(tmpdirname, subfolder="processor_subfolder")
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_local_directory_from_extractor_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
# copy relevant files
copyfile(SAMPLE_PROCESSOR_CONFIG, os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME))
copyfile(SAMPLE_VOCAB, os.path.join(tmpdirname, "vocab.json"))
copyfile(SAMPLE_CONFIG, os.path.join(tmpdirname, "config.json"))
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_subcomponent_get_config_dict_saved_as_nested_config(self):
"""
Tests that we can get config dict of a subcomponents of a processor,
even if they were saved as nested dict in `processor_config.json`
"""
# Test feature extractor first
with tempfile.TemporaryDirectory() as tmpdirname:
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
processor.save_pretrained(tmpdirname)
config_dict_1 = get_feature_extractor_config(tmpdirname)
feature_extractor_1 = Wav2Vec2FeatureExtractor(**config_dict_1)
self.assertIsInstance(feature_extractor_1, Wav2Vec2FeatureExtractor)
config_dict_2, _ = FeatureExtractionMixin.get_feature_extractor_dict(tmpdirname)
feature_extractor_2 = Wav2Vec2FeatureExtractor(**config_dict_2)
self.assertIsInstance(feature_extractor_2, Wav2Vec2FeatureExtractor)
self.assertEqual(config_dict_1, config_dict_2)
# Test image and video processors next
with tempfile.TemporaryDirectory() as tmpdirname:
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
processor.save_pretrained(tmpdirname)
config_dict_1 = get_image_processor_config(tmpdirname)
image_processor_1 = SiglipImageProcessor(**config_dict_1)
self.assertIsInstance(image_processor_1, SiglipImageProcessor)
config_dict_2, _ = ImageProcessingMixin.get_image_processor_dict(tmpdirname)
image_processor_2 = SiglipImageProcessor(**config_dict_2)
self.assertIsInstance(image_processor_2, SiglipImageProcessor)
self.assertEqual(config_dict_1, config_dict_2)
config_dict_1 = get_video_processor_config(tmpdirname)
video_processor_1 = LlavaOnevisionVideoProcessor(**config_dict_1)
self.assertIsInstance(video_processor_1, LlavaOnevisionVideoProcessor)
config_dict_2, _ = BaseVideoProcessor.get_video_processor_dict(tmpdirname)
video_processor_2 = LlavaOnevisionVideoProcessor(**config_dict_2)
self.assertIsInstance(video_processor_2, LlavaOnevisionVideoProcessor)
self.assertEqual(config_dict_1, config_dict_2)
def test_processor_from_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor()
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor, tokenizer)
# save in new folder
processor.save_pretrained(tmpdirname)
if not os.path.isfile(os.path.join(tmpdirname, PROCESSOR_NAME)):
# create one manually in order to perform this test's objective
config_dict = {"processor_class": "Wav2Vec2Processor"}
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as fp:
json.dump(config_dict, fp)
# drop `processor_class` in tokenizer config
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE)) as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "w") as f:
f.write(json.dumps(config_dict))
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_tokenizer_processor_class(self):
with tempfile.TemporaryDirectory() as tmpdirname:
feature_extractor = Wav2Vec2FeatureExtractor()
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor, tokenizer)
# save in new folder
processor.save_pretrained(tmpdirname)
# drop `processor_class` in processor
with open(os.path.join(tmpdirname, PROCESSOR_NAME)) as f:
config_dict = json.load(f)
config_dict.pop("processor_class")
with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
f.write(json.dumps(config_dict))
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_processor_from_local_directory_from_model_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model_config = Wav2Vec2Config(processor_class="Wav2Vec2Processor")
model_config.save_pretrained(tmpdirname)
# copy relevant files
copyfile(SAMPLE_VOCAB, os.path.join(tmpdirname, "vocab.json"))
# create empty sample processor
with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "w") as f:
f.write("{}")
processor = AutoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(processor, Wav2Vec2Processor)
def test_from_pretrained_dynamic_processor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor_updated")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=False
)
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=True
)
self.assertTrue(processor.special_attribute_present)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
feature_extractor = processor.feature_extractor
self.assertTrue(feature_extractor.special_attribute_present)
self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
tokenizer = processor.tokenizer
self.assertTrue(tokenizer.special_attribute_present)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
new_processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor", trust_remote_code=True, use_fast=False
)
new_tokenizer = new_processor.tokenizer
self.assertTrue(new_tokenizer.special_attribute_present)
self.assertEqual(new_tokenizer.__class__.__name__, "NewTokenizerFast")
def test_new_processor_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
AutoProcessor.register(CustomConfig, CustomProcessor)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoProcessor.register(Wav2Vec2Config, Wav2Vec2Processor)
# Now that the config is registered, it can be used as any other config with the auto-API
feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
with tempfile.TemporaryDirectory() as tmp_dir:
vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = CustomTokenizer(vocab_file)
processor = CustomProcessor(feature_extractor, tokenizer)
with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(tmp_dir)
new_processor = AutoProcessor.from_pretrained(tmp_dir)
self.assertIsInstance(new_processor, CustomProcessor)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None)
def test_from_pretrained_dynamic_processor_conflict(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
special_attribute_present = False
class NewTokenizer(BertTokenizer):
special_attribute_present = False
class NewProcessor(ProcessorMixin):
special_attribute_present = False
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
AutoProcessor.register(CustomConfig, NewProcessor)
# If remote code is not set, the default is to use local classes.
processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor_updated")
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertFalse(processor.special_attribute_present)
self.assertFalse(processor.feature_extractor.special_attribute_present)
self.assertFalse(processor.tokenizer.special_attribute_present)
# If remote code is disabled, we load the local ones.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=False
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertFalse(processor.special_attribute_present)
self.assertFalse(processor.feature_extractor.special_attribute_present)
self.assertFalse(processor.tokenizer.special_attribute_present)
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=True
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertFalse(processor.special_attribute_present)
self.assertFalse(processor.feature_extractor.special_attribute_present)
self.assertFalse(processor.tokenizer.special_attribute_present)
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewFeatureExtractor.__module__ = "transformers.models.custom.feature_extraction_custom"
NewTokenizer.__module__ = "transformers.models.custom.tokenization_custom"
NewProcessor.__module__ = "transformers.models.custom.configuration_custom"
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=True
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertTrue(processor.special_attribute_present)
self.assertTrue(processor.feature_extractor.special_attribute_present)
self.assertTrue(processor.tokenizer.special_attribute_present)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
pass
class NewTokenizer(BertTokenizer):
pass
class NewProcessor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
super().__init__(feature_extractor, tokenizer)
self.processor_attr_1 = processor_attr_1
self.processor_attr_2 = processor_attr_2
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
AutoProcessor.register(CustomConfig, NewProcessor)
# If remote code is not set, the default is to use local classes.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor_updated", processor_attr_2=False
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
self.assertEqual(processor.processor_attr_1, 1)
self.assertEqual(processor.processor_attr_2, False)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_dynamic_processor_with_specific_dynamic_subcomponents(self):
class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
pass
class NewTokenizer(BertTokenizer):
pass
class NewProcessor(ProcessorMixin):
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
try:
AutoConfig.register("custom", CustomConfig)
AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
AutoProcessor.register(CustomConfig, NewProcessor)
# If remote code is not set, the default is to use local classes.
processor = AutoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_processor_updated",
)
self.assertEqual(processor.__class__.__name__, "NewProcessor")
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
if CustomConfig in PROCESSOR_MAPPING._extra_content:
del PROCESSOR_MAPPING._extra_content[CustomConfig]
if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_auto_processor_creates_tokenizer(self):
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
self.assertEqual(processor.__class__.__name__, "BertTokenizer")
def test_auto_processor_creates_image_processor(self):
processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-convnext")
self.assertEqual(processor.__class__.__name__, "ConvNextImageProcessor")
def test_auto_processor_save_load(self):
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(tmp_dir)
second_processor = AutoProcessor.from_pretrained(tmp_dir)
self.assertEqual(second_processor.__class__.__name__, processor.__class__.__name__)
def test_processor_with_multiple_tokenizers_save_load(self):
"""Test that processors with multiple tokenizers save and load correctly."""
class DualTokenizerProcessor(ProcessorMixin):
"""A processor with two tokenizers and an image processor."""
def __init__(self, tokenizer, decoder_tokenizer, image_processor):
super().__init__(tokenizer, decoder_tokenizer, image_processor)
# Create processor with multiple tokenizers
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
image_processor = SiglipImageProcessor()
processor = DualTokenizerProcessor(
tokenizer=tokenizer,
decoder_tokenizer=decoder_tokenizer,
image_processor=image_processor,
)
with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(tmp_dir)
# Verify directory structure: primary tokenizer in root, additional in subfolder
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "tokenizer_config.json")))
self.assertTrue(os.path.isdir(os.path.join(tmp_dir, "decoder_tokenizer")))
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "decoder_tokenizer", "tokenizer_config.json")))
# Verify processor_config.json contains image_processor but not tokenizers
with open(os.path.join(tmp_dir, "processor_config.json")) as f:
processor_config = json.load(f)
self.assertIn("image_processor", processor_config)
self.assertNotIn("tokenizer", processor_config)
self.assertNotIn("decoder_tokenizer", processor_config)
# Reload the full processor and verify all attributes
loaded_processor = DualTokenizerProcessor.from_pretrained(tmp_dir)
# Verify the processor has all expected attributes
self.assertTrue(hasattr(loaded_processor, "tokenizer"))
self.assertTrue(hasattr(loaded_processor, "decoder_tokenizer"))
self.assertTrue(hasattr(loaded_processor, "image_processor"))
# Verify tokenizers loaded correctly
self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size)
self.assertEqual(loaded_processor.decoder_tokenizer.vocab_size, decoder_tokenizer.vocab_size)
# Verify image processor loaded correctly
self.assertEqual(loaded_processor.image_processor.size, image_processor.size)
def test_processor_with_multiple_image_processors_save_load(self):
"""Test that processors with multiple image processors save and load correctly."""
class DualImageProcessorProcessor(ProcessorMixin):
"""A processor with two image processors and a tokenizer."""
def __init__(self, tokenizer, image_processor, encoder_image_processor):
super().__init__(tokenizer, image_processor, encoder_image_processor)
# Create processor with multiple image processors
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
image_processor = SiglipImageProcessor(size={"height": 224, "width": 224})
encoder_image_processor = CLIPImageProcessor(size={"height": 384, "width": 384})
processor = DualImageProcessorProcessor(
tokenizer=tokenizer,
image_processor=image_processor,
encoder_image_processor=encoder_image_processor,
)
with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(tmp_dir)
# Verify processor_config.json contains both image processors
with open(os.path.join(tmp_dir, "processor_config.json")) as f:
processor_config = json.load(f)
self.assertIn("image_processor", processor_config)
self.assertIn("encoder_image_processor", processor_config)
self.assertNotIn("tokenizer", processor_config)
# Verify both image processors have the correct type key for instantiation
self.assertIn("image_processor_type", processor_config["image_processor"])
self.assertIn("image_processor_type", processor_config["encoder_image_processor"])
self.assertEqual(processor_config["image_processor"]["image_processor_type"], "SiglipImageProcessor")
self.assertEqual(processor_config["encoder_image_processor"]["image_processor_type"], "CLIPImageProcessor")
# Verify the sizes are different (to ensure they're separate configs)
self.assertEqual(processor_config["image_processor"]["size"], {"height": 224, "width": 224})
self.assertEqual(processor_config["encoder_image_processor"]["size"], {"height": 384, "width": 384})
# Reload the full processor and verify all attributes
loaded_processor = DualImageProcessorProcessor.from_pretrained(tmp_dir)
# Verify the processor has all expected attributes
self.assertTrue(hasattr(loaded_processor, "tokenizer"))
self.assertTrue(hasattr(loaded_processor, "image_processor"))
self.assertTrue(hasattr(loaded_processor, "encoder_image_processor"))
# Verify tokenizer loaded correctly
self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size)
# Verify image processors loaded correctly with their distinct sizes
self.assertEqual(loaded_processor.image_processor.size, {"height": 224, "width": 224})
self.assertEqual(loaded_processor.encoder_image_processor.size, {"height": 384, "width": 384})
# Verify they are different types
self.assertIsInstance(loaded_processor.image_processor, SiglipImageProcessor)
self.assertIsInstance(loaded_processor.encoder_image_processor, CLIPImageProcessor)
@is_staging_test
class ProcessorPushToHubTester(unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
@classmethod
def setUpClass(cls):
cls._token = TOKEN
def test_push_to_hub_via_save_pretrained(self):
with TemporaryHubRepo(token=self._token) as tmp_repo:
processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
# Push to hub via save_pretrained
with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token)
new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo.repo_id)
for k, v in processor.feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_processor.feature_extractor, k))
self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab())
def test_push_to_hub_in_organization_via_save_pretrained(self):
with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
# Push to hub via save_pretrained
with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(
tmp_dir,
repo_id=tmp_repo.repo_id,
push_to_hub=True,
token=self._token,
)
new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo.repo_id)
for k, v in processor.feature_extractor.__dict__.items():
self.assertEqual(v, getattr(new_processor.feature_extractor, k))
self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab())
def test_push_to_hub_dynamic_processor(self):
with TemporaryHubRepo(token=self._token) as tmp_repo:
CustomFeatureExtractor.register_for_auto_class()
CustomTokenizer.register_for_auto_class()
CustomProcessor.register_for_auto_class()
feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
with tempfile.TemporaryDirectory() as tmp_dir:
vocab_file = os.path.join(tmp_dir, "vocab.txt")
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
tokenizer = CustomTokenizer(vocab_file)
processor = CustomProcessor(feature_extractor, tokenizer)
with tempfile.TemporaryDirectory() as tmp_dir:
snapshot_download(tmp_repo.repo_id, token=self._token)
processor.save_pretrained(tmp_dir)
# This has added the proper auto_map field to the feature extractor config
self.assertDictEqual(
processor.feature_extractor.auto_map,
{
"AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor",
"AutoProcessor": "custom_processing.CustomProcessor",
},
)
# This has added the proper auto_map field to the tokenizer config
with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
tokenizer_config = json.load(f)
self.assertDictEqual(
tokenizer_config["auto_map"],
{
"AutoTokenizer": ["custom_tokenization.CustomTokenizer", None],
"AutoProcessor": "custom_processing.CustomProcessor",
},
)
# The code has been copied from fixtures
self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_feature_extraction.py")))
self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_tokenization.py")))
self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_processing.py")))
upload_folder(repo_id=tmp_repo.repo_id, folder_path=tmp_dir, token=self._token)
new_processor = AutoProcessor.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
# Can't make an isinstance check because the new_processor is from the CustomProcessor class of a dynamic module
self.assertEqual(new_processor.__class__.__name__, "CustomProcessor")
def test_push_to_hub_with_chat_templates(self):
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer = LlamaTokenizer.from_pretrained(SAMPLE_VOCAB_LLAMA)
image_processor = SiglipImageProcessor()
chat_template = "default dummy template for testing purposes only"
processor = LlavaProcessor(
tokenizer=tokenizer, image_processor=image_processor, chat_template=chat_template
)
self.assertEqual(processor.chat_template, chat_template)
with TemporaryHubRepo(token=self._token) as tmp_repo:
processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, token=self._token, push_to_hub=True)
reloaded_processor = LlavaProcessor.from_pretrained(tmp_repo.repo_id)
self.assertEqual(processor.chat_template, reloaded_processor.chat_template)
# When we save as single files, tokenizers and processors share a chat template, which means
# the reloaded tokenizer should get the chat template as well
self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template)
with TemporaryHubRepo(token=self._token) as tmp_repo:
processor.chat_template = {"default": "a", "secondary": "b"}
processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, token=self._token, push_to_hub=True)
reloaded_processor = LlavaProcessor.from_pretrained(tmp_repo.repo_id)
self.assertEqual(processor.chat_template, reloaded_processor.chat_template)
# When we save as single files, tokenizers and processors share a chat template, which means
# the reloaded tokenizer should get the chat template as well
self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template)

View File

@@ -0,0 +1,810 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path
from unittest import mock
import pytest
import transformers
from transformers import (
AutoTokenizer,
BertConfig,
BertTokenizer,
BertTokenizerFast,
CTRLTokenizer,
GPT2Tokenizer,
HerbertTokenizer,
PreTrainedTokenizerFast,
PythonBackend,
Qwen2Tokenizer,
Qwen2TokenizerFast,
Qwen3MoeConfig,
RobertaTokenizer,
TokenizersBackend,
is_tokenizers_available,
logging,
)
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
from transformers.models.auto.tokenization_auto import (
REGISTERED_FAST_ALIASES,
REGISTERED_TOKENIZER_CLASSES,
TOKENIZER_MAPPING,
TOKENIZER_MAPPING_NAMES,
get_tokenizer_config,
tokenizer_class_from_name,
)
from transformers.models.roberta.configuration_roberta import RobertaConfig
from transformers.testing_utils import (
DUMMY_DIFF_TOKENIZER_IDENTIFIER,
DUMMY_UNKNOWN_IDENTIFIER,
SMALL_MODEL_IDENTIFIER,
CaptureLogger,
RequestCounter,
require_sentencepiece,
require_tokenizers,
slow,
)
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_tokenization import CustomTokenizer # noqa E402
if is_tokenizers_available():
from test_module.custom_tokenization_fast import CustomTokenizerFast
class AutoTokenizerTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
@slow
def test_tokenizer_from_pretrained(self):
for model_name in ("google-bert/bert-base-uncased", "google-bert/bert-base-cased"):
tokenizer = AutoTokenizer.from_pretrained(model_name)
self.assertIsNotNone(tokenizer)
self.assertIsInstance(tokenizer, (BertTokenizer))
self.assertGreater(len(tokenizer), 0)
for model_name in ["openai-community/gpt2", "openai-community/gpt2-medium"]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
self.assertIsNotNone(tokenizer)
self.assertIsInstance(tokenizer, (GPT2Tokenizer))
self.assertGreater(len(tokenizer), 0)
def test_tokenizer_from_pretrained_identifier(self):
tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
self.assertIsInstance(tokenizer, (BertTokenizer))
self.assertEqual(tokenizer.vocab_size, 12)
def test_tokenizer_from_model_type(self):
tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
self.assertIsInstance(tokenizer, (RobertaTokenizer))
self.assertEqual(tokenizer.vocab_size, 20)
def test_tokenizer_from_tokenizer_class(self):
config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
self.assertIsInstance(config, RobertaConfig)
# Check that tokenizer_type ≠ model_type
tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config)
self.assertIsInstance(tokenizer, (BertTokenizer))
self.assertEqual(tokenizer.vocab_size, 12)
def test_tokenizer_from_type(self):
with tempfile.TemporaryDirectory() as tmp_dir:
shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))
tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert", use_fast=False)
self.assertIsInstance(tokenizer, BertTokenizer)
with tempfile.TemporaryDirectory() as tmp_dir:
shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))
tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2", use_fast=False)
self.assertIsInstance(tokenizer, GPT2Tokenizer)
@require_tokenizers
def test_tokenizer_from_type_fast(self):
with tempfile.TemporaryDirectory() as tmp_dir:
shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))
tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert")
self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
with tempfile.TemporaryDirectory() as tmp_dir:
shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))
tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2")
self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
def test_tokenizer_from_type_incorrect_name(self):
with pytest.raises(ValueError):
AutoTokenizer.from_pretrained("./", tokenizer_type="xxx")
@require_tokenizers
def test_tokenizer_identifier_with_correct_config(self):
for tokenizer_class in [BertTokenizer, AutoTokenizer]:
tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
self.assertIsInstance(tokenizer, (BertTokenizer))
self.assertEqual(tokenizer.do_lower_case, False)
self.assertEqual(tokenizer.model_max_length, 512)
@require_tokenizers
def test_tokenizer_identifier_non_existent(self):
for tokenizer_class in [BertTokenizer, AutoTokenizer]:
with self.assertRaisesRegex(
EnvironmentError,
"julien-c/herlolip-not-exists is not a local folder and is not a valid model identifier",
):
_ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")
def test_model_name_edge_cases_in_mappings(self):
# tests: https://github.com/huggingface/transformers/pull/13251
# 1. models with `-`, e.g. xlm-roberta -> xlm_roberta
# 2. models that don't remap 1-1 from model-name to model file, e.g., openai-gpt -> openai
tokenizers = TOKENIZER_MAPPING.values()
tokenizer_names = []
for tokenizer_entry in tokenizers:
candidates = tokenizer_entry if isinstance(tokenizer_entry, tuple) else (tokenizer_entry,)
for tokenizer_cls in candidates:
if tokenizer_cls is not None:
tokenizer_names.append(tokenizer_cls.__name__)
for tokenizer_name in tokenizer_names:
# must find the right class
tokenizer_class_from_name(tokenizer_name)
def test_tokenizer_mapping_names_use_single_entries(self):
# this is just to ensure tokenizer mapping names are correct and map to strings!
invalid_entries = [
model_name
for model_name, tokenizer_entry in TOKENIZER_MAPPING_NAMES.items()
if isinstance(tokenizer_entry, (tuple, list))
]
self.assertListEqual(
invalid_entries,
[],
msg=(
"TOKENIZER_MAPPING_NAMES should map model types to single tokenizer class names. "
f"Found invalid mappings for: {invalid_entries}"
),
)
@require_tokenizers
def test_from_pretrained_use_fast_toggle(self):
self.assertIsInstance(
AutoTokenizer.from_pretrained("google-bert/bert-base-cased", use_fast=False), BertTokenizer
)
self.assertIsInstance(AutoTokenizer.from_pretrained("google-bert/bert-base-cased"), BertTokenizerFast)
@require_tokenizers
@slow
def test_custom_tokenizer_from_hub(self):
tokenizer = AutoTokenizer.from_pretrained(
"openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True, revision="fd7f352fac0e06d0d818b23f98e3ec8c64267a57"
)
self.assertTrue(tokenizer.__class__.__module__.startswith("transformers_modules."))
@require_tokenizers
@slow
def test_remote_code_imports_removed_fast_submodule(self):
# BC v5: remote tokenizer code may import from a deprecated tokenization_*_fast
tokenizer = AutoTokenizer.from_pretrained(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
trust_remote_code=True,
revision="a9af15a6372d7d6b25e9fb07c2ccb9e1fe645644",
)
self.assertGreater(len(tokenizer("hello world")["input_ids"]), 0)
@require_tokenizers
def test_voxtral_tokenizer_converts_from_tekken(self):
# Test that voxtral tokenizer loads correctly when falling back to TokenizersBackend
# (i.e., when MistralCommonBackend is not available)
repo_id = "mistralai/Voxtral-Mini-3B-2507"
# Simulate the fallback path by temporarily changing the mapping for voxtral
# from MistralCommonBackend to TokenizersBackend
with mock.patch.dict(TOKENIZER_MAPPING_NAMES, {"voxtral": "TokenizersBackend"}):
tokenizer = AutoTokenizer.from_pretrained(repo_id)
self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
self.assertTrue(tokenizer.is_fast)
self.assertGreater(len(tokenizer("Voxtral")["input_ids"]), 0)
@require_tokenizers
def test_do_lower_case(self):
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", do_lower_case=False)
sample = "Hello, world. How are you?"
tokens = tokenizer.tokenize(sample)
self.assertEqual("[UNK]", tokens[0])
tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base", do_lower_case=False)
tokens = tokenizer.tokenize(sample)
self.assertEqual("[UNK]", tokens[0])
@require_tokenizers
def test_PreTrainedTokenizerFast_from_pretrained(self):
tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
self.assertEqual(type(tokenizer), PreTrainedTokenizerFast)
self.assertEqual(tokenizer.model_max_length, 512)
self.assertEqual(tokenizer.vocab_size, 30000)
self.assertEqual(tokenizer.unk_token, "[UNK]")
self.assertEqual(tokenizer.padding_side, "right")
self.assertEqual(tokenizer.truncation_side, "right")
def test_auto_tokenizer_from_local_folder(self):
tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
self.assertIsInstance(tokenizer, (BertTokenizer))
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer.save_pretrained(tmp_dir)
tokenizer2 = AutoTokenizer.from_pretrained(tmp_dir)
self.assertIsInstance(tokenizer2, tokenizer.__class__)
self.assertEqual(tokenizer2.vocab_size, 12)
def test_auto_tokenizer_from_local_folder_mistral_detection(self):
"""See #42374 and #45444 for reference, ensuring proper mistral detection on local tokenizers"""
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-235B-A22B-Thinking-2507")
config = Qwen3MoeConfig.from_pretrained("Qwen/Qwen3-235B-A22B-Thinking-2507")
self.assertIsInstance(tokenizer, (Qwen2Tokenizer, Qwen2TokenizerFast))
mistral_warning = (
"with an incorrect regex pattern: "
"https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84"
"#69121093e8b480e709447d5e"
)
logger = logging.get_logger("transformers.tokenization_utils_tokenizers")
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer.save_pretrained(tmp_dir)
config_path = os.path.join(tmp_dir, "config.json")
def _write_config(**overrides):
config_dict = config.to_diff_dict()
for key, value in overrides.items():
if value is None:
config_dict.pop(key, None)
else:
config_dict[key] = value
with open(config_path, "w", encoding="utf-8") as f:
json.dump(config_dict, f, indent=2, sort_keys=True)
# Case 1: Tokenizer with no config associated must not warn
with CaptureLogger(logger) as cl:
AutoTokenizer.from_pretrained(tmp_dir)
self.assertNotIn(mistral_warning, cl.out)
# Case 2: Non-mistral local config must not warn for any `transformers_version`
for saved_version in ("4.57.2", "4.57.3", "4.57.6", "5.0.1"):
_write_config(transformers_version=saved_version)
with CaptureLogger(logger) as cl:
tokenizer2 = AutoTokenizer.from_pretrained(tmp_dir)
self.assertNotIn(
mistral_warning,
cl.out,
msg=f"Unexpected mistral regex warning for non-mistral config (transformers_version={saved_version!r})",
)
# Case 3: Mistral-family local config saved by an affected transformers release
# must still warn, even up to 4.57.6
for saved_version in ("4.57.3", "4.57.6"):
_write_config(model_type="mistral", transformers_version=saved_version)
with CaptureLogger(logger) as cl:
AutoTokenizer.from_pretrained(tmp_dir)
self.assertIn(
mistral_warning,
cl.out,
msg=f"Missing mistral regex warning for mistral config (transformers_version={saved_version!r})",
)
# Case 4: Mistral-family local config saved by a fixed transformers release must not warn
_write_config(model_type="mistral", transformers_version="5.0.1")
with CaptureLogger(logger) as cl:
AutoTokenizer.from_pretrained(tmp_dir)
self.assertNotIn(mistral_warning, cl.out)
self.assertIsInstance(tokenizer2, tokenizer.__class__)
self.assertTrue(tokenizer2.vocab_size > 100_000)
def test_auto_tokenizer_from_mistral_patching(self):
"""See #43376, regression when kwarg is manually passed to patch the regex in mistral tokenizers"""
AutoTokenizer.from_pretrained(
"mistralai/Ministral-3-3B-Instruct-2512", fix_mistral_regex=True
) # should not error
@require_tokenizers
def test_auto_tokenizer_loads_bloom_repo_without_tokenizer_class(self):
tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")
self.assertIsInstance(tokenizer, TokenizersBackend)
self.assertTrue(tokenizer.is_fast)
@require_tokenizers
def test_auto_tokenizer_loads_sentencepiece_only_repo(self):
tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-mbart")
self.assertIsInstance(tokenizer, TokenizersBackend)
self.assertTrue(tokenizer.is_fast)
def test_auto_tokenizer_fast_no_slow(self):
tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
# There is no fast CTRL so this always gives us a slow tokenizer.
self.assertIsInstance(tokenizer, CTRLTokenizer)
def test_get_tokenizer_config(self):
# Check we can load the tokenizer config of an online model.
config = get_tokenizer_config("google-bert/bert-base-cased")
_ = config.pop("_commit_hash", None)
# If we ever update google-bert/bert-base-cased tokenizer config, this dict here will need to be updated.
self.assertEqual(config, {"do_lower_case": False, "model_max_length": 512})
# This model does not have a tokenizer_config so we get back an empty dict.
config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
self.assertDictEqual(config, {})
# A tokenizer saved with `save_pretrained` always creates a tokenizer config.
tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer.save_pretrained(tmp_dir)
config = get_tokenizer_config(tmp_dir)
# Check the class of the tokenizer was properly saved (note that it always saves the slow class).
self.assertEqual(config["tokenizer_class"], "BertTokenizer")
def test_new_tokenizer_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)
tokenizer = CustomTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer.save_pretrained(tmp_dir)
new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
self.assertIsInstance(new_tokenizer, TokenizersBackend)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None)
@require_tokenizers
def test_new_tokenizer_fast_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
# Can register in two steps (fast takes precedence)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizer)
AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizerFast)
del TOKENIZER_MAPPING._extra_content[CustomConfig]
# Can register in one step
AutoTokenizer.register(
CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
)
self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizerFast)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)
# We pass through a bert tokenizer fast cause there is no converter slow to fast for our new toknizer
# and that model does not have a tokenizer.json
with tempfile.TemporaryDirectory() as tmp_dir:
bert_tokenizer = BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER)
bert_tokenizer.save_pretrained(tmp_dir)
tokenizer = CustomTokenizerFast.from_pretrained(tmp_dir)
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer.save_pretrained(tmp_dir)
new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
self.assertIsInstance(new_tokenizer, CustomTokenizerFast)
new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, use_fast=False)
self.assertIsInstance(new_tokenizer, CustomTokenizerFast)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None)
REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizerFast", None)
REGISTERED_FAST_ALIASES.pop("CustomTokenizer", None)
def test_from_pretrained_dynamic_tokenizer(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True)
self.assertTrue(tokenizer.special_attribute_present)
# Test the dynamic module is loaded only once.
reloaded_tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
)
self.assertIs(tokenizer.__class__, reloaded_tokenizer.__class__)
# Test tokenizer can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer.save_pretrained(tmp_dir)
reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertTrue(reloaded_tokenizer.special_attribute_present)
if is_tokenizers_available():
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")
# Test we can also load the slow version
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
)
self.assertTrue(tokenizer.special_attribute_present)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
# Test tokenizer can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
tokenizer.save_pretrained(tmp_dir)
reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True, use_fast=False)
self.assertTrue(
os.path.exists(os.path.join(tmp_dir, "tokenization.py"))
) # Assert we saved tokenizer code
self.assertEqual(reloaded_tokenizer._auto_class, "AutoTokenizer")
with open(os.path.join(tmp_dir, "tokenizer_config.json"), "r") as f:
tokenizer_config = json.load(f)
# Assert we're pointing at local code and not another remote repo
self.assertEqual(
tokenizer_config["auto_map"]["AutoTokenizer"],
["tokenization.NewTokenizer", "tokenization_fast.NewTokenizerFast"],
)
self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")
self.assertTrue(reloaded_tokenizer.special_attribute_present)
else:
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
# Test the dynamic module is reloaded if we force it.
reloaded_tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, force_download=True
)
self.assertIsNot(tokenizer.__class__, reloaded_tokenizer.__class__)
self.assertTrue(reloaded_tokenizer.special_attribute_present)
@slow
def test_custom_tokenizer_init(self):
tokenizer = AutoTokenizer.from_pretrained(
"Qwen/Qwen-VL", trust_remote_code=True, revision="0547ed36a86561e2e42fecec8fd0c4f6953e33c4"
)
self.assertIsInstance(tokenizer, PythonBackend)
self.assertGreater(len(tokenizer.get_vocab()), 0)
@require_tokenizers
def test_from_pretrained_dynamic_tokenizer_conflict(self):
class NewTokenizer(BertTokenizer):
special_attribute_present = False
try:
AutoConfig.register("custom", CustomConfig)
AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
# If remote code is not set, the default is to use local
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", use_fast=False)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertFalse(tokenizer.special_attribute_present)
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False, use_fast=False
)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertFalse(tokenizer.special_attribute_present)
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
self.assertFalse(tokenizer.special_attribute_present)
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewTokenizer.__module__ = "transformers.models.custom.configuration_custom"
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
self.assertTrue(tokenizer.special_attribute_present)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in TOKENIZER_MAPPING._extra_content:
del TOKENIZER_MAPPING._extra_content[CustomConfig]
REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True
)
self.assertTrue(tokenizer.special_attribute_present)
if is_tokenizers_available():
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
# Test we can also load the slow version
tokenizer = AutoTokenizer.from_pretrained(
"hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True, use_fast=False
)
self.assertTrue(tokenizer.special_attribute_present)
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
else:
self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, "bert-base is not a local folder and is not a valid model identifier"
):
_ = AutoTokenizer.from_pretrained("bert-base")
def test_revision_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
):
_ = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
@unittest.skip("This test is failing on main") # TODO Matt/ydshieh, fix this test!
def test_cached_tokenizer_has_minimum_calls_to_head(self):
# Make sure we have cached the tokenizer.
_ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
with RequestCounter() as counter:
_ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
self.assertEqual(counter["GET"], 0)
self.assertEqual(counter["HEAD"], 1)
self.assertEqual(counter.total_calls, 1)
def test_init_tokenizer_with_trust(self):
nop_tokenizer_code = """
import transformers
class NopTokenizer(transformers.PreTrainedTokenizer):
def get_vocab(self):
return {}
"""
nop_config_code = """
from transformers import PreTrainedConfig
class NopConfig(PreTrainedConfig):
model_type = "test_unregistered_dynamic"
def __init__(self, **kwargs):
super().__init__(**kwargs)
"""
with tempfile.TemporaryDirectory() as tmp_dir:
fake_model_id = "hf-internal-testing/test_unregistered_dynamic"
fake_repo = os.path.join(tmp_dir, fake_model_id)
os.makedirs(fake_repo)
tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py")
with open(tokenizer_src_file, "w") as wfp:
wfp.write(nop_tokenizer_code)
model_config_src_file = os.path.join(fake_repo, "config.py")
with open(model_config_src_file, "w") as wfp:
wfp.write(nop_config_code)
config = {
"model_type": "test_unregistered_dynamic",
"auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"},
}
config_file = os.path.join(fake_repo, "config.json")
with open(config_file, "w") as wfp:
json.dump(config, wfp, indent=2)
tokenizer_config = {
"auto_map": {
"AutoTokenizer": [
f"{fake_model_id}--tokenizer.NopTokenizer",
None,
]
}
}
tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json")
with open(tokenizer_config_file, "w") as wfp:
json.dump(tokenizer_config, wfp, indent=2)
prev_dir = os.getcwd()
try:
# it looks like subdir= is broken in the from_pretrained also, so this is necessary
os.chdir(tmp_dir)
# this should work because we trust the code
_ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True)
try:
# this should fail because we don't trust and we're not at a terminal for interactive response
_ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=False)
self.fail("AutoTokenizer.from_pretrained with trust_remote_code=False should raise ValueException")
except ValueError:
pass
finally:
os.chdir(prev_dir)
def test_tokenization_class_priority(self):
from transformers import AutoProcessor
tok = AutoTokenizer.from_pretrained("mlx-community/MiniMax-M2.1-4bit")
self.assertTrue(tok.__class__ == TokenizersBackend)
tok = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
self.assertTrue(tok.__class__ == HerbertTokenizer)
with tempfile.TemporaryDirectory() as tmp_dir:
tok.save_pretrained(tmp_dir)
tok2 = AutoTokenizer.from_pretrained(tmp_dir)
self.assertTrue(tok2.__class__ == HerbertTokenizer)
tok = AutoProcessor.from_pretrained("mistralai/Ministral-3-8B-Instruct-2512-BF16").tokenizer
self.assertTrue(tok.__class__ == TokenizersBackend)
def test_custom_tokenizer_with_mismatched_tokenizer_class(self):
nop_tokenizer_code = """
import transformers
class NopTokenizer(transformers.PreTrainedTokenizer):
special_attribute_present = True
def get_vocab(self):
return {}
"""
nop_config_code = """
from transformers import PreTrainedConfig
class NopConfig(PreTrainedConfig):
model_type = "test_unregistered_dynamic"
def __init__(self, **kwargs):
super().__init__(**kwargs)
"""
with tempfile.TemporaryDirectory() as tmp_dir:
fake_model_id = "hf-internal-testing/test_unregistered_dynamic"
fake_repo = os.path.join(tmp_dir, fake_model_id)
os.makedirs(fake_repo)
tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py")
with open(tokenizer_src_file, "w") as wfp:
wfp.write(nop_tokenizer_code)
model_config_src_file = os.path.join(fake_repo, "config.py")
with open(model_config_src_file, "w") as wfp:
wfp.write(nop_config_code)
config = {
"model_type": "test_unregistered_dynamic",
"auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"},
}
config_file = os.path.join(fake_repo, "config.json")
with open(config_file, "w") as wfp:
json.dump(config, wfp, indent=2)
tokenizer_config = {
"tokenizer_class": "NopTokenizer",
"auto_map": {
"AutoTokenizer": [
f"{fake_model_id}--tokenizer.NopTokenizer",
None,
]
},
}
tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json")
with open(tokenizer_config_file, "w") as wfp:
json.dump(tokenizer_config, wfp, indent=2)
prev_dir = os.getcwd()
try:
os.chdir(tmp_dir)
tokenizer = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True)
self.assertEqual(tokenizer.__class__.__name__, "NopTokenizer")
self.assertTrue(tokenizer.special_attribute_present)
finally:
os.chdir(prev_dir)
@require_tokenizers
@require_sentencepiece
def test_mismatched_model_type_uses_config_tokenizer_class_with_sentencepiece(self):
tokenizer = AutoTokenizer.from_pretrained(
"facebook/nllb-200-distilled-600M",
revision="f8d333a098d19b4fd9a8b18f94170487ad3f821d",
)
self.assertEqual(tokenizer.__class__.__name__, "NllbTokenizer")
@require_tokenizers
def test_mismatched_model_type_uses_config_tokenizer_class_without_sentencepiece(self):
with mock.patch("transformers.models.auto.tokenization_auto.is_sentencepiece_available", return_value=False):
tokenizer = AutoTokenizer.from_pretrained(
"facebook/nllb-200-distilled-600M",
revision="f8d333a098d19b4fd9a8b18f94170487ad3f821d",
)
self.assertEqual(tokenizer.__class__.__name__, "NllbTokenizer")
@slow
@require_tokenizers
def test_deepseek_r1_tokenizer_preserves_spaces(self):
"""Regression: deepseek_v3 Hub config has wrong tokenizer_class='LlamaTokenizerFast'; must use TokenizersBackend."""
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
self.assertIsInstance(tokenizer, TokenizersBackend)
text = "hello world"
self.assertEqual(tokenizer.decode(tokenizer.encode(text)), text)
@slow
@require_tokenizers
def test_deepseek_r1_distill_qwen_uses_qwen2_tokenizer(self):
"""Regression: qwen2 model with wrong Hub tokenizer_class='LlamaTokenizerFast' must use Qwen2Tokenizer."""
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
self.assertIsInstance(tokenizer, Qwen2Tokenizer)
@require_tokenizers
@require_sentencepiece
def test_specialized_hub_tokenizer_class_overrides_mismatched_auto_mapping(self):
"""Hub's tokenizer_class wins when the auto-mapping has a different real class (e.g. m2m_100 → NllbTokenizer)."""
from transformers import NllbTokenizer
fake_config = mock.MagicMock()
fake_config.model_type = "m2m_100"
mock_tokenizer = mock.MagicMock(spec=NllbTokenizer)
with (
mock.patch(
"transformers.models.auto.tokenization_auto.AutoConfig.from_pretrained",
return_value=fake_config,
),
mock.patch(
"transformers.models.auto.tokenization_auto.get_tokenizer_config",
return_value={"tokenizer_class": "NllbTokenizer"},
),
mock.patch.object(NllbTokenizer, "from_pretrained", return_value=mock_tokenizer) as mock_nllb,
mock.patch.object(TokenizersBackend, "from_pretrained") as mock_tb,
):
result = AutoTokenizer.from_pretrained("fake/nllb-model")
mock_nllb.assert_called_once()
mock_tb.assert_not_called()
self.assertIs(result, mock_tokenizer)

View File

@@ -0,0 +1,248 @@
# Copyright 2025 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
VIDEO_PROCESSOR_MAPPING,
AutoConfig,
AutoVideoProcessor,
LlavaOnevisionConfig,
LlavaOnevisionVideoProcessor,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torch
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_video_processing import CustomVideoProcessor # noqa E402
@require_torch
class AutoVideoProcessorTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
def test_video_processor_from_model_shortcut(self):
config = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_directory_from_key(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{
"video_processor_type": "LlavaOnevisionVideoProcessor",
"processor_class": "LlavaOnevisionProcessor",
},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
config = AutoVideoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_directory_from_preprocessor_key(self):
# Ensure we can load the image processor from the feature extractor config
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{
"video_processor_type": "LlavaOnevisionVideoProcessor",
"processor_class": "LlavaOnevisionProcessor",
},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
config = AutoVideoProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_directory_from_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model_config = LlavaOnevisionConfig()
# Create a dummy config file with image_processor_type
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{
"video_processor_type": "LlavaOnevisionVideoProcessor",
"processor_class": "LlavaOnevisionProcessor",
},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
# remove video_processor_type to make sure config.json alone is enough to load image processor locally
config_dict = AutoVideoProcessor.from_pretrained(tmpdirname).to_dict()
config_dict.pop("video_processor_type")
config = LlavaOnevisionVideoProcessor(**config_dict)
# save in new folder
model_config.save_pretrained(tmpdirname)
config.save_pretrained(tmpdirname)
config = AutoVideoProcessor.from_pretrained(tmpdirname)
# make sure private variable is not incorrectly saved
dict_as_saved = json.loads(config.to_json_string())
self.assertTrue("_processor_class" not in dict_as_saved)
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_file(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
json.dump(
{
"video_processor_type": "LlavaOnevisionVideoProcessor",
"processor_class": "LlavaOnevisionProcessor",
},
open(processor_tmpfile, "w"),
)
config = AutoVideoProcessor.from_pretrained(processor_tmpfile)
self.assertIsInstance(config, LlavaOnevisionVideoProcessor)
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError,
"llava-hf/llava-doesnt-exist is not a local folder and is not a valid model identifier",
):
_ = AutoVideoProcessor.from_pretrained("llava-hf/llava-doesnt-exist")
def test_revision_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
):
_ = AutoVideoProcessor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_video_processor_not_found(self):
with self.assertRaisesRegex(
EnvironmentError,
"Can't load video processor for 'hf-internal-testing/config-no-model'.",
):
_ = AutoVideoProcessor.from_pretrained("hf-internal-testing/config-no-model")
def test_from_pretrained_dynamic_video_processor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
video_processor = AutoVideoProcessor.from_pretrained("hf-internal-testing/test_dynamic_video_processor")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
video_processor = AutoVideoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=False
)
video_processor = AutoVideoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
)
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
# Test the dynamic module is loaded only once.
reloaded_video_processor = AutoVideoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
)
self.assertIs(video_processor.__class__, reloaded_video_processor.__class__)
# Test image processor can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
video_processor.save_pretrained(tmp_dir)
reloaded_video_processor = AutoVideoProcessor.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertEqual(reloaded_video_processor.__class__.__name__, "NewVideoProcessor")
def test_new_video_processor_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
AutoVideoProcessor.register(CustomConfig, CustomVideoProcessor)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoVideoProcessor.register(LlavaOnevisionConfig, LlavaOnevisionVideoProcessor)
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{
"video_processor_type": "LlavaOnevisionVideoProcessor",
"processor_class": "LlavaOnevisionProcessor",
},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "llava_onevision"}, open(config_tmpfile, "w"))
video_processor = CustomVideoProcessor.from_pretrained(tmpdirname)
# Now that the config is registered, it can be used as any other config with the auto-API
with tempfile.TemporaryDirectory() as tmp_dir:
video_processor.save_pretrained(tmp_dir)
new_video_processor = AutoVideoProcessor.from_pretrained(tmp_dir)
self.assertIsInstance(new_video_processor, CustomVideoProcessor)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_video_processor_conflict(self):
class NewVideoProcessor(LlavaOnevisionVideoProcessor):
is_local = True
try:
AutoConfig.register("custom", CustomConfig)
AutoVideoProcessor.register(CustomConfig, NewVideoProcessor)
# If remote code is not set, the default is to use local
video_processor = AutoVideoProcessor.from_pretrained("hf-internal-testing/test_dynamic_video_processor")
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
self.assertTrue(video_processor.is_local)
# If remote code is disabled, we load the local one.
video_processor = AutoVideoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=False
)
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
self.assertTrue(video_processor.is_local)
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
video_processor = AutoVideoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
)
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
self.assertTrue(video_processor.is_local)
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewVideoProcessor.__module__ = "transformers.models.custom.configuration_custom"
video_processor = AutoVideoProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_video_processor", trust_remote_code=True
)
self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")
self.assertTrue(not hasattr(video_processor, "is_local"))
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]

View File

View File

@@ -0,0 +1,477 @@
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Autoformer model."""
import inspect
import tempfile
import unittest
import pytest
from huggingface_hub import hf_hub_download
from transformers import is_torch_available
from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
from transformers.utils import check_torch_load_is_safe
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
TOLERANCE = 1e-4
if is_torch_available():
import torch
from transformers import AutoformerConfig, AutoformerForPrediction, AutoformerModel
from transformers.models.autoformer.modeling_autoformer import AutoformerDecoder, AutoformerEncoder
@require_torch
class AutoformerModelTester:
def __init__(
self,
parent,
d_model=16,
batch_size=13,
prediction_length=7,
context_length=14,
label_length=10,
cardinality=19,
embedding_dimension=5,
num_time_features=4,
is_training=True,
hidden_size=16,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=4,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
lags_sequence=[1, 2, 3, 4, 5],
moving_average=25,
autocorrelation_factor=5,
):
self.d_model = d_model
self.parent = parent
self.batch_size = batch_size
self.prediction_length = prediction_length
self.context_length = context_length
self.cardinality = cardinality
self.num_time_features = num_time_features
self.lags_sequence = lags_sequence
self.embedding_dimension = embedding_dimension
self.is_training = is_training
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.encoder_seq_length = context_length
self.decoder_seq_length = prediction_length + label_length
self.label_length = label_length
self.moving_average = moving_average
self.autocorrelation_factor = autocorrelation_factor
def get_config(self):
return AutoformerConfig(
d_model=self.d_model,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_attention_heads=self.num_attention_heads,
decoder_attention_heads=self.num_attention_heads,
encoder_ffn_dim=self.intermediate_size,
decoder_ffn_dim=self.intermediate_size,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
prediction_length=self.prediction_length,
context_length=self.context_length,
label_length=self.label_length,
lags_sequence=self.lags_sequence,
num_time_features=self.num_time_features,
num_static_categorical_features=1,
cardinality=[self.cardinality],
embedding_dimension=[self.embedding_dimension],
moving_average=self.moving_average,
scaling="std", # we need std to get non-zero `loc`
)
def prepare_autoformer_inputs_dict(self, config):
_past_length = config.context_length + max(config.lags_sequence)
static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0])
past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features])
past_values = floats_tensor([self.batch_size, _past_length])
past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5
# decoder inputs
future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features])
future_values = floats_tensor([self.batch_size, config.prediction_length])
inputs_dict = {
"past_values": past_values,
"static_categorical_features": static_categorical_features,
"past_time_features": past_time_features,
"past_observed_mask": past_observed_mask,
"future_time_features": future_time_features,
"future_values": future_values,
}
return inputs_dict
def prepare_config_and_inputs(self):
config = self.get_config()
inputs_dict = self.prepare_autoformer_inputs_dict(config)
return config, inputs_dict
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
return config, inputs_dict
def check_encoder_decoder_model_standalone(self, config, inputs_dict):
model = AutoformerModel(config=config).to(torch_device).eval()
outputs = model(**inputs_dict)
encoder_last_hidden_state = outputs.encoder_last_hidden_state
last_hidden_state = outputs.last_hidden_state
with tempfile.TemporaryDirectory() as tmpdirname:
encoder = model.get_encoder()
encoder.save_pretrained(tmpdirname)
encoder = AutoformerEncoder.from_pretrained(tmpdirname).to(torch_device)
transformer_inputs, feature, _, _, _ = model.create_network_inputs(**inputs_dict)
seasonal_input, trend_input = model.decomposition_layer(transformer_inputs[:, : config.context_length, ...])
enc_input = torch.cat(
(transformer_inputs[:, : config.context_length, ...], feature[:, : config.context_length, ...]),
dim=-1,
)
encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0]
self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
mean = (
torch.mean(transformer_inputs[:, : config.context_length, ...], dim=1)
.unsqueeze(1)
.repeat(1, config.prediction_length, 1)
)
zeros = torch.zeros(
[transformer_inputs.shape[0], config.prediction_length, transformer_inputs.shape[2]],
device=enc_input.device,
)
dec_input = torch.cat(
(
torch.cat((seasonal_input[:, -config.label_length :, ...], zeros), dim=1),
feature[:, config.context_length - config.label_length :, ...],
),
dim=-1,
)
trend_init = torch.cat(
(
torch.cat((trend_input[:, -config.label_length :, ...], mean), dim=1),
feature[:, config.context_length - config.label_length :, ...],
),
dim=-1,
)
with tempfile.TemporaryDirectory() as tmpdirname:
decoder = model.get_decoder()
decoder.save_pretrained(tmpdirname)
decoder = AutoformerDecoder.from_pretrained(tmpdirname).to(torch_device)
last_hidden_state_2 = decoder(
trend=trend_init,
inputs_embeds=dec_input,
encoder_hidden_states=encoder_last_hidden_state,
)[0]
self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
@require_torch
class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (AutoformerModel, AutoformerForPrediction) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": AutoformerModel} if is_torch_available() else {}
test_missing_keys = False
test_inputs_embeds = False
test_torch_exportable = False # not worth to fix
def setUp(self):
self.model_tester = AutoformerModelTester(self)
self.config_tester = ConfigTester(self, config_class=AutoformerConfig, has_text_modality=False)
# TODO: (ydshieh) Fix the wrong logic for `tmp_delay` is possible
@unittest.skip(
reason="The computation of `tmp_delay` in `AutoformerAttention.forward` seems wrong, see PR #12345. Also `topk` is used to compute indices which is not stable."
)
def test_batching_equivalence(self):
super().test_batching_equivalence()
def test_config(self):
self.config_tester.run_common_tests()
def test_save_load_strict(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
self.assertEqual(info["missing_keys"], set())
def test_encoder_decoder_model_standalone(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
@unittest.skip(reason="Model has no tokens embeddings")
def test_resize_tokens_embeddings(self):
pass
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing(self):
super().test_training_gradient_checkpointing()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_false(self):
super().test_training_gradient_checkpointing_use_reentrant_false()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_true(self):
super().test_training_gradient_checkpointing_use_reentrant_true()
# # Input is 'static_categorical_features' not 'input_ids'
def test_model_main_input_name(self):
model_signature = inspect.signature(getattr(AutoformerModel, "forward"))
# The main input is the name of the argument after `self`
observed_main_input_name = list(model_signature.parameters.keys())[1]
self.assertEqual(AutoformerModel.main_input_name, observed_main_input_name)
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = [
"past_values",
"past_time_features",
"past_observed_mask",
"static_categorical_features",
"static_real_features",
"future_values",
"future_time_features",
]
if model.__class__.__name__ == "AutoformerForPrediction":
expected_arg_names.extend(["future_observed_mask"])
expected_arg_names.extend(
[
"decoder_attention_mask",
"encoder_outputs",
"past_key_values",
"use_cache",
]
)
if model.__class__.__name__ == "AutoformerModel":
expected_arg_names.extend(["kwargs"])
elif model.__class__.__name__ == "AutoformerForPrediction":
expected_arg_names.extend(["kwargs"])
self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
seq_len = getattr(self.model_tester, "seq_length", None)
decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
d_model = getattr(self.model_tester, "d_model", None)
num_attention_heads = getattr(self.model_tester, "num_attention_heads", None)
dim = d_model // num_attention_heads
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class._from_config(config, attn_implementation="eager")
config = model.config
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.encoder_attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, encoder_seq_length, dim],
)
out_len = len(outputs)
correct_outlen = 7
if "last_hidden_state" in outputs:
correct_outlen += 1
if "trend" in outputs:
correct_outlen += 1
if "past_key_values" in outputs:
correct_outlen += 1 # past_key_values have been returned
if "loss" in outputs:
correct_outlen += 1
if "params" in outputs:
correct_outlen += 1
self.assertEqual(out_len, correct_outlen)
# decoder attentions
decoder_attentions = outputs.decoder_attentions
self.assertIsInstance(decoder_attentions, (list, tuple))
self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(decoder_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, decoder_seq_length, dim],
)
# cross attentions
cross_attentions = outputs.cross_attentions
self.assertIsInstance(cross_attentions, (list, tuple))
self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(cross_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, decoder_seq_length, dim],
)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(out_len + 2, len(outputs))
self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, encoder_seq_length, dim],
)
@is_flaky()
def test_retain_grad_hidden_states_attentions(self):
super().test_retain_grad_hidden_states_attentions()
@unittest.skip(reason="Model does not have input embeddings")
def test_model_get_set_embeddings(self):
pass
def prepare_batch(filename="train-batch.pt"):
file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset")
check_torch_load_is_safe()
batch = torch.load(file, map_location=torch_device, weights_only=True)
return batch
@require_torch
@slow
class AutoformerModelIntegrationTests(unittest.TestCase):
def test_inference_no_head(self):
model = AutoformerModel.from_pretrained("huggingface/autoformer-tourism-monthly").to(torch_device)
batch = prepare_batch()
with torch.no_grad():
output = model(
past_values=batch["past_values"],
past_time_features=batch["past_time_features"],
past_observed_mask=batch["past_observed_mask"],
static_categorical_features=batch["static_categorical_features"],
future_values=batch["future_values"],
future_time_features=batch["future_time_features"],
)[0]
expected_shape = torch.Size(
(64, model.config.prediction_length + model.config.label_length, model.config.feature_size)
)
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[0.3593, -1.3398, 0.6330], [0.2279, 1.5396, -0.1792], [0.0450, 1.3225, -0.2335]], device=torch_device
)
torch.testing.assert_close(output[0, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE)
def test_inference_head(self):
model = AutoformerForPrediction.from_pretrained("huggingface/autoformer-tourism-monthly").to(torch_device)
batch = prepare_batch("val-batch.pt")
with torch.no_grad():
output = model(
past_values=batch["past_values"],
past_time_features=batch["past_time_features"],
past_observed_mask=batch["past_observed_mask"],
static_categorical_features=batch["static_categorical_features"],
).encoder_last_hidden_state
expected_shape = torch.Size((64, model.config.context_length, model.config.d_model))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[-0.0734, -0.9036, 0.8358], [4.7186, 2.4113, 1.9581], [1.7953, 2.3558, 1.2970]], device=torch_device
)
torch.testing.assert_close(output[0, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE)
def test_seq_to_seq_generation(self):
model = AutoformerForPrediction.from_pretrained("huggingface/autoformer-tourism-monthly").to(torch_device)
batch = prepare_batch("val-batch.pt")
with torch.no_grad():
outputs = model.generate(
static_categorical_features=batch["static_categorical_features"],
past_time_features=batch["past_time_features"],
past_values=batch["past_values"],
future_time_features=batch["future_time_features"],
past_observed_mask=batch["past_observed_mask"],
)
expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length))
self.assertEqual(outputs.sequences.shape, expected_shape)
expected_slice = torch.tensor([3130.6763, 4056.5293, 7053.0786], device=torch_device)
mean_prediction = outputs.sequences.mean(dim=1)
torch.testing.assert_close(mean_prediction[0, -3:], expected_slice, rtol=1e-1, atol=1e-1)

View File

View File

@@ -0,0 +1,501 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch GotOcr2 model."""
import unittest
import pytest
from transformers import (
AutoProcessor,
AyaVisionConfig,
BitsAndBytesConfig,
is_torch_available,
)
from transformers.testing_utils import (
Expectations,
cleanup,
get_device_properties,
require_deterministic_for_xpu,
require_torch,
require_torch_accelerator,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
AyaVisionForConditionalGeneration,
AyaVisionModel,
)
class AyaVisionVisionText2TextModelTester:
def __init__(
self,
parent,
batch_size=3,
seq_length=7,
vision_feature_layer=-1,
downsample_factor=2,
ignore_index=-100,
bos_token_id=0,
eos_token_id=0,
pad_token_id=0,
image_token_index=2,
num_channels=3,
image_size=64,
model_type="aya_vision",
is_training=True,
text_config={
"model_type": "cohere2",
"vocab_size": 99,
"hidden_size": 128,
"intermediate_size": 37,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"output_channels": 64,
"hidden_act": "silu",
"max_position_embeddings": 512,
"tie_word_embeddings": True,
"bos_token_id": 0,
"eos_token_id": 0,
"pad_token_id": 0,
},
vision_config={
"model_type": "siglip_vision_model",
"hidden_size": 32,
"num_hidden_layers": 2,
"num_attention_heads": 4,
"intermediate_size": 128,
"image_size": 64,
"patch_size": 8,
"vision_use_head": False,
},
):
self.parent = parent
self.ignore_index = ignore_index
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.image_token_index = image_token_index
self.model_type = model_type
self.text_config = text_config
self.vision_config = vision_config
self.batch_size = batch_size
self.vision_feature_layer = vision_feature_layer
self.downsample_factor = downsample_factor
self.is_training = is_training
self.num_channels = num_channels
self.image_size = image_size
self.image_seq_length = (image_size // (vision_config["patch_size"] * downsample_factor)) ** 2
self.seq_length = seq_length + self.image_seq_length
self.num_hidden_layers = text_config["num_hidden_layers"]
self.vocab_size = text_config["vocab_size"]
self.hidden_size = text_config["hidden_size"]
self.num_attention_heads = text_config["num_attention_heads"]
def get_config(self):
return AyaVisionConfig(
text_config=self.text_config,
vision_config=self.vision_config,
model_type=self.model_type,
bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
image_token_index=self.image_token_index,
vision_feature_layer=self.vision_feature_layer,
downsample_factor=self.downsample_factor,
)
def prepare_config_and_inputs(self):
config = self.get_config()
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
# input_ids[:, -1] = self.pad_token_id
input_ids[input_ids == self.image_token_index] = self.pad_token_id
input_ids[:, : self.image_seq_length] = self.image_token_index
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
AyaVisionModel,
AyaVisionForConditionalGeneration,
)
if is_torch_available()
else ()
)
all_generative_model_classes = (AyaVisionForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
"image-text-to-text": AyaVisionForConditionalGeneration,
"any-to-any": AyaVisionForConditionalGeneration,
}
if is_torch_available()
else {}
)
_is_composite = True
def setUp(self):
self.model_tester = AyaVisionVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=AyaVisionConfig, has_text_modality=False)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="SiglipVisionModel does not support standalone training")
def test_training(self):
pass
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing(self):
super().test_training_gradient_checkpointing()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_false(self):
super().test_training_gradient_checkpointing_use_reentrant_false()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_true(self):
super().test_training_gradient_checkpointing_use_reentrant_true()
@unittest.skip(reason="Compile not yet supported because in LLava models")
@pytest.mark.torch_compile_test
def test_sdpa_can_compile_dynamic(self):
pass
# todo: yoni - fix or improve the test
@unittest.skip("Difference is slightly higher than the threshold")
def test_batching_equivalence(self):
pass
@require_torch
class AyaVisionIntegrationTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model_checkpoint = "CohereForAI/aya-vision-8b"
cls.model = None
@classmethod
def tearDownClass(cls):
del cls.model
cleanup(torch_device, gc_collect=True)
def tearDown(self):
cleanup(torch_device, gc_collect=True)
@classmethod
def get_model(cls):
# Use 4-bit on T4
device_type, major, _ = get_device_properties()
load_in_4bit = (device_type == "cuda") and (major < 8)
dtype = None if load_in_4bit else torch.float16
if load_in_4bit:
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
else:
quantization_config = None
if cls.model is None:
cls.model = AyaVisionForConditionalGeneration.from_pretrained(
cls.model_checkpoint, device_map=torch_device, dtype=dtype, quantization_config=quantization_config
)
return cls.model
@slow
@require_torch_accelerator
def test_small_model_integration_forward(self):
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
model = self.get_model()
messages = [
{
"role": "user",
"content": [
{"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
{"type": "text", "text": "Please describe the image explicitly."},
],
}
]
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(torch_device, dtype=torch.float16)
# Forward
with torch.inference_mode():
output = model(**inputs)
actual_logits = output.logits[0, -1, :5].cpu()
EXPECTED_LOGITS = Expectations(
{
("xpu", 3): [1.6699, 0.6260, 3.2266, 8.5547, 2.209],
# 4-bit
("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488],
("cuda", 8): [1.6396, 0.6094, 3.1992, 8.5234, 2.1875],
}
) # fmt: skip
expected_logits = torch.tensor(EXPECTED_LOGITS.get_expectation(), dtype=torch.float16)
self.assertTrue(
torch.allclose(actual_logits, expected_logits, atol=0.1),
f"Actual logits: {actual_logits}"
f"\nExpected logits: {expected_logits}"
f"\nDifference: {torch.abs(actual_logits - expected_logits)}",
)
@slow
@require_torch_accelerator
@require_deterministic_for_xpu
def test_small_model_integration_generate_text_only(self):
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
model = self.get_model()
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Write a haiku"},
],
}
]
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=25, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_outputs = Expectations(
{
("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.",
# 4-bit
("cuda", 7): "Sure, here's a haiku for you:\n\nMorning dew sparkles,\nPetals unfold in sunlight,\n",
("cuda", 8): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(decoded_output, expected_output)
@slow
@require_torch_accelerator
@require_deterministic_for_xpu
def test_small_model_integration_generate_chat_template(self):
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
model = self.get_model()
messages = [
{
"role": "user",
"content": [
{"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
{"type": "text", "text": "Please describe the image explicitly."},
],
}
]
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_outputs = Expectations(
{
("xpu", 3): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,',
# 4-bit
("cuda", 7): 'The image depicts two cats comfortably resting on a pink blanket spread across a sofa. The cats,',
("cuda", 8): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(decoded_output, expected_output)
@slow
@require_torch_accelerator
def test_small_model_integration_batched_generate(self):
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
model = self.get_model()
# Prepare inputs
messages = [
[
{
"role": "user",
"content": [
{"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
{"type": "text", "text": "Write a haiku for this image"},
],
},
],
[
{
"role": "user",
"content": [
{
"type": "image",
"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
},
{"type": "text", "text": "Describe this image"},
],
},
],
]
inputs = processor.apply_chat_template(
messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.float16)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
# Check first output
decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
expected_outputs = Expectations(
{
("xpu", 3): "Wooden bridge stretches\nInto still waters, mountains gleam\nPeaceful forest scene",
# 4-bit
("cuda", 8): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
)
# Check second output
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
expected_outputs = Expectations(
{
("xpu", 3): 'This vibrant image captures a bustling street scene in a Chinese-influenced neighborhood. The focal point is a striking red stop sign',
# 4-bit
("cuda", 7): 'This vibrant image captures a bustling street scene in a multicultural urban area, featuring a traditional Chinese gate adorned with intricate red and',
("cuda", 8): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
)
@slow
@require_torch_accelerator
@require_deterministic_for_xpu
def test_small_model_integration_batched_generate_multi_image(self):
processor = AutoProcessor.from_pretrained(self.model_checkpoint)
model = self.get_model()
# Prepare inputs
messages = [
[
{
"role": "user",
"content": [
{"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
{"type": "text", "text": "Write a haiku for this image"},
],
},
],
[
{
"role": "user",
"content": [
{
"type": "image",
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
},
{
"type": "image",
"url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
},
{
"type": "text",
"text": "These images depict two different landmarks. Can you identify them?",
},
],
},
],
]
inputs = processor.apply_chat_template(
messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.float16)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
# Check first output
decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
expected_outputs = Expectations(
{
("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
("cuda", 8): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
)
# Check second output
decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
expected_outputs = Expectations(
{
("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
("cuda", 8): 'The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
)

View File

@@ -0,0 +1,168 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import AyaVisionProcessor
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_torch_available():
import torch
@require_vision
class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = AyaVisionProcessor
model_id = "hf-internal-testing/namespace-CohereForAI-repo_name_aya-vision-8b"
@classmethod
def _setup_test_attributes(cls, processor):
cls.image_token = processor.image_token
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
return tokenizer_class.from_pretrained(cls.model_id, padding_side="left")
@classmethod
def _setup_image_processor(cls):
image_processor_class = cls._get_component_class_from_processor("image_processor")
return image_processor_class(
do_resize=True,
size={"height": 20, "width": 20},
max_patches=2,
do_rescale=True,
rescale_factor=1 / 255,
do_normalize=True,
image_mean=[0.485, 0.456, 0.406],
image_std=[0.229, 0.224, 0.225],
do_convert_rgb=True,
)
@staticmethod
def prepare_processor_dict():
return {"patch_size": 10, "img_size": 20}
@unittest.skip(reason="Text needs image tokens, tested in other tests")
def test_processor_with_multiple_inputs(self):
pass
def test_get_num_vision_tokens(self):
"Tests general functionality of the helper used internally in vLLM"
processor = self.get_processor()
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
self.assertTrue("num_image_tokens" in output)
self.assertEqual(len(output["num_image_tokens"]), 3)
self.assertTrue("num_image_patches" in output)
self.assertEqual(len(output["num_image_patches"]), 3)
@require_torch
def test_process_interleaved_images_videos(self):
processor = self.get_processor()
messages = [
[
{
"role": "user",
"content": [
{
"type": "image",
"url": url_to_local_path(
"https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
),
},
{
"type": "image",
"url": url_to_local_path(
"https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
),
},
{"type": "text", "text": "What are the differences between these two images?"},
],
},
],
[
{
"role": "user",
"content": [
{
"type": "image",
"url": url_to_local_path("https://llava-vl.github.io/static/images/view.jpg"),
},
{"type": "text", "text": "Write a haiku for this image"},
],
}
],
]
inputs_batched = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
padding=True,
)
# Process non batched inputs to check if the pixel_values and input_ids are reconstructed in the correct order when batched together
images_patches_index = 0
for i, message in enumerate(messages):
inputs = processor.apply_chat_template(
message,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
padding=True,
)
# We slice with [-inputs["input_ids"].shape[1] :] as the input_ids are left padded
torch.testing.assert_close(
inputs["input_ids"][0], inputs_batched["input_ids"][i][-inputs["input_ids"].shape[1] :]
)
torch.testing.assert_close(
inputs["pixel_values"],
inputs_batched["pixel_values"][
images_patches_index : images_patches_index + inputs["pixel_values"].shape[0]
],
)
images_patches_index += inputs["pixel_values"].shape[0]
def test_image_processor_defaults(self):
# AyaVisionProcessor has a default value `crop_to_patches=True` but the image processor's
# default is different. Override and pass the arg explicitly
image_processor = self.get_component("image_processor")
# Get all required components for processor
components = {}
for attribute in self.processor_class.get_attributes():
components[attribute] = self.get_component(attribute)
processor = self.processor_class(**components)
image_input = self.prepare_image_inputs()
input_image_proc = image_processor(image_input, crop_to_patches=False, return_tensors="pt")
input_processor = processor(images=image_input, crop_to_patches=False, return_tensors="pt")
# Verify outputs match
for key in input_image_proc:
if key in processor.model_input_names:
torch.testing.assert_close(input_image_proc[key], input_processor[key])

View File

View File

@@ -0,0 +1,613 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Bamba model."""
import inspect
import tempfile
import unittest
import pytest
from pytest import mark
from transformers import (
AutoTokenizer,
BambaConfig,
DataCollatorWithFlattening,
is_torch_available,
)
from transformers.testing_utils import (
DeviceProperties,
Expectations,
get_device_properties,
require_deterministic_for_xpu,
require_flash_attn,
require_torch,
require_torch_accelerator,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import BambaForCausalLM, BambaModel
class BambaModelTester:
config_class = BambaConfig
if is_torch_available():
model_class = BambaModel
for_causal_lm_class = BambaForCausalLM
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
num_key_value_heads=2,
intermediate_size=64,
hidden_act="silu",
attention_dropout=0.0,
attn_layer_indices=None,
attn_rotary_emb=8,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
num_labels=3,
pad_token_id=0,
mamba_n_groups=1,
mamba_n_heads=16,
mamba_d_state=16,
mamba_d_conv=4,
mamba_expand=2,
mamba_chunk_size=16,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.attention_dropout = attention_dropout
self.attn_layer_indices = attn_layer_indices
self.attn_rotary_emb = attn_rotary_emb
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.pad_token_id = pad_token_id
self.scope = scope
self.mamba_n_groups = mamba_n_groups
self.mamba_n_heads = mamba_n_heads
self.mamba_d_state = mamba_d_state
self.mamba_d_conv = mamba_d_conv
self.mamba_expand = mamba_expand
self.mamba_chunk_size = mamba_chunk_size
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device))
token_labels = None
if self.use_labels:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
self._update_layer_configs()
config = self.get_config()
return config, input_ids, input_mask, token_labels
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
input_mask,
token_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
def _update_layer_configs(self):
"""Configures hidden layers and attn layer indices if they are not set."""
# Fix for SDPA tests, force at least 4 layers
if self.num_hidden_layers < 4:
self.num_hidden_layers = 4
if self.attn_layer_indices is None:
d = [x for x in range(2, self.num_hidden_layers) if self.num_hidden_layers % x == 0]
if len(d) == 0:
raise ValueError("num_hidden_layers is prime, cannot automatically set attn_layer_indices.")
d = d[-1] # get the largest divisor
self.attn_layer_indices = [x + 1 for x in range(0, self.num_hidden_layers, d)]
def get_config(self, **kwargs):
return self.config_class(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
num_key_value_heads=self.num_key_value_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
attention_dropout=self.attention_dropout,
attn_layer_indices=self.attn_layer_indices,
attn_rotary_emb=self.attn_rotary_emb,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
pad_token_id=self.pad_token_id,
mamba_n_groups=self.mamba_n_groups,
mamba_n_heads=self.mamba_n_heads,
mamba_d_state=self.mamba_d_state,
mamba_d_conv=self.mamba_d_conv,
mamba_expand=self.mamba_expand,
mamba_chunk_size=self.mamba_chunk_size,
**kwargs,
)
def create_and_check_model(
self,
config,
input_ids,
input_mask,
token_labels,
):
model = self.model_class(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_causal_lm(
self,
config,
input_ids,
input_mask,
token_labels,
):
model = self.for_causal_lm_class(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids, labels=token_labels)
result = model(input_ids)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_decoder_model_past_large_inputs(
self,
config,
input_ids,
input_mask,
token_labels,
):
# config.is_decoder = True
# config.add_cross_attention = True
model = self.for_causal_lm_class(config=config)
model.to(torch_device)
model.eval()
# first forward pass
outputs = model(
input_ids,
attention_mask=input_mask,
use_cache=True,
)
past_key_values = outputs.past_key_values
# create hypothetical multiple next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
output_from_no_past = model(
next_input_ids,
attention_mask=next_attention_mask,
output_hidden_states=True,
)["hidden_states"][0]
output_from_past = model(
next_tokens,
attention_mask=next_attention_mask,
past_key_values=past_key_values,
output_hidden_states=True,
)["hidden_states"][0]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
@require_torch
class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
model_tester_class = BambaModelTester
all_model_classes = (BambaModel, BambaForCausalLM) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BambaModel,
"text-generation": BambaForCausalLM,
}
if is_torch_available()
else {}
)
# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
model_split_percents = [0.5, 0.7, 0.8]
def _get_conv_state_shape(self, batch_size: int, config):
conv_shape = (
batch_size,
config.mamba_expand * config.hidden_size + 2 * config.mamba_n_groups * config.mamba_d_state,
config.mamba_d_conv,
)
return conv_shape
def _get_recurrent_state_shape(self, batch_size: int, config):
return (batch_size, config.mamba_n_heads, config.mamba_d_head, config.mamba_d_state)
def setUp(self):
self.model_tester = self.model_tester_class(self)
self.config_tester = ConfigTester(self, config_class=self.model_tester.config_class, hidden_size=64)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_causal_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
def test_decoder_model_past_with_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
def test_attention_outputs(self):
r"""
Overriding the test_attention_outputs test as the Bamba model outputs attention only for its attention layers
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
seq_len = getattr(self.model_tester, "seq_length", None)
encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
expected_num_attentions = self.model_tester.num_hidden_layers - len(self.model_tester.attn_layer_indices)
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class._from_config(config, attn_implementation="eager")
config = model.config
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), expected_num_attentions)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), expected_num_attentions)
self.assertListEqual(
list(attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
)
out_len = len(outputs)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
added_hidden_states = 1
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs.attentions
self.assertEqual(len(self_attentions), expected_num_attentions)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
)
def test_batching_equivalence(self):
# need to disable the tril input mask
orig = self.model_tester.use_input_mask
self.model_tester.use_input_mask = False
super().test_batching_equivalence()
self.model_tester.use_input_mask = orig
@pytest.mark.generate
def test_left_padding_compatibility(self):
# TODO: document why a random attention mask causes this test to fail, but a full mask doesn't
unpadded_custom_inputs = {"attention_mask": None}
super().test_left_padding_compatibility(unpadded_custom_inputs=unpadded_custom_inputs)
@unittest.skip(
"Bamba requires additionally specifying position_ids, seq_idx, and FlashAttentionKwargs for padding-free training."
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@unittest.skip(
"Bamba requires additionally specifying position_ids, seq_idx, and FlashAttentionKwargs for padding-free training."
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self):
pass
@require_flash_attn
@require_torch_accelerator
@mark.flash_attn_test
@slow
@unittest.skip(
"NotImplementedError: seq_idx support requires fast path support. Please install mamba_ssm and causal_conv1d"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids_seq_idx_and_fa_kwargs(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
if not model_class._supports_flash_attn:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
if 0 not in inputs_dict.get("attention_mask", []) or "attention_mask" not in inputs_dict:
self.skipTest("Model dummy inputs should contain padding in their attention mask")
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
if "position_ids" not in inspect.signature(model.forward).parameters:
self.skipTest("Model does not support position_ids")
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
# ensure left padding, to adapt for some models
if 0 in inputs_dict["attention_mask"][:, -1]:
inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1)
dummy_attention_mask = inputs_dict["attention_mask"]
inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.get_text_config().pad_token_id
# Ensure inputs_dict also has labels in it, as their presence/absence can induce
# dtype conversions. This also lets us compare losses.
labels = inputs_dict["input_ids"].clone()
# Mask padding tokens
labels[~dummy_attention_mask.bool()] = -100
# Also need to mask the first non-trivial token to match the padding-free batch.
first_nonneg_idx = (labels >= 0).int().argmax(dim=1)
labels[torch.arange(labels.size(0), device=labels.device), first_nonneg_idx] = -100
inputs_dict["labels"] = labels
model = (
model_class.from_pretrained(
tmpdirname,
dtype=torch.float16,
attn_implementation="flash_attention_2",
)
.to(torch_device)
.eval()
)
# flatten
features = [
{"input_ids": i[a.bool()].tolist()}
for i, a in zip(inputs_dict["input_ids"], inputs_dict["attention_mask"])
]
# add position_ids + fa_kwargs + seq_idx
data_collator = DataCollatorWithFlattening(
return_tensors="pt", return_seq_idx=True, return_flash_attn_kwargs=True
)
batch = data_collator(features)
batch_accelerator = {k: t.to(torch_device) if torch.is_tensor(t) else t for k, t in batch.items()}
res_padded = model(**inputs_dict)
res_padfree = model(**batch_accelerator)
logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
logits_padfree = res_padfree.logits[0]
torch.testing.assert_close(logits_padded.argmax(-1), logits_padfree.argmax(-1), rtol=0, atol=0)
# acceptable numerical instability
tol = torch.finfo(torch.float16).eps
torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
loss_padded = res_padded.loss
loss_padfree = res_padfree.loss
torch.testing.assert_close(loss_padded, loss_padfree)
@slow
@require_torch
@require_torch_accelerator
class BambaModelIntegrationTest(unittest.TestCase):
model = None
tokenizer = None
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
# Depending on the hardware we get different logits / generations
device_properties: DeviceProperties = (None, None, None)
@classmethod
def setUpClass(cls):
model_id = "ibm-fms/Bamba-9B"
cls.model = BambaForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16)
cls.tokenizer = AutoTokenizer.from_pretrained(model_id)
# feels a bit forced to have to do this for the generation test
cls.tokenizer.pad_token_id = cls.model.config.pad_token_id
cls.tokenizer.padding_side = "left"
cls.device_properties = get_device_properties()
def test_simple_generate(self):
# fmt: off
expectations = Expectations(
{
("cuda", 8): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all having a good time.",
("rocm", 9): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
("xpu", 3): "<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. I am",
}
)
# fmt: on
self.model.to(torch_device)
input_ids = self.tokenizer("Hey how are you doing on this lovely evening?", return_tensors="pt")[
"input_ids"
].to(torch_device)
out = self.model.generate(input_ids, do_sample=False, max_new_tokens=10)
output_sentence = self.tokenizer.decode(out[0, :])
expected = expectations.get_expectation()
self.assertEqual(output_sentence, expected)
# TODO: there are significant differences in the logits across major cuda versions, which shouldn't exist
if self.device_properties[0] == "cuda" and self.device_properties[1] == 8:
with torch.no_grad():
logits = self.model(input_ids=input_ids, logits_to_keep=40).logits
EXPECTED_LOGITS_NO_GRAD = torch.tensor(
[
149., 142., 146., 142., 143., 144., 142., 145.,
142., 146., 144., 146., 147., 147., 148., 145.,
147., 145., 145., 145., 145., 144., 144., 144.,
144., 145., 147., 146., 144., 144., 148., 147.,
148., 147., 147., 147., 146., 146., 148., 148.
], dtype=torch.bfloat16) # fmt: skip
torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1)
@require_deterministic_for_xpu
def test_simple_batched_generate_with_padding(self):
# Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4.
#
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
# considering differences in hardware processing and potential deviations in generated text.
# fmt: off
EXPECTED_TEXTS = Expectations(
{
("cuda", 7): [],
("cuda", 8): [
"<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
"!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
],
("rocm", 9): [
"<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are doing well. I am here",
"!!!<|begin_of_text|>I am late! I need to be at the airport in 20 minutes! I",
],
("xpu", 3): [
"<|begin_of_text|>Hey how are you doing on this lovely evening? I hope you are all doing well. I am",
"!!!<|begin_of_text|>I am late! I need to get to work! I have to get to the",
],
}
)
# fmt: on
EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
self.model.to(torch_device)
inputs = self.tokenizer(
["Hey how are you doing on this lovely evening?", "I am late! I need to"],
padding=True,
return_tensors="pt",
).to(torch_device)
out = self.model.generate(**inputs, do_sample=False, max_new_tokens=10)
output_sentences = self.tokenizer.batch_decode(out)
self.assertEqual(output_sentences[0], EXPECTED_TEXT[0])
self.assertEqual(output_sentences[1], EXPECTED_TEXT[1])
# TODO: there are significant differences in the logits across major cuda versions, which shouldn't exist
if self.device_properties[0] == "cuda" and self.device_properties[1] == 8:
with torch.no_grad():
logits = self.model(input_ids=inputs["input_ids"]).logits
EXPECTED_LOGITS_NO_GRAD_0 = torch.tensor(
[
149., 142., 146., 142., 143., 144., 142., 145.,
142., 146., 144., 146., 147., 147., 148., 145.,
147., 145., 145., 145., 145., 144., 144., 144.,
144., 145., 147., 146., 144., 144., 148., 147.,
148., 147., 147., 147., 146., 146., 148., 148.
], dtype=torch.bfloat16) # fmt: skip
EXPECTED_LOGITS_NO_GRAD_1 = torch.tensor(
[
182., 178., 177., 174., 176., 176., 178., 178.,
177., 179., 176., 183., 180., 182., 179., 174.,
178., 176., 176., 175., 175., 175., 174., 173.,
174., 182., 180., 176., 177., 177., 180., 176.,
178., 177., 177., 175., 176., 177., 175., 177.
], dtype=torch.bfloat16) # fmt: skip
torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_0, rtol=1e-3, atol=1)
torch.testing.assert_close(logits[1, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_1, rtol=1e-3, atol=1)

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,238 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import tempfile
import unittest
import numpy as np
from transformers import AutoTokenizer, BarkProcessor
from transformers.testing_utils import require_torch, slow
@require_torch
class BarkProcessorTest(unittest.TestCase):
def setUp(self):
self.checkpoint = "suno/bark-small"
self.tmpdirname = tempfile.mkdtemp()
self.voice_preset = "en_speaker_1"
self.input_string = "This is a test string"
self.speaker_embeddings_dict_path = "speaker_embeddings_path.json"
self.speaker_embeddings_directory = "speaker_embeddings"
def get_tokenizer(self, **kwargs):
return AutoTokenizer.from_pretrained(self.checkpoint, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
processor = BarkProcessor(tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)
processor = BarkProcessor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
@slow
def test_save_load_pretrained_additional_features(self):
processor = BarkProcessor.from_pretrained(
pretrained_processor_name_or_path=self.checkpoint,
speaker_embeddings_dict_path=self.speaker_embeddings_dict_path,
)
# TODO (ebezzam) not all speaker embedding are properly downloaded.
# My hypothesis: there are many files (~700 speaker embeddings) and some fail to download (not the same at different first runs)
# https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L89
# https://github.com/huggingface/transformers/blob/967045082faaaaf3d653bfe665080fd746b2bb60/src/transformers/models/bark/processing_bark.py#L188
# So for testing purposes, we will remove the unavailable speaker embeddings before saving.
processor._verify_speaker_embeddings(remove_unavailable=True)
processor.save_pretrained(
self.tmpdirname,
speaker_embeddings_dict_path=self.speaker_embeddings_dict_path,
speaker_embeddings_directory=self.speaker_embeddings_directory,
)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
processor = BarkProcessor.from_pretrained(
self.tmpdirname,
self.speaker_embeddings_dict_path,
bos_token="(BOS)",
eos_token="(EOS)",
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
def test_speaker_embeddings(self):
processor = BarkProcessor.from_pretrained(
pretrained_processor_name_or_path=self.checkpoint,
speaker_embeddings_dict_path=self.speaker_embeddings_dict_path,
)
seq_len = 35
nb_codebooks_coarse = 2
nb_codebooks_total = 8
voice_preset = {
"semantic_prompt": np.ones(seq_len),
"coarse_prompt": np.ones((nb_codebooks_coarse, seq_len)),
"fine_prompt": np.ones((nb_codebooks_total, seq_len)),
}
# test providing already loaded voice_preset
inputs = processor(text=self.input_string, voice_preset=voice_preset)
processed_voice_preset = inputs["history_prompt"]
for key in voice_preset:
self.assertListEqual(voice_preset[key].tolist(), processed_voice_preset.get(key, np.array([])).tolist())
# test loading voice preset from npz file
tmpfilename = os.path.join(self.tmpdirname, "file.npz")
np.savez(tmpfilename, **voice_preset)
inputs = processor(text=self.input_string, voice_preset=tmpfilename)
processed_voice_preset = inputs["history_prompt"]
for key in voice_preset:
self.assertListEqual(voice_preset[key].tolist(), processed_voice_preset.get(key, np.array([])).tolist())
# test loading voice preset from the hub
inputs = processor(text=self.input_string, voice_preset=self.voice_preset)
def test_speaker_embeddings_saving_rejects_path_traversal(self):
# A malicious speaker_embeddings_path.json dict key must not be usable to escape the save
# directory and write attacker-controlled content to an arbitrary path (path traversal, CWE-22).
tokenizer = self.get_tokenizer()
seq_len = 5
with tempfile.TemporaryDirectory() as tmp_dir_name:
# Plant the per-prompt npy files the malicious "repo" claims to provide so that
# `_load_voice_preset` succeeds and we reach the vulnerable `np.save` call.
np.save(os.path.join(tmp_dir_name, "s.npy"), np.ones(seq_len))
np.save(os.path.join(tmp_dir_name, "c.npy"), np.ones((2, seq_len)))
np.save(os.path.join(tmp_dir_name, "f.npy"), np.ones((8, seq_len)))
speaker_embeddings = {
"repo_or_path": tmp_dir_name,
"../../PWNED": {"semantic_prompt": "s.npy", "coarse_prompt": "c.npy", "fine_prompt": "f.npy"},
}
processor = BarkProcessor(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)
save_dir = os.path.join(tmp_dir_name, "save")
# Where the "../../PWNED" key would land if traversal succeeded.
canary = os.path.join(tmp_dir_name, "PWNED_semantic_prompt.npy")
with self.assertRaises(ValueError):
processor.save_pretrained(save_dir)
self.assertFalse(os.path.exists(canary))
def test_speaker_embeddings_saving_allows_subdirectories(self):
# Voice presets are legitimately stored in subdirectories (e.g. the `v2/...` presets in
# ylacombe/bark-large), so saving a nested key must be allowed - the guard rejects escapes only.
tokenizer = self.get_tokenizer()
seq_len = 5
with tempfile.TemporaryDirectory() as tmp_dir_name:
np.save(os.path.join(tmp_dir_name, "s.npy"), np.ones(seq_len))
np.save(os.path.join(tmp_dir_name, "c.npy"), np.ones((2, seq_len)))
np.save(os.path.join(tmp_dir_name, "f.npy"), np.ones((8, seq_len)))
speaker_embeddings = {
"repo_or_path": tmp_dir_name,
"v2/en_speaker_0": {"semantic_prompt": "s.npy", "coarse_prompt": "c.npy", "fine_prompt": "f.npy"},
}
processor = BarkProcessor(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)
save_dir = os.path.join(tmp_dir_name, "save")
processor.save_pretrained(save_dir)
self.assertTrue(
os.path.exists(os.path.join(save_dir, "speaker_embeddings", "v2", "en_speaker_0_semantic_prompt.npy"))
)
def test_load_voice_preset_rejects_path_traversal(self):
# The per-prompt paths in speaker_embeddings_path.json are also untrusted and are joined onto
# repo_or_path before being read, so a "../x" value must be rejected before the file is loaded.
tokenizer = self.get_tokenizer()
seq_len = 5
with tempfile.TemporaryDirectory() as tmp_dir_name:
repo_dir = os.path.join(tmp_dir_name, "repo")
os.makedirs(repo_dir)
# A readable .npy outside the repo dir that the traversal would otherwise reach.
np.save(os.path.join(tmp_dir_name, "PWNED.npy"), np.ones(seq_len))
speaker_embeddings = {
"repo_or_path": repo_dir,
"evil": {
"semantic_prompt": "../PWNED.npy",
"coarse_prompt": "../PWNED.npy",
"fine_prompt": "../PWNED.npy",
},
}
processor = BarkProcessor(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)
with self.assertRaises(ValueError):
processor._load_voice_preset("evil")
def test_load_voice_preset_allows_symlinked_cache_files(self):
# The path-traversal guard must be lexical, not symlink-resolving: the HF hub cache stores each
# snapshot file as a symlink into a sibling `blobs/` dir (snapshots/<rev>/f -> ../../blobs/<sha>),
# which sits outside repo_or_path. A resolve()/realpath()-based check would follow that symlink
# out of repo_or_path and wrongly reject a legitimate load, so loading must still succeed here.
tokenizer = self.get_tokenizer()
seq_len = 5
with tempfile.TemporaryDirectory() as tmp_dir_name:
# Mimic the hub cache layout: models--org--model/{blobs,snapshots/<rev>/...}.
repo_cache = os.path.join(tmp_dir_name, "models--dummy--bark")
blobs_dir = os.path.join(repo_cache, "blobs")
snapshot_dir = os.path.join(repo_cache, "snapshots", "deadbeef")
os.makedirs(blobs_dir)
os.makedirs(snapshot_dir)
arrays = {
"semantic_prompt": np.ones(seq_len),
"coarse_prompt": np.ones((2, seq_len)),
"fine_prompt": np.ones((8, seq_len)),
}
voice_preset_paths = {}
for key, array in arrays.items():
blob = os.path.join(blobs_dir, key) # real content lives in blobs/
np.save(blob, array, allow_pickle=False)
# ...and the snapshot exposes it as a relative symlink, exactly like the real cache.
link = os.path.join(snapshot_dir, f"{key}.npy")
os.symlink(os.path.relpath(blob + ".npy", snapshot_dir), link)
voice_preset_paths[key] = f"{key}.npy"
self.assertTrue(os.path.islink(os.path.join(snapshot_dir, "semantic_prompt.npy")))
speaker_embeddings = {"repo_or_path": snapshot_dir, "preset": voice_preset_paths}
processor = BarkProcessor(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)
voice_preset = processor._load_voice_preset("preset")
for key, array in arrays.items():
self.assertTrue(np.array_equal(voice_preset[key], array))
def test_tokenizer(self):
tokenizer = self.get_tokenizer()
processor = BarkProcessor(tokenizer=tokenizer)
encoded_processor = processor(text=self.input_string)
encoded_tok = tokenizer(
self.input_string,
padding="max_length",
max_length=256,
add_special_tokens=False,
return_attention_mask=True,
return_token_type_ids=False,
)
for key in encoded_tok:
self.assertListEqual(encoded_tok[key], encoded_processor[key].squeeze().tolist())

View File

File diff suppressed because it is too large Load Diff

View File

View File

@@ -0,0 +1,42 @@
# Copyright 2019 Hugging Face inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import BarthezTokenizer
from transformers.testing_utils import require_sentencepiece, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
@require_sentencepiece
@require_tokenizers
class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "moussaKam/mbarthez"
tokenizer_class = BarthezTokenizer
integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '', '😊', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '', '生活的真谛是', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '<s>', '▁hi', '<s>', '▁there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁ir', 'd', '▁and', '', 'ปี', '▁ir', 'd', '', '', '▁Hey', '▁how', '▁are', '▁you', '▁doing'] # fmt: skip
integration_expected_token_ids = [2078, 75, 10, 1938, 6, 3, 78, 402, 49997, 23, 387, 7648, 4, 124, 663, 75, 41564, 362, 5, 6, 3, 1739, 18324, 1739, 18324, 18324, 0, 901, 0, 1749, 451, 13564, 39363, 3354, 166, 72171, 22, 21077, 64, 12, 18324, 5, 3007, 172, 64, 124, 6, 3, 172, 64, 6, 3, 14833, 2271, 482, 329, 11028] # fmt: skip
expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '', '<unk>', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '', '<unk>', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '<s>', '▁hi', '<s>', '▁there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁ir', 'd', '▁and', '', '<unk>', '▁ir', 'd', '', '<unk>', '▁Hey', '▁how', '▁are', '▁you', '▁doing'] # fmt: skip
integration_expected_decoded_text = "This is a test <unk> I was born in 92000, and this is falsé. <unk> Hi Hello Hi Hello Hello<s> hi<s> there The following string should be properly encoded: Hello. But ird and <unk> ird <unk> Hey how are you doing"
@classmethod
def setUpClass(cls):
super().setUpClass()
from_pretrained_id = "moussaKam/mbarthez"
tokenizer = BarthezTokenizer.from_pretrained(from_pretrained_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(cls.tmpdirname)

View File

View File

@@ -0,0 +1,78 @@
# Copyright 2021 HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
import unittest
from transformers.models.bartpho.tokenization_bartpho import VOCAB_FILES_NAMES, BartphoTokenizer
from transformers.testing_utils import get_tests_dir
from ...test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "vinai/bartpho-syllable"
tokenizer_class = BartphoTokenizer
test_rust_tokenizer = False
test_sentencepiece = True
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.special_tokens_map = {"unk_token": "<unk>"}
@classmethod
def get_tokenizer(cls, pretrained_name=None, **kwargs):
"""Create a fresh tokenizer for each test instead of loading from saved."""
kwargs.update(cls.special_tokens_map)
# Create a temporary directory for this tokenizer
tmpdir = tempfile.mkdtemp()
vocab = ["▁This", "▁is", "▁a", "▁t", "est"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
monolingual_vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["monolingual_vocab_file"])
with open(monolingual_vocab_file, "w", encoding="utf-8") as fp:
fp.writelines(f"{token} {vocab_tokens[token]}\n" for token in vocab_tokens)
return BartphoTokenizer(SAMPLE_VOCAB, monolingual_vocab_file, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "This is a là test"
output_text = "This is a<unk><unk> test"
return input_text, output_text
def test_full_tokenizer(self):
vocab = ["▁This", "▁is", "▁a", "▁t", "est"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
special_tokens_map = {"unk_token": "<unk>"}
with tempfile.TemporaryDirectory() as tmpdir:
monolingual_vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["monolingual_vocab_file"])
with open(monolingual_vocab_file, "w", encoding="utf-8") as fp:
fp.writelines(f"{token} {vocab_tokens[token]}\n" for token in vocab_tokens)
tokenizer = BartphoTokenizer(SAMPLE_VOCAB, monolingual_vocab_file, **special_tokens_map)
text = "This is a là test"
bpe_tokens = "▁This ▁is ▁a ▁l à ▁t est".split()
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [4, 5, 6, 3, 3, 7, 8, 3]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

View File

View File

@@ -0,0 +1,309 @@
# Copyright 2021 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from datasets import load_dataset
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
class BeitImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_center_crop=True,
crop_size=None,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_reduce_labels=False,
):
size = size if size is not None else {"height": 20, "width": 20}
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_reduce_labels = do_reduce_labels
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_center_crop": self.do_center_crop,
"crop_size": self.crop_size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_reduce_labels": self.do_reduce_labels,
}
def expected_output_image_shape(self, images):
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
def prepare_semantic_single_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch
@require_vision
class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.image_processor_tester = BeitImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_center_crop"))
self.assertTrue(hasattr(image_processing, "crop_size"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_reduce_labels"))
def test_image_processor_from_dict_with_kwargs(self):
for image_processing_class in self.image_processing_classes.values():
image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 20, "width": 20})
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
self.assertEqual(image_processor.do_reduce_labels, False)
image_processor = image_processing_class.from_dict(
self.image_processor_dict, size=42, crop_size=84, do_reduce_labels=True
)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
self.assertEqual(image_processor.do_reduce_labels, True)
def test_call_segmentation_maps(self):
for image_processing_class in self.image_processing_classes.values():
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
maps = []
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
maps.append(torch.zeros(image.shape[-2:]).long())
# Test not batched input
encoding = image_processing(image_inputs[0], maps[0], return_tensors="pt")
self.assertEqual(
encoding["pixel_values"].shape,
(
1,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(
encoding["labels"].shape,
(
1,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(encoding["labels"].dtype, torch.long)
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
# Test batched
encoding = image_processing(image_inputs, maps, return_tensors="pt")
self.assertEqual(
encoding["pixel_values"].shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(
encoding["labels"].shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(encoding["labels"].dtype, torch.long)
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
# Test not batched input (PIL images)
image, segmentation_map = prepare_semantic_single_inputs()
encoding = image_processing(image, segmentation_map, return_tensors="pt")
self.assertEqual(
encoding["pixel_values"].shape,
(
1,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(
encoding["labels"].shape,
(
1,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(encoding["labels"].dtype, torch.long)
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
# Test batched input (PIL images)
images, segmentation_maps = prepare_semantic_batch_inputs()
encoding = image_processing(images, segmentation_maps, return_tensors="pt")
self.assertEqual(
encoding["pixel_values"].shape,
(
2,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(
encoding["labels"].shape,
(
2,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.assertEqual(encoding["labels"].dtype, torch.long)
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
def test_reduce_labels(self):
for image_processing_class in self.image_processing_classes.values():
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# ADE20k has 150 classes, and the background is included, so labels should be between 0 and 150
image, map = prepare_semantic_single_inputs()
encoding = image_processing(image, map, return_tensors="pt")
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 150)
image_processing.do_reduce_labels = True
encoding = image_processing(image, map, return_tensors="pt")
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
# Ensure reduce label returns the same number of masks
image, map = prepare_semantic_batch_inputs()
encoding = image_processing(image, map, return_tensors="pt")
self.assertTrue(len(encoding["labels"]) == len(map))
def test_backends_equivalence(self):
if len(self.image_processing_classes) < 2:
self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
dummy_image, dummy_map = prepare_semantic_single_inputs()
encodings = {}
for backend_name, image_processing_class in self.image_processing_classes.items():
image_processor = image_processing_class(**self.image_processor_dict)
encodings[backend_name] = image_processor(dummy_image, segmentation_maps=dummy_map, return_tensors="pt")
backend_names = list(encodings.keys())
reference_backend = backend_names[0]
reference_pixel_values = encodings[reference_backend].pixel_values
reference_labels = encodings[reference_backend].labels.float()
for backend_name in backend_names[1:]:
self._assert_tensors_equivalence(reference_pixel_values, encodings[backend_name].pixel_values)
self._assert_tensors_equivalence(reference_labels, encodings[backend_name].labels.float())
def test_backends_equivalence_batched(self):
if len(self.image_processing_classes) < 2:
self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images, dummy_maps = prepare_semantic_batch_inputs()
encodings = {}
for backend_name, image_processing_class in self.image_processing_classes.items():
image_processor = image_processing_class(**self.image_processor_dict)
encodings[backend_name] = image_processor(dummy_images, segmentation_maps=dummy_maps, return_tensors="pt")
backend_names = list(encodings.keys())
reference_backend = backend_names[0]
reference_pixel_values = encodings[reference_backend].pixel_values
reference_labels = encodings[reference_backend].labels.float()
for backend_name in backend_names[1:]:
self._assert_tensors_equivalence(reference_pixel_values, encodings[backend_name].pixel_values)
self._assert_tensors_equivalence(reference_labels, encodings[backend_name].labels.float())

View File

@@ -0,0 +1,571 @@
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch BEiT model."""
import unittest
from functools import cached_property
import pytest
from datasets import load_dataset
from transformers import BeitConfig
from transformers.testing_utils import (
require_torch,
require_torch_multi_gpu,
require_vision,
slow,
torch_device,
)
from transformers.utils import (
is_torch_available,
is_vision_available,
)
from ...test_backbone_common import BackboneTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from torch import nn
from transformers import (
BeitBackbone,
BeitForImageClassification,
BeitForMaskedImageModeling,
BeitForSemanticSegmentation,
BeitModel,
)
from transformers.models.auto.modeling_auto import MODEL_FOR_BACKBONE_MAPPING_NAMES, MODEL_MAPPING_NAMES
if is_vision_available():
from PIL import Image
from transformers import BeitImageProcessorPil
class BeitModelTester:
def __init__(
self,
parent,
vocab_size=100,
batch_size=13,
image_size=30,
patch_size=2,
num_channels=3,
is_training=True,
use_labels=True,
hidden_size=32,
num_hidden_layers=4,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
type_sequence_label_size=10,
initializer_range=0.02,
num_labels=3,
scope=None,
out_indices=[1, 2, 3, 4],
out_features=["stage1", "stage2", "stage3", "stage4"],
attn_implementation="eager",
mask_ratio=0.5,
use_relative_position_bias=False,
use_shared_relative_position_bias=False,
add_fpn=True,
):
self.parent = parent
self.vocab_size = vocab_size
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.is_training = is_training
self.use_labels = use_labels
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.scope = scope
self.out_indices = out_indices
self.out_features = out_features
self.num_labels = num_labels
self.add_fpn = add_fpn
# in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
num_patches = (image_size // patch_size) ** 2
self.seq_length = num_patches + 1
self.mask_length = self.seq_length - 1
self.num_masks = int(mask_ratio * self.seq_length)
self.attn_implementation = attn_implementation
self.use_relative_position_bias = use_relative_position_bias
self.use_shared_relative_position_bias = use_shared_relative_position_bias
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
pixel_labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
config = self.get_config()
return config, pixel_values, labels, pixel_labels
def get_config(self):
return BeitConfig(
vocab_size=self.vocab_size,
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
is_decoder=False,
initializer_range=self.initializer_range,
out_indices=self.out_indices,
out_features=self.out_features,
attn_implementation=self.attn_implementation,
use_relative_position_bias=self.use_relative_position_bias,
use_shared_relative_position_bias=self.use_shared_relative_position_bias,
add_fpn=self.add_fpn,
)
def create_and_check_model(self, config, pixel_values, labels, pixel_labels):
model = BeitModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_backbone(self, config, pixel_values, labels, pixel_labels):
config.add_fpn = False
model = BeitBackbone(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
# verify hidden states
self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
expected_height = expected_width = self.image_size // config.patch_size
self.parent.assertListEqual(
list(result.feature_maps[0].shape), [self.batch_size, self.hidden_size, expected_height, expected_width]
)
# verify channels
self.parent.assertEqual(len(model.channels), len(config.out_features))
# verify backbone works with out_features=None
config.out_features = None
model = BeitBackbone(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
# verify feature maps
self.parent.assertEqual(len(result.feature_maps), 1)
self.parent.assertListEqual(
list(result.feature_maps[0].shape), [self.batch_size, self.hidden_size, expected_height, expected_width]
)
# verify channels
self.parent.assertEqual(len(model.channels), 1)
def create_and_check_for_masked_lm(self, config, pixel_values, labels, pixel_labels):
model = BeitForMaskedImageModeling(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length - 1, self.vocab_size))
def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels):
config.num_labels = self.type_sequence_label_size
model = BeitForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
# test greyscale images
config.num_channels = 1
model = BeitForImageClassification(config)
model.to(torch_device)
model.eval()
pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels):
config.num_labels = self.num_labels
model = BeitForSemanticSegmentation(config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(
result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2)
)
result = model(pixel_values, labels=pixel_labels)
self.parent.assertEqual(
result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2)
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels, pixel_labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as BEiT does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (
(
BeitModel,
BeitForImageClassification,
BeitForMaskedImageModeling,
BeitForSemanticSegmentation,
BeitBackbone,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{
"image-feature-extraction": BeitModel,
"image-classification": BeitForImageClassification,
"image-segmentation": BeitForSemanticSegmentation,
}
if is_torch_available()
else {}
)
test_resize_embeddings = False
def setUp(self):
self.model_tester = BeitModelTester(self)
self.config_tester = ConfigTester(self, config_class=BeitConfig, has_text_modality=False, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="BEiT does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@require_torch_multi_gpu
@unittest.skip(reason="BEiT has some layers using `add_module` which doesn't work well with `nn.DataParallel`")
def test_multi_gpu_data_parallel_forward(self):
pass
@unittest.skip(reason="BEiT does not support feedforward chunking yet")
def test_feed_forward_chunking(self):
pass
@unittest.skip(reason="BEiT can't compile dynamic")
@pytest.mark.torch_compile_test
def test_sdpa_can_compile_dynamic(self):
pass
def test_model_get_set_embeddings(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_backbone(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_backbone(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
def test_for_semantic_segmentation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs)
def test_training(self):
if not self.model_tester.is_training:
self.skipTest(reason="model_tester.is_training is set to False")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
for model_class in self.all_model_classes:
# we don't test BeitForMaskedImageModeling
if model_class.__name__ in [
*MODEL_MAPPING_NAMES.values(),
*MODEL_FOR_BACKBONE_MAPPING_NAMES.values(),
"BeitForMaskedImageModeling",
]:
continue
model = model_class(config)
model.to(torch_device)
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
if not self.model_tester.is_training:
self.skipTest(reason="model_tester.is_training is set to False")
config.use_cache = False
config.return_dict = True
for model_class in self.all_model_classes:
# we don't test BeitForMaskedImageModeling
if (
model_class.__name__
in [
*MODEL_MAPPING_NAMES.values(),
*MODEL_FOR_BACKBONE_MAPPING_NAMES.values(),
"BeitForMaskedImageModeling",
]
or not model_class.supports_gradient_checkpointing
):
continue
model = model_class(config)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
model.to(torch_device)
model.train()
inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
loss = model(**inputs).loss
loss.backward()
def test_reverse_loading_mapping(self):
# Enable both per-layer and shared relative position bias so that every
# mapping rule has at least one matching key in the model state dict.
self.model_tester.use_relative_position_bias = True
self.model_tester.use_shared_relative_position_bias = True
try:
super().test_reverse_loading_mapping()
finally:
self.model_tester.use_relative_position_bias = False
self.model_tester.use_shared_relative_position_bias = False
@slow
def test_model_from_pretrained(self):
model_name = "microsoft/beit-base-patch16-224"
model = BeitModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torch
@require_vision
class BeitModelIntegrationTest(unittest.TestCase):
@cached_property
def default_image_processor(self):
return (
BeitImageProcessorPil.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None
)
@slow
def test_inference_masked_image_modeling_head(self):
model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
# prepare bool_masked_pos
bool_masked_pos = torch.ones((1, 196), dtype=torch.bool).to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(pixel_values=pixel_values, bool_masked_pos=bool_masked_pos)
logits = outputs.logits
# verify the logits
expected_shape = torch.Size((1, 196, 8192))
self.assertEqual(logits.shape, expected_shape)
expected_slice = torch.tensor(
[[-3.2437, 0.5072, -13.9174], [-3.2456, 0.4948, -13.9401], [-3.2033, 0.5121, -13.8550]]
).to(torch_device)
torch.testing.assert_close(logits[bool_masked_pos][:3, :3], expected_slice, rtol=1e-2, atol=1e-2)
@slow
def test_inference_image_classification_head_imagenet_1k(self):
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(logits.shape, expected_shape)
expected_slice = torch.tensor([-1.2385, -1.0987, -1.0108]).to(torch_device)
torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
expected_class_idx = 281
self.assertEqual(logits.argmax(-1).item(), expected_class_idx)
@slow
def test_inference_image_classification_head_imagenet_22k(self):
model = BeitForImageClassification.from_pretrained("microsoft/beit-large-patch16-224-pt22k-ft22k").to(
torch_device
)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
# verify the logits
expected_shape = torch.Size((1, 21841))
self.assertEqual(logits.shape, expected_shape)
expected_slice = torch.tensor([1.6881, -0.2787, 0.5901]).to(torch_device)
torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
expected_class_idx = 2396
self.assertEqual(logits.argmax(-1).item(), expected_class_idx)
@slow
def test_inference_semantic_segmentation(self):
model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
model = model.to(torch_device)
image_processor = BeitImageProcessorPil(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
# verify the logits
expected_shape = torch.Size((1, 150, 160, 160))
self.assertEqual(logits.shape, expected_shape)
expected_slice = torch.tensor(
[
[[-4.8960, -2.3688, -3.0355], [-2.8479, -0.9836, -1.7418], [-2.9449, -1.3333, -2.1456]],
[[-5.8081, -3.4124, -4.1006], [-3.8561, -2.2081, -3.0323], [-3.8365, -2.4601, -3.3669]],
[[-0.0309, 3.9868, 4.0540], [2.9640, 4.6877, 4.9976], [3.2081, 4.7690, 4.9942]],
],
device=torch_device,
)
torch.testing.assert_close(logits[0, :3, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
@slow
def test_post_processing_semantic_segmentation(self):
model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
model = model.to(torch_device)
image_processor = BeitImageProcessorPil(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
outputs.logits = outputs.logits.detach().cpu()
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)])
expected_shape = torch.Size((500, 300))
self.assertEqual(segmentation[0].shape, expected_shape)
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
expected_shape = torch.Size((160, 160))
self.assertEqual(segmentation[0].shape, expected_shape)
@slow
def test_inference_interpolate_pos_encoding(self):
model_name = "microsoft/beit-base-patch16-224-pt22k"
model = BeitModel.from_pretrained(model_name, **{"use_absolute_position_embeddings": True}).to(torch_device)
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
processor = BeitImageProcessorPil.from_pretrained(model_name)
inputs = processor(images=image, return_tensors="pt", size={"height": 480, "width": 480})
pixel_values = inputs.pixel_values.to(torch_device)
# with interpolate_pos_encoding being True the model should process the higher resolution image
# successfully and produce the expected output.
with torch.no_grad():
outputs = model(pixel_values, interpolate_pos_encoding=True)
# num_cls_tokens + (height / patch_size) * (width / patch_size)
# 1 + (480 / 16) * (480 / 16) = 1 + 30 * 30 = 901
expected_shape = torch.Size((1, 901, 768))
self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
@require_torch
class BeitBackboneTest(unittest.TestCase, BackboneTesterMixin):
all_model_classes = (BeitBackbone,) if is_torch_available() else ()
config_class = BeitConfig
def setUp(self):
self.model_tester = BeitModelTester(self, add_fpn=False)

View File

View File

@@ -0,0 +1,582 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import BertConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import (
require_torch,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
MODEL_FOR_PRETRAINING_MAPPING,
BertForMaskedLM,
BertForMultipleChoice,
BertForNextSentencePrediction,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
BertLMHeadModel,
BertModel,
)
class BertModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
"""
Returns a tiny configuration by default.
"""
return BertConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
)
def prepare_config_and_inputs_for_decoder(self):
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = self.prepare_config_and_inputs()
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return (
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BertModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_model_as_decoder(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
config.add_cross_attention = True
model = BertModel(config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
)
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_for_causal_lm(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
model = BertLMHeadModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_model_for_causal_lm_as_decoder(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
config.add_cross_attention = True
model = BertLMHeadModel(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
encoder_hidden_states=encoder_hidden_states,
)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_decoder_model_past_large_inputs(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
config.is_decoder = True
config.add_cross_attention = True
model = BertLMHeadModel(config=config).to(torch_device).eval()
# first forward pass
outputs = model(
input_ids,
attention_mask=input_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=True,
)
past_key_values = outputs.past_key_values
# create hypothetical multiple next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
output_from_no_past = model(
next_input_ids,
attention_mask=next_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_hidden_states=True,
)["hidden_states"][0]
output_from_past = model(
next_tokens,
attention_mask=next_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
output_hidden_states=True,
)["hidden_states"][0]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_for_next_sequence_prediction(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BertForNextSentencePrediction(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=sequence_labels,
)
self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
def create_and_check_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BertForPreTraining(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
next_sentence_label=sequence_labels,
)
self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
def create_and_check_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = BertForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = BertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_choices = self.num_choices
model = BertForMultipleChoice(config=config)
model.to(torch_device)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
BertModel,
BertLMHeadModel,
BertForMaskedLM,
BertForMultipleChoice,
BertForNextSentencePrediction,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{
"feature-extraction": BertModel,
"fill-mask": BertForMaskedLM,
"text-classification": BertForSequenceClassification,
"text-generation": BertLMHeadModel,
"token-classification": BertForTokenClassification,
"zero-shot": BertForSequenceClassification,
}
if is_torch_available()
else {}
)
model_split_percents = [0.5, 0.8, 0.9]
# special case for ForPreTraining model
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
if return_labels:
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
)
inputs_dict["next_sentence_label"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
return inputs_dict
# Overwriting to add `is_decoder` flag
def prepare_config_and_inputs_for_generate(self, batch_size=2):
config, inputs = super().prepare_config_and_inputs_for_generate(batch_size)
config.is_decoder = True
return config, inputs
def setUp(self):
self.model_tester = BertModelTester(self)
self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_model_as_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
def test_model_as_decoder_with_default_input_mask(self):
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
) = self.model_tester.prepare_config_and_inputs_for_decoder()
input_mask = None
self.model_tester.create_and_check_model_as_decoder(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def test_for_causal_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
def test_for_causal_lm_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs)
def test_decoder_model_past_with_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
def test_for_next_sequence_prediction(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs)
def test_for_pretraining(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model_name = "google-bert/bert-base-uncased"
model = BertModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_torch
class BertModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_no_head_absolute_embedding(self):
model = BertModel.from_pretrained("google-bert/bert-base-uncased")
input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
with torch.no_grad():
output = model(input_ids, attention_mask=attention_mask)[0]
expected_shape = torch.Size((1, 11, 768))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor([[[0.4249, 0.1008, 0.7531], [0.3771, 0.1188, 0.7467], [0.4152, 0.1098, 0.7108]]])
torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4)

View File

@@ -0,0 +1,34 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.models.bert.tokenization_bert import (
BertTokenizer,
)
from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = ["google-bert/bert-base-uncased"]
tokenizer_class = BertTokenizer
integration_expected_tokens = ['[UNK]', 'is', 'a', 'test', '[UNK]', '[UNK]', 'was', 'born', 'in', '92', '##00', '##0', ',', 'and', 'this', 'is', '[UNK]', '.', '', '[UNK]', '', '', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '<', 's', '>', 'hi', '<', 's', '>', 'there', '[UNK]', 'following', 'string', 'should', 'be', 'properly', 'encoded', ':', '[UNK]', '.', '[UNK]', 'ir', '##d', 'and', '[UNK]', 'ir', '##d', '[UNK]', '[UNK]', 'how', 'are', 'you', 'doing'] # fmt: skip
integration_expected_token_ids = [100, 2003, 1037, 3231, 100, 100, 2001, 2141, 1999, 6227, 8889, 2692, 1010, 1998, 2023, 2003, 100, 1012, 1910, 100, 1916, 1921, 100, 100, 100, 100, 100, 100, 100, 1026, 1055, 1028, 7632, 1026, 1055, 1028, 2045, 100, 2206, 5164, 2323, 2022, 7919, 12359, 1024, 100, 1012, 100, 20868, 2094, 1998, 100, 20868, 2094, 100, 100, 2129, 2024, 2017, 2725] # fmt: skip
expected_tokens_from_ids = ['[UNK]', 'is', 'a', 'test', '[UNK]', '[UNK]', 'was', 'born', 'in', '92', '##00', '##0', ',', 'and', 'this', 'is', '[UNK]', '.', '', '[UNK]', '', '', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '<', 's', '>', 'hi', '<', 's', '>', 'there', '[UNK]', 'following', 'string', 'should', 'be', 'properly', 'encoded', ':', '[UNK]', '.', '[UNK]', 'ir', '##d', 'and', '[UNK]', 'ir', '##d', '[UNK]', '[UNK]', 'how', 'are', 'you', 'doing'] # fmt: skip
integration_expected_decoded_text = "[UNK] is a test [UNK] [UNK] was born in 92000, and this is [UNK]. 生 [UNK] 的 真 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] < s > hi < s > there [UNK] following string should be properly encoded : [UNK]. [UNK] ird and [UNK] ird [UNK] [UNK] how are you doing"

View File

View File

@@ -0,0 +1,345 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import BertGenerationConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import BertGenerationDecoder, BertGenerationEncoder
class BertGenerationEncoderTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=50,
initializer_range=0.02,
use_labels=True,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.use_labels = use_labels
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
if self.use_labels:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
config = self.get_config()
return config, input_ids, input_mask, token_labels
def get_config(self):
return BertGenerationConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
is_decoder=False,
initializer_range=self.initializer_range,
)
def prepare_config_and_inputs_for_decoder(self):
(
config,
input_ids,
input_mask,
token_labels,
) = self.prepare_config_and_inputs()
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return (
config,
input_ids,
input_mask,
token_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def create_and_check_model(
self,
config,
input_ids,
input_mask,
token_labels,
**kwargs,
):
model = BertGenerationEncoder(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_model_as_decoder(
self,
config,
input_ids,
input_mask,
token_labels,
encoder_hidden_states,
encoder_attention_mask,
**kwargs,
):
config.add_cross_attention = True
model = BertGenerationEncoder(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
result = model(
input_ids,
attention_mask=input_mask,
encoder_hidden_states=encoder_hidden_states,
)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_decoder_model_past_large_inputs(
self,
config,
input_ids,
input_mask,
token_labels,
encoder_hidden_states,
encoder_attention_mask,
**kwargs,
):
config.is_decoder = True
config.add_cross_attention = True
model = BertGenerationDecoder(config=config).to(torch_device).eval()
# first forward pass
outputs = model(
input_ids,
attention_mask=input_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=True,
)
past_key_values = outputs.past_key_values
# create hypothetical multiple next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
output_from_no_past = model(
next_input_ids,
attention_mask=next_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_hidden_states=True,
)["hidden_states"][0]
output_from_past = model(
next_tokens,
attention_mask=next_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
output_hidden_states=True,
)["hidden_states"][0]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_for_causal_lm(
self,
config,
input_ids,
input_mask,
token_labels,
*args,
):
model = BertGenerationDecoder(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def prepare_config_and_inputs_for_common(self):
config, input_ids, input_mask, token_labels = self.prepare_config_and_inputs()
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BertGenerationEncoder, BertGenerationDecoder) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": BertGenerationEncoder, "text-generation": BertGenerationDecoder}
if is_torch_available()
else {}
)
# Overwriting to add `is_decoder` flag
def prepare_config_and_inputs_for_generate(self, batch_size=2):
config, inputs = super().prepare_config_and_inputs_for_generate(batch_size)
config.is_decoder = True
return config, inputs
def setUp(self):
self.model_tester = BertGenerationEncoderTester(self)
self.config_tester = ConfigTester(self, config_class=BertGenerationConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_model_as_bert(self):
config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs()
config.model_type = "bert"
self.model_tester.create_and_check_model(config, input_ids, input_mask, token_labels)
def test_model_as_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
def test_decoder_model_past_with_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
def test_model_as_decoder_with_default_input_mask(self):
(
config,
input_ids,
input_mask,
token_labels,
encoder_hidden_states,
encoder_attention_mask,
) = self.model_tester.prepare_config_and_inputs_for_decoder()
input_mask = None
self.model_tester.create_and_check_model_as_decoder(
config,
input_ids,
input_mask,
token_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def test_for_causal_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_for_causal_lm(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
self.assertIsNotNone(model)
@require_torch
class BertGenerationEncoderIntegrationTest(unittest.TestCase):
@slow
def test_inference_no_head_absolute_embedding(self):
model = BertGenerationEncoder.from_pretrained(
"google/bert_for_seq_generation_L-24_bbc_encoder", attn_implementation="eager"
)
input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size([1, 8, 1024])
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]]
)
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
@require_torch
class BertGenerationDecoderIntegrationTest(unittest.TestCase):
@slow
def test_inference_no_head_absolute_embedding(self):
model = BertGenerationDecoder.from_pretrained(
"google/bert_for_seq_generation_L-24_bbc_encoder", attn_implementation="eager"
)
input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size([1, 8, 50358])
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[-0.5788, -2.5994, -3.7054], [0.0438, 4.7997, 1.8795], [1.5862, 6.6409, 4.4638]]]
)
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)

View File

@@ -0,0 +1,241 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import cached_property
from transformers import BertGenerationTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin
SPIECE_UNDERLINE = ""
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@require_sentencepiece
class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "google/bert_for_seq_generation_L-24_bbc_encoder"
tokenizer_class = BertGenerationTokenizer
test_rust_tokenizer = False
test_sentencepiece = True
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
token = "<s>"
token_id = 1
self.assertEqual(self.get_tokenizer()._convert_token_to_id_with_added_voc(token), token_id)
self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
def test_get_vocab(self):
vocab_keys = list(self.get_tokenizer().get_vocab().keys())
self.assertEqual(vocab_keys[0], "<unk>")
self.assertEqual(vocab_keys[1], "<s>")
self.assertEqual(vocab_keys[-1], "<pad>")
self.assertEqual(len(vocab_keys), 1_002)
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 1_000)
def test_full_tokenizer(self):
tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokens = tokenizer.tokenize("This is a test")
self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens),
[285, 46, 10, 170, 382],
)
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
self.assertListEqual(
tokens,
[
SPIECE_UNDERLINE + "I",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"9",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"s",
"é",
".",
],
)
ids = tokenizer.convert_tokens_to_ids(tokens)
self.assertListEqual(
ids,
[8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
)
back_tokens = tokenizer.convert_ids_to_tokens(ids)
self.assertListEqual(
back_tokens,
[
SPIECE_UNDERLINE + "I",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"<unk>",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"s",
"<unk>",
".",
],
)
@cached_property
def big_tokenizer(self):
return BertGenerationTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
@slow
def test_tokenization_base_easy_symbols(self):
symbols = "Hello World!"
original_tokenizer_encodings = [18536, 2260, 101]
self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
# @slow
def test_tokenization_base_hard_symbols(self):
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [
871,
419,
358,
946,
991,
2521,
452,
358,
1357,
387,
7751,
3536,
112,
985,
456,
126,
865,
938,
5400,
5734,
458,
1368,
467,
786,
2462,
5246,
1159,
633,
865,
4519,
457,
582,
852,
2557,
427,
916,
508,
2253,
391,
408,
11342,
1244,
385,
100,
938,
985,
456,
574,
362,
12597,
3200,
3129,
1172,
]
self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
@require_torch
@slow
def test_torch_encode_plus_sent_to_model(self):
import torch
from transformers import BertGenerationConfig, BertGenerationEncoder
# Build sequence
first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10]
sequence = " ".join(first_ten_tokens)
encoded_sequence = self.big_tokenizer(sequence, return_tensors="pt", return_token_type_ids=False)
batch_encoded_sequence = self.big_tokenizer(
[sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False
)
config = BertGenerationConfig()
model = BertGenerationEncoder(config)
assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size
with torch.no_grad():
model(**encoded_sequence)
model(**batch_encoded_sequence)
@slow
def test_tokenizer_integration(self):
expected_encoding = {'input_ids': [[39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114], [448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip
self.tokenizer_integration_test_util(
expected_encoding=expected_encoding,
model_name="google/bert_for_seq_generation_L-24_bbc_encoder",
revision="c817d1fd1be2ffa69431227a1fe320544943d4db",
)

View File

View File

@@ -0,0 +1,448 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import tempfile
import unittest
from transformers import AutoTokenizer
from transformers.models.bert_japanese.tokenization_bert_japanese import (
VOCAB_FILES_NAMES,
BertJapaneseTokenizer,
CharacterTokenizer,
JumanppTokenizer,
MecabTokenizer,
SudachiTokenizer,
WordpieceTokenizer,
)
from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection
from ...test_tokenization_common import TokenizerTesterMixin
@custom_tokenizers
class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "cl-tohoku/bert-base-japanese"
tokenizer_class = BertJapaneseTokenizer
test_rust_tokenizer = False
space_between_special_tokens = True
@classmethod
def setUpClass(cls):
super().setUpClass()
# Create a separate temp directory for the vocab file to avoid conflicts
# with files saved by the base class setUpClass (e.g., tokenizer_config.json, added_tokens.json)
cls.vocab_tmpdirname = tempfile.mkdtemp()
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"こんにちは",
"こん",
"にちは",
"ばんは",
"##こん",
"##にちは",
"##ばんは",
"世界",
"##世界",
"",
"##、",
"",
"##。",
"アップルストア",
"外国",
"##人",
"参政",
"##権",
"此れ",
"",
"",
"です",
]
cls.vocab_file = os.path.join(cls.vocab_tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
@classmethod
def get_tokenizer(cls, pretrained_name=None, **kwargs):
"""Override to use vocab_tmpdirname instead of tmpdirname to avoid conflicts with saved tokenizer files."""
pretrained_name = pretrained_name or cls.vocab_tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
@classmethod
def tearDownClass(cls):
super().tearDownClass()
if hasattr(cls, "vocab_tmpdirname"):
shutil.rmtree(cls.vocab_tmpdirname, ignore_errors=True)
def get_input_output_texts(self, tokenizer):
input_text = "こんにちは、世界。 \nこんばんは、世界。"
output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。"
return input_text, output_text
def get_clean_sequence(self, tokenizer):
input_text, output_text = self.get_input_output_texts(tokenizer)
ids = tokenizer.encode(output_text, add_special_tokens=False)
text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
return text, ids
def test_pretokenized_inputs(self):
pass # TODO add if relevant
def test_maximum_encoding_length_pair_input(self):
pass # TODO add if relevant
def test_maximum_encoding_length_single_input(self):
pass # TODO add if relevant
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file)
tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。")
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_mecab_full_tokenizer_with_mecab_kwargs(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
)
text = "アップルストア"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["アップルストア"])
def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_unidic_lite(self):
try:
tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_unidic(self):
try:
import unidic
self.assertTrue(
os.path.isdir(unidic.DICDIR),
"The content of unidic was not downloaded. Run `python -m unidic download` before running this test case. Note that this requires 2.1GB on disk.",
)
tokenizer = MecabTokenizer(mecab_dic="unidic")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_lower(self):
tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iphone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_with_option(self):
try:
tokenizer = MecabTokenizer(
do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic"
)
except RuntimeError:
# if dict doesn't exist in the system, previous code raises this error.
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iPhone", "", "", "発売", "", "れた", "\u3000", ""],
)
def test_mecab_tokenizer_no_normalize(self):
tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iPhone", "", "", "発売", "", "", "", " ", ""],
)
@require_sudachi_projection
def test_sudachi_tokenizer_core(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core")
# fmt: off
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
[" ", "\t", "アップル", "ストア", "", "iPhone", "8", " ", "", " ", " ", "\n ", "発売", "", "", "", " ", "", " ", " "],
)
# fmt: on
@require_sudachi_projection
def test_sudachi_tokenizer_split_mode_A(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "", "参政", ""])
@require_sudachi_projection
def test_sudachi_tokenizer_split_mode_B(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])
@require_sudachi_projection
def test_sudachi_tokenizer_split_mode_C(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])
@require_sudachi_projection
def test_sudachi_full_tokenizer_with_sudachi_kwargs_split_mode_B(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_split_mode": "B"}
)
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "##人", "参政", "##権"])
@require_sudachi_projection
def test_sudachi_tokenizer_projection(self):
tokenizer = SudachiTokenizer(
sudachi_dict_type="core", sudachi_split_mode="A", sudachi_projection="normalized_nouns"
)
self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "", "", "です", ""])
@require_sudachi_projection
def test_sudachi_full_tokenizer_with_sudachi_kwargs_sudachi_projection(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_projection": "normalized_nouns"}
)
self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "", "", "です", ""])
@require_sudachi_projection
def test_sudachi_tokenizer_lower(self):
tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")
self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),[" ", "\t", "アップル", "ストア", "", "iphone", "8", " ", "", " ", " ", "\n ", "発売", "", "", "", " ", "", " ", " "]) # fmt: skip
@require_sudachi_projection
def test_sudachi_tokenizer_no_normalize(self):
tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")
self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),[" ", "\t", "アップル", "ストア", "", "iPhone", "", " ", "", " ", " ", "\n ", "発売", "", "", "", "\u3000", "", " ", " "]) # fmt: skip
@require_sudachi_projection
def test_sudachi_tokenizer_trim_whitespace(self):
tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
@require_jumanpp
def test_jumanpp_tokenizer(self):
tokenizer = JumanppTokenizer()
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),["アップル", "ストア", "", "iPhone", "8", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""]) # fmt: skip
@require_jumanpp
def test_jumanpp_tokenizer_lower(self):
tokenizer = JumanppTokenizer(do_lower_case=True)
self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),["アップル", "ストア", "", "iphone", "8", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],) # fmt: skip
@require_jumanpp
def test_jumanpp_tokenizer_no_normalize(self):
tokenizer = JumanppTokenizer(normalize_text=False)
self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),["", "", "", "", "", "ストア", "", "iPhone", "", "\u3000", "", "\u3000", "\u3000", "\u3000", "発売", "", "れた", "\u3000", ""],) # fmt: skip
@require_jumanpp
def test_jumanpp_tokenizer_trim_whitespace(self):
tokenizer = JumanppTokenizer(trim_whitespace=True)
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "れた", ""],
)
@require_jumanpp
def test_jumanpp_full_tokenizer_with_jumanpp_kwargs_trim_whitespace(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="jumanpp", jumanpp_kwargs={"trim_whitespace": True}
)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
@require_jumanpp
def test_jumanpp_tokenizer_ext(self):
tokenizer = JumanppTokenizer()
self.assertListEqual(
tokenizer.tokenize("ありがとうございますm(_ _)m見つけるのが大変です。"),
["ありがとう", "ございます", "m(_ _)m", "見つける", "", "", "大変です", ""],
)
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"] # fmt: skip
vocab = {}
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"])
self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"])
self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"]) # fmt: skip
def test_sentencepiece_tokenizer(self):
tokenizer = BertJapaneseTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp")
subword_tokenizer = tokenizer.subword_tokenizer
tokens = subword_tokenizer.tokenize("国境 の 長い トンネル を 抜ける と 雪国 であった 。")
self.assertListEqual(tokens, ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "", "▁であった", "▁。"]) # fmt: skip
tokens = subword_tokenizer.tokenize("こんばんは こんばん にち は こんにちは")
self.assertListEqual(tokens, ["▁こん", "ばん", "", "▁こん", "ばん", "▁に", "", "▁は", "▁こんにちは"])
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese")
text = tokenizer.encode("ありがとう。", add_special_tokens=False)
text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
# 2 is for "[CLS]", 3 is for "[SEP]"
assert encoded_sentence == [2] + text + [3]
assert encoded_pair == [2] + text + [3] + text_2 + [3]
@custom_tokenizers
class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "cl-tohoku/bert-base-japanese"
tokenizer_class = BertJapaneseTokenizer
test_rust_tokenizer = False
@classmethod
def setUpClass(cls):
super().setUpClass()
# Create a separate temp directory for the vocab file to avoid conflicts
# with files saved by the base class setUpClass (e.g., tokenizer_config.json, added_tokens.json)
cls.vocab_tmpdirname = tempfile.mkdtemp()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "", "", "", "", "", "", "", "", "", ""]
cls.vocab_file = os.path.join(cls.vocab_tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
@classmethod
def tearDownClass(cls):
super().tearDownClass()
if hasattr(cls, "vocab_tmpdirname"):
shutil.rmtree(cls.vocab_tmpdirname, ignore_errors=True)
@classmethod
@classmethod
def get_tokenizer(cls, pretrained_name=None, **kwargs):
"""Override to use vocab_tmpdirname instead of tmpdirname to avoid conflicts with saved tokenizer files."""
pretrained_name = pretrained_name or cls.vocab_tmpdirname
return BertJapaneseTokenizer.from_pretrained(pretrained_name, subword_tokenizer_type="character", **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "こんにちは、世界。 \nこんばんは、世界。"
output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
return input_text, output_text
def test_pretokenized_inputs(self):
pass # TODO add if relevant
def test_maximum_encoding_length_pair_input(self):
pass # TODO add if relevant
def test_maximum_encoding_length_single_input(self):
pass # TODO add if relevant
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character")
tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。")
self.assertListEqual(tokens, ["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""]) # fmt: skip
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12]
)
def test_character_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "", "", "", "", "", "", "", "", "", ""]
vocab = {}
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertListEqual(tokenizer.tokenize("こんにちは"), ["", "", "", "", ""])
self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["", "", "", "", "[UNK]"])
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese-char")
text = tokenizer.encode("ありがとう。", add_special_tokens=False)
text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
# 2 is for "[CLS]", 3 is for "[SEP]"
assert encoded_sentence == [2] + text + [3]
assert encoded_pair == [2] + text + [3] + text_2 + [3]
@custom_tokenizers
class AutoTokenizerCustomTest(unittest.TestCase):
def test_tokenizer_bert_japanese(self):
EXAMPLE_BERT_JAPANESE_ID = "cl-tohoku/bert-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(EXAMPLE_BERT_JAPANESE_ID)
self.assertIsInstance(tokenizer, BertJapaneseTokenizer)

View File

View File

@@ -0,0 +1,66 @@
# Copyright 2018 Salesforce and HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
from transformers.models.bertweet.tokenization_bertweet import VOCAB_FILES_NAMES, BertweetTokenizer
from ...test_tokenization_common import TokenizerTesterMixin
class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "vinai/bertweet-base"
tokenizer_class = BertweetTokenizer
test_rust_tokenizer = False
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["I", "m", "V@@", "R@@", "r", "e@@"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "a m</w>"]
cls.special_tokens_map = {"unk_token": "<unk>"}
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.writelines(f"{token} {vocab_tokens[token]}\n" for token in vocab_tokens)
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
@classmethod
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return BertweetTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "I am VinAI Research"
output_text = "I <unk> m V<unk> <unk> <unk> I Re<unk> e<unk> <unk> <unk> <unk>"
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = BertweetTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "I am VinAI Research"
bpe_tokens = "I a@@ m V@@ i@@ n@@ A@@ I R@@ e@@ s@@ e@@ a@@ r@@ c@@ h".split()
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [4, 3, 5, 6, 3, 3, 3, 4, 7, 9, 3, 9, 3, 3, 3, 3, 3]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

View File

View File

@@ -0,0 +1,927 @@
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch BigBird model."""
import unittest
import pytest
from transformers import BigBirdConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer
from transformers.testing_utils import require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
MODEL_FOR_PRETRAINING_MAPPING,
BigBirdForCausalLM,
BigBirdForMaskedLM,
BigBirdForMultipleChoice,
BigBirdForPreTraining,
BigBirdForQuestionAnswering,
BigBirdForSequenceClassification,
BigBirdForTokenClassification,
BigBirdModel,
)
class BigBirdModelTester:
def __init__(
self,
parent,
batch_size=7,
seq_length=128,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu_new",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=256,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
attention_type="block_sparse",
use_bias=True,
rescale_embeddings=False,
block_size=8,
num_rand_blocks=3,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
self.attention_type = attention_type
self.use_bias = use_bias
self.rescale_embeddings = rescale_embeddings
self.block_size = block_size
self.num_rand_blocks = num_rand_blocks
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return BigBirdConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_encoder_decoder=False,
initializer_range=self.initializer_range,
attention_type=self.attention_type,
use_bias=self.use_bias,
rescale_embeddings=self.rescale_embeddings,
block_size=self.block_size,
num_random_blocks=self.num_rand_blocks,
)
def prepare_config_and_inputs_for_decoder(self):
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = self.prepare_config_and_inputs()
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return (
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BigBirdModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BigBirdForPreTraining(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
next_sentence_label=sequence_labels,
)
self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, config.num_labels))
def create_and_check_model_as_decoder(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
config.add_cross_attention = True
model = BigBirdModel(config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
)
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BigBirdForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_decoder_model_past_large_inputs(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
config.is_decoder = True
config.add_cross_attention = True
model = BigBirdForCausalLM(config=config)
model.to(torch_device)
model.eval()
# first forward pass
outputs = model(
input_ids,
attention_mask=input_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=True,
)
past_key_values = outputs.past_key_values
# create hypothetical multiple next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
output_from_no_past = model(
next_input_ids,
attention_mask=next_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=False,
output_hidden_states=True,
)["hidden_states"][0]
output_from_past = model(
next_tokens,
attention_mask=next_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=True,
output_hidden_states=True,
)["hidden_states"][0]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BigBirdForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = BigBirdForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = BigBirdForTokenClassification(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_choices = self.num_choices
model = BigBirdForMultipleChoice(config=config)
model.to(torch_device)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
def create_and_check_for_auto_padding(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
):
model = BigBirdModel(config)
model.to(torch_device)
model.eval()
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_change_to_full_attn(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
):
model = BigBirdModel(config)
model.to(torch_device)
model.eval()
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
# the config should not be changed
self.parent.assertTrue(model.config.attention_type == "block_sparse")
@require_torch
class BigBirdModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
BigBirdModel,
BigBirdForPreTraining,
BigBirdForMaskedLM,
BigBirdForCausalLM,
BigBirdForMultipleChoice,
BigBirdForQuestionAnswering,
BigBirdForSequenceClassification,
BigBirdForTokenClassification,
)
if is_torch_available()
else ()
)
# Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
all_generative_model_classes = ()
pipeline_model_mapping = (
{
"feature-extraction": BigBirdModel,
"fill-mask": BigBirdForMaskedLM,
"text-classification": BigBirdForSequenceClassification,
"text-generation": BigBirdForCausalLM,
"token-classification": BigBirdForTokenClassification,
"zero-shot": BigBirdForSequenceClassification,
}
if is_torch_available()
else {}
)
# special case for ForPreTraining model
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
if return_labels:
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
)
inputs_dict["next_sentence_label"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
return inputs_dict
def setUp(self):
self.model_tester = BigBirdModelTester(self)
self.config_tester = ConfigTester(self, config_class=BigBirdConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_pretraining(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
def test_decoder_model_past_with_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
def test_model_as_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
def test_model_as_decoder_with_default_input_mask(self):
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
) = self.model_tester.prepare_config_and_inputs_for_decoder()
input_mask = None
self.model_tester.create_and_check_model_as_decoder(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def test_retain_grad_hidden_states_attentions(self):
# bigbird cannot keep gradients in attentions when `attention_type=block_sparse`
if self.model_tester.attention_type == "original_full":
super().test_retain_grad_hidden_states_attentions()
@slow
def test_model_from_pretrained(self):
model_name = "google/bigbird-roberta-base"
model = BigBirdForPreTraining.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_model_various_attn_type(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
for type in ["original_full", "block_sparse"]:
config_and_inputs[0].attention_type = type
self.model_tester.create_and_check_model(*config_and_inputs)
def test_fast_integration(self):
# fmt: off
input_ids = torch.tensor(
[[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 122, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 44, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 98, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73],[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 12, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 28, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 18, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73]], # noqa: E231
dtype=torch.long,
device=torch_device,
)
# fmt: on
input_ids = input_ids % self.model_tester.vocab_size
input_ids[1] = input_ids[1] - 1
attention_mask = torch.ones((input_ids.shape), device=torch_device)
attention_mask[:, :-10] = 0
config, _, _, _, _, _, _ = self.model_tester.prepare_config_and_inputs()
torch.manual_seed(0)
model = BigBirdModel(config).eval().to(torch_device)
with torch.no_grad():
hidden_states = model(input_ids, attention_mask=attention_mask).last_hidden_state
self.assertTrue(
torch.allclose(
hidden_states[0, 0, :5],
torch.tensor([1.4825, 0.0774, 0.8226, -0.2962, -0.9593], device=torch_device),
atol=1e-3,
)
)
def test_auto_padding(self):
self.model_tester.seq_length = 241
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_auto_padding(*config_and_inputs)
def test_for_change_to_full_attn(self):
self.model_tester.seq_length = 9
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_change_to_full_attn(*config_and_inputs)
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing(self):
super().test_training_gradient_checkpointing()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_false(self):
super().test_training_gradient_checkpointing_use_reentrant_false()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_true(self):
super().test_training_gradient_checkpointing_use_reentrant_true()
@require_torch
@slow
class BigBirdModelIntegrationTest(unittest.TestCase):
# we can have this true once block_sparse attn_probs works accurately
test_attention_probs = False
def _get_dummy_input_ids(self):
# fmt: off
ids = torch.tensor(
[[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 122, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 44, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 98, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73]], # noqa: E231
dtype=torch.long,
device=torch_device,
)
# fmt: on
return ids
def test_inference_block_sparse_pretraining(self):
model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base", attention_type="block_sparse")
model.to(torch_device)
input_ids = torch.tensor([[20920, 232, 328, 1437] * 1024], dtype=torch.long, device=torch_device)
with torch.no_grad():
outputs = model(input_ids)
prediction_logits = outputs.prediction_logits
seq_relationship_logits = outputs.seq_relationship_logits
self.assertEqual(prediction_logits.shape, torch.Size((1, 4096, 50358)))
self.assertEqual(seq_relationship_logits.shape, torch.Size((1, 2)))
expected_prediction_logits_slice = torch.tensor(
[
[-0.5583, 0.0475, -0.2508, 7.4423],
[0.7409, 1.4460, -0.7593, 7.7010],
[1.9150, 3.1395, 5.8840, 9.3498],
[-0.1854, -1.4640, -2.2052, 3.7968],
],
device=torch_device,
)
torch.testing.assert_close(
prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, rtol=1e-4, atol=1e-4
)
expected_seq_relationship_logits = torch.tensor([[46.9465, 47.9517]], device=torch_device)
torch.testing.assert_close(seq_relationship_logits, expected_seq_relationship_logits, rtol=1e-4, atol=1e-4)
def test_inference_full_pretraining(self):
model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base", attention_type="original_full")
model.to(torch_device)
input_ids = torch.tensor([[20920, 232, 328, 1437] * 512], dtype=torch.long, device=torch_device)
with torch.no_grad():
outputs = model(input_ids)
prediction_logits = outputs.prediction_logits
seq_relationship_logits = outputs.seq_relationship_logits
self.assertEqual(prediction_logits.shape, torch.Size((1, 512 * 4, 50358)))
self.assertEqual(seq_relationship_logits.shape, torch.Size((1, 2)))
expected_prediction_logits_slice = torch.tensor(
[
[0.1499, -1.1217, 0.1990, 8.4499],
[-2.7757, -3.0687, -4.8577, 7.5156],
[1.5446, 0.1982, 4.3016, 10.4281],
[-1.3705, -4.0130, -3.9629, 5.1526],
],
device=torch_device,
)
torch.testing.assert_close(
prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, rtol=1e-4, atol=1e-4
)
expected_seq_relationship_logits = torch.tensor([[41.4503, 41.2406]], device=torch_device)
torch.testing.assert_close(seq_relationship_logits, expected_seq_relationship_logits, rtol=1e-4, atol=1e-4)
def test_block_sparse_attention_probs(self):
"""
Asserting if outputted attention matrix is similar to hard coded attention matrix
"""
if not self.test_attention_probs:
self.skipTest("test_attention_probs is set to False")
model = BigBirdModel.from_pretrained(
"google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16
)
model.to(torch_device)
model.eval()
config = model.config
input_ids = self._get_dummy_input_ids()
hidden_states = model.embeddings(input_ids)
batch_size, seqlen, _ = hidden_states.size()
attn_mask = torch.ones(batch_size, seqlen, device=torch_device, dtype=torch.float)
to_seq_length = from_seq_length = seqlen
from_block_size = to_block_size = config.block_size
blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn(
attn_mask, config.block_size
)
from_blocked_mask = to_blocked_mask = blocked_mask
for i in range(config.num_hidden_layers):
pointer = model.encoder.layer[i].attention.self
query_layer = pointer.transpose_for_scores(pointer.query(hidden_states))
key_layer = pointer.transpose_for_scores(pointer.key(hidden_states))
value_layer = pointer.transpose_for_scores(pointer.value(hidden_states))
context_layer, attention_probs = pointer.bigbird_block_sparse_attention(
query_layer,
key_layer,
value_layer,
band_mask,
from_mask,
to_mask,
from_blocked_mask,
to_blocked_mask,
pointer.num_attention_heads,
pointer.num_random_blocks,
pointer.attention_head_size,
from_block_size,
to_block_size,
batch_size,
from_seq_length,
to_seq_length,
seed=pointer.seed,
plan_from_length=None,
plan_num_rand_blocks=None,
output_attentions=True,
)
context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
cl = torch.einsum("bhqk,bhkd->bhqd", attention_probs, value_layer)
cl = cl.view(context_layer.size())
torch.testing.assert_close(context_layer, cl, rtol=0.001, atol=0.001)
def test_block_sparse_context_layer(self):
model = BigBirdModel.from_pretrained(
"google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16
)
model.to(torch_device)
model.eval()
config = model.config
input_ids = self._get_dummy_input_ids()
dummy_hidden_states = model.embeddings(input_ids)
attn_mask = torch.ones_like(input_ids, device=torch_device)
blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn(
attn_mask, config.block_size
)
targeted_cl = torch.tensor(
[
[0.1870, 1.5248, 0.2333, -0.0483, -0.0952, 1.8359, -0.0142, 0.1239, 0.0083, -0.0045],
[-0.0601, 0.1243, 0.1329, -0.1524, 0.2347, 0.0894, -0.2248, -0.2461, -0.0645, -0.0109],
[-0.0418, 0.1463, 0.1290, -0.1638, 0.2489, 0.0799, -0.2341, -0.2406, -0.0524, 0.0106],
[0.1859, 1.5182, 0.2324, -0.0473, -0.0952, 1.8295, -0.0148, 0.1242, 0.0080, -0.0045],
[0.1879, 1.5300, 0.2334, -0.0480, -0.0967, 1.8428, -0.0137, 0.1256, 0.0087, -0.0050],
[0.1852, 1.5149, 0.2330, -0.0492, -0.0936, 1.8236, -0.0154, 0.1210, 0.0080, -0.0048],
[0.1857, 1.5186, 0.2331, -0.0484, -0.0940, 1.8285, -0.0148, 0.1224, 0.0077, -0.0045],
[0.1884, 1.5336, 0.2334, -0.0469, -0.0974, 1.8477, -0.0132, 0.1266, 0.0085, -0.0046],
[0.1881, 1.5308, 0.2334, -0.0479, -0.0969, 1.8438, -0.0136, 0.1258, 0.0088, -0.0050],
[0.1849, 1.5143, 0.2329, -0.0491, -0.0930, 1.8230, -0.0156, 0.1209, 0.0074, -0.0047],
[0.1878, 1.5299, 0.2333, -0.0472, -0.0967, 1.8434, -0.0137, 0.1257, 0.0084, -0.0048],
[0.1873, 1.5260, 0.2333, -0.0478, -0.0961, 1.8383, -0.0142, 0.1245, 0.0083, -0.0048],
[0.1849, 1.5145, 0.2327, -0.0491, -0.0935, 1.8237, -0.0156, 0.1215, 0.0083, -0.0046],
[0.1866, 1.5232, 0.2332, -0.0488, -0.0950, 1.8342, -0.0143, 0.1237, 0.0084, -0.0047],
],
device=torch_device,
)
context_layer = model.encoder.layer[0].attention.self(
dummy_hidden_states,
band_mask=band_mask,
from_mask=from_mask,
to_mask=to_mask,
from_blocked_mask=blocked_mask,
to_blocked_mask=blocked_mask,
)
context_layer = context_layer[0]
self.assertEqual(context_layer.shape, torch.Size((1, 128, 768)))
torch.testing.assert_close(context_layer[0, 64:78, 300:310], targeted_cl, rtol=0.0001, atol=0.0001)
def test_tokenizer_inference(self):
tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
model = BigBirdModel.from_pretrained(
"google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16
)
model.to(torch_device)
text = [
"Transformer-based models are unable to process long sequences due to their self-attention operation,"
" which scales quadratically with the sequence length. To address this limitation, we introduce the"
" Longformer with an attention mechanism that scales linearly with sequence length, making it easy to"
" process documents of thousands of tokens or longer. Longformers attention mechanism is a drop-in"
" replacement for the standard self-attention and combines a local windowed attention with a task"
" motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer"
" on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In"
" contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream"
" tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new"
" state-of-the-art results on WikiHop and TriviaQA."
]
inputs = tokenizer(text)
for k in inputs:
inputs[k] = torch.tensor(inputs[k], device=torch_device, dtype=torch.long)
prediction = model(**inputs)
prediction = prediction[0]
self.assertEqual(prediction.shape, torch.Size((1, 199, 768)))
expected_prediction = torch.tensor(
[
[0.1887, -0.0474, 0.2604, 0.1453],
[0.0651, 0.1999, 0.1797, 0.1161],
[0.2833, -0.3036, 0.6910, 0.1123],
[0.2836, -0.4644, -0.0111, 0.1530],
[0.3919, -0.2823, 0.4192, 0.1687],
[0.2168, -0.1956, 0.4050, 0.0925],
[0.2597, -0.0884, 0.1258, 0.1119],
[0.1127, -0.1203, 0.1924, 0.2859],
[0.1362, -0.1315, 0.2693, 0.1027],
[-0.3169, -0.2266, 0.4419, 0.6740],
[0.2366, -0.1452, 0.2589, 0.0579],
[0.0358, -0.2021, 0.3112, -0.1392],
],
device=torch_device,
)
torch.testing.assert_close(prediction[0, 52:64, 320:324], expected_prediction, rtol=1e-4, atol=1e-4)
def test_inference_question_answering(self):
tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-base-trivia-itc")
model = BigBirdForQuestionAnswering.from_pretrained(
"google/bigbird-base-trivia-itc", attention_type="block_sparse", block_size=16, num_random_blocks=3
)
model.to(torch_device)
context = (
"The BigBird model was proposed in Big Bird: Transformers for Longer Sequences by Zaheer, Manzil and"
" Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago"
" and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a"
" sparse-attention based transformer which extends Transformer based models, such as BERT to much longer"
" sequences. In addition to sparse attention, BigBird also applies global attention as well as random"
" attention to the input sequence. Theoretically, it has been shown that applying sparse, global, and"
" random attention approximates full attention, while being computationally much more efficient for longer"
" sequences. As a consequence of the capability to handle longer context, BigBird has shown improved"
" performance on various long document NLP tasks, such as question answering and summarization, compared"
" to BERT or RoBERTa."
)
question = [
"Which is better for longer sequences- BigBird or BERT?",
"What is the benefit of using BigBird over BERT?",
]
inputs = tokenizer(
question,
[context, context],
padding=True,
return_tensors="pt",
add_special_tokens=True,
max_length=256,
truncation=True,
)
inputs = {k: v.to(torch_device) for k, v in inputs.items()}
start_logits, end_logits = model(**inputs).to_tuple()
# fmt: off
target_start_logits = torch.tensor(
[[-8.5622, -9.6209, -14.3351, -8.7032, -11.8596, -7.7446, -9.6730, -13.6063, -8.9651, -11.7417, -8.2641, -8.7056, -13.4116, -5.6600, -8.8316, -10.4148, -12.2180, -7.7979, -12.5274, -6.0685, -10.3373, -11.3128, -6.6456, -14.4030, -6.8292, -14.5383, -11.5638, -6.3326, 11.5293, -1.8434, -10.0013, -7.6150], [-10.7384, -13.1179, -10.1837, -13.7700, -10.0186, -11.7335, -13.3411, -10.0188, -13.4235, -9.9381, -10.4252, -13.1281, -8.2022, -10.4326, -11.5542, -14.1549, -10.7546, -13.4691, -8.2744, -11.4324, -13.3773, -9.8284, -14.5825, -8.7471, -14.7050, -8.0364, -11.3627, -6.4638, -11.7031, -14.3446, -9.9425, -8.0088]], # noqa: E231
device=torch_device,
)
target_end_logits = torch.tensor(
[[-12.1736, -8.8487, -14.8877, -11.6713, -15.1165, -12.2396, -7.6828, -15.4153, -12.2528, -14.3671, -12.3596, -7.4272, -14.9615, -13.6356, -11.7939, -9.9767, -14.8112, -8.9567, -15.8798, -11.5291, -9.4249, -14.7544, -7.9387, -16.2789, -8.9702, -15.3111, -11.5585, -7.9992, -4.1127, 10.3209, -8.3926, -10.2005], [-11.1375, -15.4027, -12.6861, -16.9884, -13.7093, -10.3560, -15.7228, -12.9290, -15.8519, -13.7953, -10.2460, -15.7198, -14.2078, -12.8477, -11.4861, -16.1017, -11.8900, -16.4488, -13.2959, -10.3980, -15.4874, -10.3539, -16.8263, -10.9973, -17.0344, -9.2751, -10.1196, -13.8907, -12.1025, -13.0628, -12.8530, -13.8173]],
device=torch_device,
)
# fmt: on
torch.testing.assert_close(start_logits[:, 64:96], target_start_logits, rtol=1e-4, atol=1e-4)
torch.testing.assert_close(end_logits[:, 64:96], target_end_logits, rtol=1e-4, atol=1e-4)
input_ids = inputs["input_ids"].tolist()
answer = [
input_ids[i][torch.argmax(start_logits, dim=-1)[i] : torch.argmax(end_logits, dim=-1)[i] + 1]
for i in range(len(input_ids))
]
answer = tokenizer.batch_decode(answer)
self.assertTrue(answer == ["BigBird", "global attention"])
def test_fill_mask(self):
tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base")
model.to(torch_device)
input_ids = tokenizer("The goal of life is [MASK] .", return_tensors="pt").input_ids.to(torch_device)
logits = model(input_ids).logits
# [MASK] is token at 6th position
pred_token = tokenizer.decode(torch.argmax(logits[0, 6:7], axis=-1))
self.assertEqual(pred_token, "happiness")
def test_auto_padding(self):
model = BigBirdModel.from_pretrained(
"google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16
)
model.to(torch_device)
model.eval()
input_ids = torch.tensor([200 * [10] + 40 * [2] + [1]], device=torch_device, dtype=torch.long)
with torch.no_grad():
output = model(input_ids).to_tuple()[0]
# fmt: off
target = torch.tensor(
[[-0.129420, -0.164740, 0.042422, -0.336030, 0.094379, 0.033794, 0.384590, 0.229660, -0.196500, 0.108020], [-0.000154, -0.168800, 0.165820, -0.313670, 0.101240, 0.035145, 0.381880, 0.213730, -0.201080, 0.077443], [0.053754, -0.166350, 0.225520, -0.272900, 0.119670, 0.019987, 0.348670, 0.199190, -0.181600, 0.084640], [0.063636, -0.187110, 0.237010, -0.297380, 0.126300, 0.020025, 0.268490, 0.191820, -0.192300, 0.035077], [0.073893, -0.184790, 0.188870, -0.297860, 0.134280, 0.028972, 0.174650, 0.186890, -0.180530, 0.006851], [0.005253, -0.169360, 0.123100, -0.302550, 0.126930, 0.024188, 0.133410, 0.200600, -0.168210, -0.001006], [-0.093336, -0.175370, -0.004768, -0.333170, 0.114330, 0.034168, 0.120960, 0.203570, -0.162810, -0.005757], [-0.160210, -0.169310, -0.049064, -0.331950, 0.115730, 0.027062, 0.143600, 0.205310, -0.144580, 0.026746], [-0.193200, -0.156820, -0.079422, -0.351600, 0.106450, 0.032174, 0.245690, 0.210250, -0.173480, 0.043914], [-0.167980, -0.153050, -0.059764, -0.357890,0.103910, 0.031481, 0.334190, 0.208960,-0.178180, 0.072165], [-0.136990, -0.156950, -0.012099, -0.353140,0.096996, 0.025864, 0.376340, 0.216050, -0.171820, 0.089963], [-0.041143, -0.167060, 0.079754, -0.353220, 0.093247, 0.019867, 0.385810, 0.214340, -0.191800, 0.065946],[0.040373, -0.158610, 0.152570, -0.312930, 0.110590, 0.012282, 0.345270, 0.204040, -0.176500, 0.064972], [0.043762, -0.166450, 0.179500, -0.317930, 0.117280, -0.004040, 0.304490, 0.201380, -0.182780, 0.044000]], # noqa: E231
device=torch_device,
)
# fmt: on
self.assertEqual(output.shape, torch.Size((1, 241, 768)))
torch.testing.assert_close(output[0, 64:78, 300:310], target, rtol=0.0001, atol=0.0001)

View File

@@ -0,0 +1,37 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import BigBirdTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
SPIECE_UNDERLINE = ""
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@require_sentencepiece
@require_tokenizers
class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "google/bigbird-roberta-base"
tokenizer_class = BigBirdTokenizer
integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '', '😊\n', 'I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '\n生活的真谛是\n', 'Hi', '▁Hello', '\n', 'Hi', '▁Hello', '\n\n', '', '\n', '', '\n', '▁Hello', '\n', '<s>', '', '\n', 'hi', '<s>', '▁there', '\n', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '\n', 'But', '', 'ird', '▁and', '', 'ปี', '', 'ird', '', '\n', 'Hey', '▁how', '▁are', '▁you', '▁doing'] # fmt: skip
integration_expected_token_ids = [871, 419, 358, 1433, 321, 100, 141, 474, 4743, 388, 961, 11125, 112, 391, 529, 419, 27908, 266, 114, 100, 17351, 18536, 100, 17351, 18536, 100, 321, 100, 321, 100, 18536, 100, 2, 321, 100, 5404, 2, 713, 100, 565, 1809, 4832, 916, 408, 6206, 30341, 126, 18536, 114, 100, 1638, 321, 1548, 391, 321, 100, 321, 1548, 321, 100, 10915, 804, 490, 446, 1905] # fmt: skip
expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '', '<unk>', 'I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '<unk>', 'Hi', '▁Hello', '<unk>', 'Hi', '▁Hello', '<unk>', '', '<unk>', '', '<unk>', '▁Hello', '<unk>', '<s>', '', '<unk>', 'hi', '<s>', '▁there', '<unk>', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '<unk>', 'But', '', 'ird', '▁and', '', '<unk>', '', 'ird', '', '<unk>', 'Hey', '▁how', '▁are', '▁you', '▁doing'] # fmt: skip
integration_expected_decoded_text = "This is a test <unk>I was born in 92000, and this is falsé.<unk>Hi Hello<unk>Hi Hello<unk> <unk> <unk> Hello<unk><s> <unk>hi<s> there<unk>The following string should be properly encoded: Hello.<unk>But ird and <unk> ird <unk>Hey how are you doing"

View File

File diff suppressed because one or more lines are too long

View File

View File

@@ -0,0 +1,433 @@
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch BioGPT model."""
import math
import unittest
from transformers import BioGptConfig, is_sacremoses_available, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
BioGptForCausalLM,
BioGptForSequenceClassification,
BioGptForTokenClassification,
BioGptModel,
BioGptTokenizer,
)
class BioGptModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=False,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return BioGptConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
)
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = BioGptModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_biogpt_model_attention_mask_past(self, config, input_ids, input_mask, token_type_ids, *args):
model = BioGptModel(config=config)
model.to(torch_device)
model.eval()
# create attention mask
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
half_seq_length = self.seq_length // 2
attn_mask[:, half_seq_length:] = 0
# first forward pass
output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# change a random masked slice from input_ids
random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
# append to next input_ids and attn_mask
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
attn_mask = torch.cat(
[attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
dim=1,
)
# get two different outputs
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_biogpt_model_past_large_inputs(self, config, input_ids, input_mask, token_type_ids, *args):
model = BioGptModel(config=config).to(torch_device).eval()
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
# first forward pass
outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
output, past_key_values = outputs.to_tuple()
# create hypothetical multiple next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_attn_mask = ids_tensor((self.batch_size, 3), 2)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
"last_hidden_state"
]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_forward_and_backwards(
self, config, input_ids, input_mask, token_type_ids, *args, gradient_checkpointing=False
):
model = BioGptForCausalLM(config)
model.to(torch_device)
if gradient_checkpointing:
model.gradient_checkpointing_enable()
result = model(input_ids, labels=input_ids)
self.parent.assertEqual(result.loss.shape, ())
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
result.loss.backward()
def create_and_check_biogpt_weight_initialization(self, config, *args):
model = BioGptModel(config)
model_std = model.config.initializer_range / math.sqrt(2 * model.config.num_hidden_layers)
for key in model.state_dict():
if "c_proj" in key and "weight" in key:
self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
def create_and_check_biogpt_for_token_classification(self, config, input_ids, input_mask, token_type_ids, *args):
config.num_labels = self.num_labels
model = BioGptForTokenClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(BioGptModel, BioGptForCausalLM, BioGptForSequenceClassification, BioGptForTokenClassification)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{
"feature-extraction": BioGptModel,
"text-classification": BioGptForSequenceClassification,
"text-generation": BioGptForCausalLM,
"token-classification": BioGptForTokenClassification,
"zero-shot": BioGptForSequenceClassification,
}
if is_torch_available() and is_sacremoses_available()
else {}
)
def setUp(self):
self.model_tester = BioGptModelTester(self)
self.config_tester = ConfigTester(self, config_class=BioGptConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_biogpt_model_att_mask_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_biogpt_model_attention_mask_past(*config_and_inputs)
def test_biogpt_gradient_checkpointing(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True)
def test_biogpt_model_past_with_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_biogpt_model_past_large_inputs(*config_and_inputs)
def test_biogpt_weight_initialization(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_biogpt_weight_initialization(*config_and_inputs)
def test_biogpt_token_classification_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_biogpt_for_token_classification(*config_and_inputs)
@slow
def test_batch_generation(self):
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
model.to(torch_device)
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
model.generation_config.pad_token_id = model.generation_config.eos_token_id
# use different length sentences to test batching
sentences = [
"Hello, my dog is a little",
"Today, I",
]
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
input_ids = inputs["input_ids"].to(torch_device)
outputs = model.generate(
input_ids=input_ids,
attention_mask=inputs["attention_mask"].to(torch_device),
max_new_tokens=10,
)
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=10)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
# 20 is the default max_length in the generation config
output_padded = model.generate(input_ids=inputs_padded, max_length=20 - num_paddings)
batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
expected_output_sentence = [
"Hello, my dog is a little bit bigger than a little bit.",
"Today, I have a good idea of how to use the information",
]
self.assertListEqual(expected_output_sentence, batch_out_sentence)
self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
@slow
def test_model_from_pretrained(self):
model_name = "microsoft/biogpt"
model = BioGptModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# Copied from tests.models.opt.test_modeling_opt.OPTModelTest.test_opt_sequence_classification_model with OPT->BioGpt,opt->biogpt,prepare_config_and_inputs->prepare_config_and_inputs_for_common
def test_biogpt_sequence_classification_model(self):
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.num_labels = 3
input_ids = input_dict["input_ids"]
attention_mask = input_ids.ne(1).to(torch_device)
sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
model = BioGptForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
# Copied from tests.models.opt.test_modeling_opt.OPTModelTest.test_opt_sequence_classification_model_for_multi_label with OPT->BioGpt,opt->biogpt,prepare_config_and_inputs->prepare_config_and_inputs_for_common
def test_biogpt_sequence_classification_model_for_multi_label(self):
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.num_labels = 3
config.problem_type = "multi_label_classification"
input_ids = input_dict["input_ids"]
attention_mask = input_ids.ne(1).to(torch_device)
sequence_labels = ids_tensor(
[self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
).to(torch.float)
model = BioGptForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
@require_torch
class BioGptModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_lm_head_model(self):
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
input_ids = torch.tensor([[2, 4805, 9, 656, 21]])
output = model(input_ids)[0]
vocab_size = 42384
expected_shape = torch.Size((1, 5, vocab_size))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[-9.5236, -9.8918, 10.4557], [-11.0469, -9.6423, 8.1022], [-8.8664, -7.8826, 5.5325]]]
)
torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
@slow
def test_biogpt_generation_beam_search(self):
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
model.to(torch_device)
torch.manual_seed(0)
tokenized = tokenizer("COVID-19 is", return_tensors="pt").to(torch_device)
output_ids = model.generate(
**tokenized,
min_length=100,
max_length=1024,
num_beams=5,
early_stopping=True,
)
output_str = tokenizer.decode(output_ids[0])
EXPECTED_OUTPUT_STR = (
"</s>"
"COVID-19 is a global pandemic caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), the"
" causative agent of coronavirus disease 2019 (COVID-19), which has spread to more than 200 countries and"
" territories, including the United States (US), Canada, Australia, New Zealand, the United Kingdom (UK),"
" and the United States of America (USA), as of March 11, 2020, with more than 800,000 confirmed cases and"
" more than 800,000 deaths. "
"</s>"
)
self.assertEqual(output_str, EXPECTED_OUTPUT_STR)

View File

@@ -0,0 +1,132 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES, BioGptTokenizer
from transformers.testing_utils import require_sacremoses, slow
from ...test_tokenization_common import TokenizerTesterMixin
@require_sacremoses
class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "microsoft/biogpt"
tokenizer_class = BioGptTokenizer
test_rust_tokenizer = False
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
cls.vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"w</w>",
"r</w>",
"t</w>",
"lo",
"low",
"er</w>",
"low</w>",
"lowest</w>",
"newer</w>",
"wider</w>",
"<unk>",
]
cls.merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text
def test_full_tokenizer(self):
"""Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"w</w>",
"r</w>",
"t</w>",
"lo",
"low",
"er</w>",
"low</w>",
"lowest</w>",
"newer</w>",
"wider</w>",
"<unk>",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
with tempfile.TemporaryDirectory() as tmpdir:
vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["vocab_file"])
merges_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["merges_file"])
shutil.copy(self.vocab_file, vocab_file)
shutil.copy(self.merges_file, merges_file)
with open(vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(merges_file, "w") as fp:
fp.write("\n".join(merges))
tokenizer = BioGptTokenizer(vocab_file, merges_file)
text = "lower"
bpe_tokens = ["low", "er</w>"]
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + ["<unk>"]
input_bpe_tokens = [14, 15, 20]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
@slow
def test_sequence_builders(self):
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
self.assertTrue(encoded_sentence == [2] + text)
self.assertTrue(encoded_pair == [2] + text + [2] + text_2)

View File

View File

@@ -0,0 +1,116 @@
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_torch, require_vision
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
class BitImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_center_crop=True,
crop_size=None,
do_normalize=True,
image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[0.26862954, 0.26130258, 0.27577711],
do_convert_rgb=True,
):
super().__init__()
size = size if size is not None else {"shortest_edge": 20}
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_center_crop": self.do_center_crop,
"crop_size": self.crop_size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
}
def expected_output_image_shape(self, images):
return self.num_channels, self.crop_size["height"], self.crop_size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class BitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.image_processor_tester = BitImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_center_crop"))
self.assertTrue(hasattr(image_processing, "crop_size"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
def test_image_processor_from_dict_with_kwargs(self):
for image_processing_class in self.image_processing_classes.values():
image_processor = image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"shortest_edge": 20})
self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
self.assertEqual(image_processor.size, {"shortest_edge": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})

View File

@@ -0,0 +1,291 @@
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Bit model."""
import unittest
from functools import cached_property
from transformers import BitConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
from ...test_backbone_common import BackboneTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import BitBackbone, BitForImageClassification, BitImageProcessorPil, BitModel
if is_vision_available():
from PIL import Image
class BitModelTester:
def __init__(
self,
parent,
batch_size=3,
image_size=32,
num_channels=3,
embeddings_size=10,
hidden_sizes=[8, 16, 32, 64],
depths=[1, 1, 2, 1],
is_training=True,
use_labels=True,
hidden_act="relu",
num_labels=3,
scope=None,
out_features=["stage2", "stage3", "stage4"],
out_indices=[2, 3, 4],
num_groups=1,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.num_channels = num_channels
self.embeddings_size = embeddings_size
self.hidden_sizes = hidden_sizes
self.depths = depths
self.is_training = is_training
self.use_labels = use_labels
self.hidden_act = hidden_act
self.num_labels = num_labels
self.scope = scope
self.num_stages = len(hidden_sizes)
self.out_features = out_features
self.out_indices = out_indices
self.num_groups = num_groups
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.num_labels)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return BitConfig(
num_channels=self.num_channels,
embeddings_size=self.embeddings_size,
hidden_sizes=self.hidden_sizes,
depths=self.depths,
hidden_act=self.hidden_act,
num_labels=self.num_labels,
out_features=self.out_features,
out_indices=self.out_indices,
num_groups=self.num_groups,
)
def create_and_check_model(self, config, pixel_values, labels):
model = BitModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
self.parent.assertEqual(
result.last_hidden_state.shape,
(self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32),
)
def create_and_check_for_image_classification(self, config, pixel_values, labels):
config.num_labels = self.num_labels
model = BitForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_backbone(self, config, pixel_values, labels):
model = BitBackbone(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
# verify feature maps
self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4])
# verify channels
self.parent.assertEqual(len(model.channels), len(config.out_features))
self.parent.assertListEqual(model.channels, config.hidden_sizes[1:])
# verify backbone works with out_features=None
config.out_features = None
print(config)
model = BitBackbone(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
# verify feature maps
self.parent.assertEqual(len(result.feature_maps), 1)
self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[-1], 1, 1])
# verify channels
self.parent.assertEqual(len(model.channels), 1)
self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class BitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as Bit does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (BitModel, BitForImageClassification, BitBackbone) if is_torch_available() else ()
pipeline_model_mapping = (
{"image-feature-extraction": BitModel, "image-classification": BitForImageClassification}
if is_torch_available()
else {}
)
test_resize_embeddings = False
has_attentions = False
def setUp(self):
self.model_tester = BitModelTester(self)
self.config_tester = ConfigTester(
self, config_class=BitConfig, has_text_modality=False, common_properties=["num_channels"]
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="Bit does not output attentions")
def test_attention_outputs(self):
pass
@unittest.skip(reason="Bit does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Bit does not support input and output embeddings")
def test_model_get_set_embeddings(self):
pass
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_backbone(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_backbone(*config_and_inputs)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
expected_num_stages = self.model_tester.num_stages
self.assertEqual(len(hidden_states), expected_num_stages + 1)
# Bit's feature maps are of shape (batch_size, num_channels, height, width)
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[self.model_tester.image_size // 4, self.model_tester.image_size // 4],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
layers_type = ["preactivation", "bottleneck"]
for model_class in self.all_model_classes:
for layer_type in layers_type:
config.layer_type = layer_type
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
@unittest.skip(reason="Bit does not use feedforward chunking")
def test_feed_forward_chunking(self):
pass
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model_name = "google/bit-50"
model = BitModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torch
@require_vision
class BitModelIntegrationTest(unittest.TestCase):
@cached_property
def default_image_processor(self):
return BitImageProcessorPil.from_pretrained("google/bit-50") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = BitForImageClassification.from_pretrained("google/bit-50").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([[-0.6526, -0.5263, -1.4398]]).to(torch_device)
torch.testing.assert_close(outputs.logits[:, :3], expected_slice, rtol=1e-3, atol=1e-3)
@require_torch
class BitBackboneTest(BackboneTesterMixin, unittest.TestCase):
all_model_classes = (BitBackbone,) if is_torch_available() else ()
config_class = BitConfig
has_attentions = False
def setUp(self):
self.model_tester = BitModelTester(self)

View File

View File

@@ -0,0 +1,226 @@
# Copyright 2025 The BitNet team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch BitNet model."""
import gc
import unittest
from transformers import AutoTokenizer, BitNetConfig, is_torch_available
from transformers.testing_utils import (
backend_empty_cache,
require_torch,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
BitNetForCausalLM,
BitNetModel,
)
class BitNetModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
vocab_size=99,
hidden_size=64,
num_hidden_layers=2,
num_attention_heads=4,
num_key_value_heads=2,
intermediate_size=37,
hidden_act="gelu",
max_position_embeddings=512,
initializer_range=0.02,
pad_token_id=0,
bos_token_id=1,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device))
config = self.get_config()
return config, input_ids, input_mask
def get_config(self):
return BitNetConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
num_key_value_heads=self.num_key_value_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
pad_token_id=self.pad_token_id,
bos_token_id=self.bos_token_id,
)
def create_and_check_model(self, config, input_ids, input_mask):
model = BitNetModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
input_mask,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class BitNetModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
BitNetModel,
BitNetForCausalLM,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{
"feature-extraction": BitNetModel,
"text-generation": BitNetForCausalLM,
}
if is_torch_available()
else {}
)
# TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
def is_pipeline_test_to_skip(
self,
pipeline_test_case_name,
config_class,
model_architecture,
tokenizer_name,
image_processor_name,
feature_extractor_name,
processor_name,
):
return True
def setUp(self):
self.model_tester = BitNetModelTester(self)
self.config_tester = ConfigTester(self, config_class=BitNetConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@require_torch
class BitNetIntegrationTest(unittest.TestCase):
@slow
def test_model_logits(self):
input_ids = [128000, 128000, 1502, 25, 2650, 527, 499, 30, 128009, 72803, 25, 220]
model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
with torch.no_grad():
out = model(input_ids).logits.float().cpu()
# Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor(
[
[
-1.8665,
-1.7681,
-1.7043,
3.7446,
2.7730,
4.7133,
0.9768,
-3.5018,
-12.2812,
-8.1477,
-10.2571,
-8.7610,
]
]
)
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, rtol=1e-2, atol=1e-2)
# slicing logits[0, 0, 0:30]
EXPECTED_SLICE = torch.tensor([5.5815, 4.9154, 1.0478, 4.3869, 3.0112, 0.8235, 3.8412, 2.9233, 8.1140, 1.9406, 1.7973, 10.5025, 4.7796, 8.5926, 4.5196, 3.1549, 3.2656, 3.2588, 2.7356, 2.6032, 2.1454, 1.5683, 1.3465, 1.5329, 1.1886, 7.7902, 5.9326, 1.4737, 3.3319, 1.6291]) # fmt: skip
torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, rtol=1e-4, atol=1e-4)
del model
backend_empty_cache(torch_device)
gc.collect()
@slow
def test_model_generation(self):
EXPECTED_TEXT_COMPLETION = """User: What is your favourite food?Assistant: As an AI, I don't have personal preferences or the ability to eat food. However, I"""
tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": "What is your favourite food?"}], add_generation_prompt=True, tokenize=False
)
model = BitNetForCausalLM.from_pretrained(
"microsoft/bitnet-b1.58-2B-4T", device_map="auto", dtype=torch.bfloat16
)
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device)
# greedy generation outputs
generated_ids = model.generate(input_ids, max_new_tokens=20, do_sample=False)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
del model
backend_empty_cache(torch_device)
gc.collect()

View File

View File

@@ -0,0 +1,542 @@
# Copyright 2021, The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Blenderbot model."""
import tempfile
import unittest
from functools import cached_property
from transformers import BlenderbotConfig, is_torch_available
from transformers.testing_utils import (
backend_empty_cache,
require_sentencepiece,
require_tokenizers,
require_torch,
require_torch_fp16,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer
from transformers.models.blenderbot.modeling_blenderbot import (
BlenderbotDecoder,
BlenderbotEncoder,
BlenderbotForCausalLM,
)
def prepare_blenderbot_inputs_dict(
config,
input_ids,
decoder_input_ids,
attention_mask=None,
decoder_attention_mask=None,
):
if attention_mask is None:
attention_mask = input_ids.ne(config.pad_token_id)
if decoder_attention_mask is None:
decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
return {
"input_ids": input_ids,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": attention_mask,
}
class BlenderbotModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_labels=False,
vocab_size=99,
hidden_size=16,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=4,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=50,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
3,
)
input_ids[:, -1] = self.eos_token_id # Eos Token
decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
config = self.get_config()
inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids)
return config, inputs_dict
def get_config(self):
return BlenderbotConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_attention_heads=self.num_attention_heads,
decoder_attention_heads=self.num_attention_heads,
encoder_ffn_dim=self.intermediate_size,
decoder_ffn_dim=self.intermediate_size,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
)
def get_pipeline_config(self):
config = self.get_config()
config.max_position_embeddings = 100
config.vocab_size = 300
return config
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
return config, inputs_dict
def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
model = BlenderbotModel(config=config).get_decoder().to(torch_device).eval()
input_ids = inputs_dict["input_ids"]
attention_mask = inputs_dict["attention_mask"]
# first forward pass
outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
output, past_key_values = outputs.to_tuple()
# create hypothetical multiple next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_attn_mask = ids_tensor((self.batch_size, 3), 2)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
"last_hidden_state"
]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def check_encoder_decoder_model_standalone(self, config, inputs_dict):
model = BlenderbotModel(config=config).to(torch_device).eval()
outputs = model(**inputs_dict)
encoder_last_hidden_state = outputs.encoder_last_hidden_state
last_hidden_state = outputs.last_hidden_state
with tempfile.TemporaryDirectory() as tmpdirname:
encoder = model.get_encoder()
encoder.save_pretrained(tmpdirname)
encoder = BlenderbotEncoder.from_pretrained(tmpdirname).to(torch_device)
encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[
0
]
self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
with tempfile.TemporaryDirectory() as tmpdirname:
decoder = model.get_decoder()
decoder.save_pretrained(tmpdirname)
decoder = BlenderbotDecoder.from_pretrained(tmpdirname).to(torch_device)
last_hidden_state_2 = decoder(
input_ids=inputs_dict["decoder_input_ids"],
attention_mask=inputs_dict["decoder_attention_mask"],
encoder_hidden_states=encoder_last_hidden_state,
encoder_attention_mask=inputs_dict["attention_mask"],
)[0]
self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
@require_torch
class BlenderbotModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotModel, BlenderbotForConditionalGeneration) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BlenderbotModel,
"text-generation": BlenderbotForCausalLM,
}
if is_torch_available()
else {}
)
is_encoder_decoder = True
test_missing_keys = False
def setUp(self):
self.model_tester = BlenderbotModelTester(self)
self.config_tester = ConfigTester(self, config_class=BlenderbotConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_save_load_strict(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
self.assertEqual(info["missing_keys"], set())
def test_decoder_model_past_with_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
def test_encoder_decoder_model_standalone(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
@require_torch_fp16
def test_generate_fp16(self):
config, input_dict = self.model_tester.prepare_config_and_inputs()
input_ids = input_dict["input_ids"]
attention_mask = input_ids.ne(1).to(torch_device)
model = BlenderbotForConditionalGeneration(config).eval().to(torch_device)
model.half()
model.generate(input_ids, attention_mask=attention_mask)
model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
"""If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
if a is None and b is None:
return True
try:
if torch.allclose(a, b, atol=atol):
return True
raise Exception
except Exception:
pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item()
if a.numel() > 100:
msg = f"tensor values are {pct_different:.1%} percent different."
else:
msg = f"{a} != {b}"
if prefix:
msg = prefix + ": " + msg
raise AssertionError(msg)
@unittest.skipUnless(torch_device != "cpu", "3B test too slow on CPU.")
@require_torch
@require_sentencepiece
@require_tokenizers
class Blenderbot3BIntegrationTests(unittest.TestCase):
ckpt = "facebook/blenderbot-3B"
@cached_property
def tokenizer(self):
return BlenderbotTokenizer.from_pretrained(self.ckpt)
@slow
def test_generation_from_short_input_same_as_parlai_3B(self):
FASTER_GEN_KWARGS = {"num_beams": 1, "early_stopping": True, "min_length": 15, "max_length": 25}
TOK_DECODE_KW = {"skip_special_tokens": True, "clean_up_tokenization_spaces": True}
backend_empty_cache(torch_device)
model = BlenderbotForConditionalGeneration.from_pretrained(self.ckpt).half().to(torch_device)
src_text = ["Sam"]
model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device)
generated_utterances = model.generate(**model_inputs, **FASTER_GEN_KWARGS)
tgt_text = 'Sam is a great name. It means "sun" in Gaelic.'
generated_txt = self.tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW)
assert generated_txt[0].strip() == tgt_text
src_text = (
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel"
" like i'm going to throw up.\nand why is that?"
)
model_inputs = self.tokenizer([src_text], return_tensors="pt").to(torch_device)
generated_ids = model.generate(**model_inputs, **FASTER_GEN_KWARGS)[0]
reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW)
assert reply.strip() == "I think it's because we are so worried about what people think of us."
del model
class BlenderbotStandaloneDecoderModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
d_model=16,
decoder_seq_length=7,
is_training=True,
is_decoder=True,
use_attention_mask=True,
use_cache=False,
use_labels=True,
decoder_start_token_id=2,
decoder_ffn_dim=32,
decoder_layers=2,
encoder_attention_heads=4,
decoder_attention_heads=4,
max_position_embeddings=50,
is_encoder_decoder=False,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.d_model = d_model
self.hidden_size = d_model
self.num_hidden_layers = decoder_layers
self.decoder_layers = decoder_layers
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_attention_heads = encoder_attention_heads
self.decoder_attention_heads = decoder_attention_heads
self.num_attention_heads = decoder_attention_heads
self.eos_token_id = eos_token_id
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.decoder_start_token_id = decoder_start_token_id
self.use_cache = use_cache
self.max_position_embeddings = max_position_embeddings
self.is_encoder_decoder = is_encoder_decoder
self.scope = None
self.decoder_key_length = decoder_seq_length
self.base_model_out_len = 2
self.decoder_attention_idx = 1
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
lm_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
config = BlenderbotConfig(
vocab_size=self.vocab_size,
d_model=self.d_model,
decoder_layers=self.decoder_layers,
decoder_ffn_dim=self.decoder_ffn_dim,
encoder_attention_heads=self.encoder_attention_heads,
decoder_attention_heads=self.decoder_attention_heads,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
use_cache=self.use_cache,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
max_position_embeddings=self.max_position_embeddings,
is_encoder_decoder=self.is_encoder_decoder,
)
return (
config,
input_ids,
attention_mask,
lm_labels,
)
def create_and_check_decoder_model_past(
self,
config,
input_ids,
attention_mask,
lm_labels,
):
config.use_cache = True
model = BlenderbotDecoder(config=config).to(torch_device).eval()
# first forward pass
outputs = model(input_ids, use_cache=True)
outputs_use_cache_conf = model(input_ids)
outputs_no_past = model(input_ids, use_cache=False)
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
past_key_values = outputs["past_key_values"]
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
output_from_no_past = model(next_input_ids)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
def create_and_check_decoder_model_attention_mask_past(
self,
config,
input_ids,
attention_mask,
lm_labels,
):
model = BlenderbotDecoder(config=config).to(torch_device).eval()
# create attention mask
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
half_seq_length = input_ids.shape[-1] // 2
attn_mask[:, half_seq_length:] = 0
# first forward pass
past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]
# past_key_values = model(input_ids, use_cache=True)["past_key_values"]
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# change a random masked slice from input_ids
random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
# append to next input_ids and attn_mask
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
attn_mask = torch.cat(
[attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
dim=1,
)
# get two different outputs
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(
next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, use_cache=True
)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
attention_mask,
lm_labels,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class BlenderbotStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotDecoder, BlenderbotForCausalLM) if is_torch_available() else ()
is_encoder_decoder = False
def setUp(
self,
):
self.model_tester = BlenderbotStandaloneDecoderModelTester(self, is_training=False)
self.config_tester = ConfigTester(self, config_class=BlenderbotConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_decoder_model_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
def test_decoder_model_attn_mask_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)
@unittest.skip(reason="decoder cannot keep gradients")
def test_retain_grad_hidden_states_attentions(self):
return
@unittest.skip(reason="Decoder cannot keep gradients")
def test_flex_attention_with_grads():
return

View File

@@ -0,0 +1,23 @@
import unittest
from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer
from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class BlenderbotTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = ["facebook/blenderbot-3B"]
tokenizer_class = BlenderbotTokenizer
integration_expected_tokens = ['ĠThis', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠHello', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠHello', 'Ċ', 'Ċ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<s>', 'Ġ', 'Ċ', 'hi', '<s>', 'Ġthere', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed', ':', 'ĠHello', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing'] # fmt: skip
integration_expected_token_ids = [678, 315, 265, 1689, 3417, 240, 206, 48, 372, 3647, 302, 1207, 25, 1694, 19, 298, 381, 315, 284, 1095, 3952, 21, 206, 171, 250, 261, 170, 120, 127, 171, 256, 234, 171, 258, 261, 172, 116, 257, 170, 254, 115, 206, 47, 80, 228, 6950, 206, 47, 80, 228, 228, 6950, 206, 206, 228, 206, 228, 228, 206, 6950, 206, 1, 228, 206, 7417, 1, 505, 206, 2839, 3504, 7884, 636, 310, 3867, 2525, 621, 296, 33, 6950, 21, 206, 41, 329, 228, 1221, 298, 228, 164, 124, 257, 164, 124, 121, 228, 228, 228, 1221, 228, 228, 228, 164, 124, 250, 206, 47, 3110, 544, 366, 304, 929] # fmt: skip
expected_tokens_from_ids = ['ĠThis', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠHello', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠHello', 'Ċ', 'Ċ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<s>', 'Ġ', 'Ċ', 'hi', '<s>', 'Ġthere', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed', ':', 'ĠHello', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing'] # fmt: skip
integration_expected_decoded_text = " This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi Hello\nHi Hello\n\n \n \n Hello\n<s> \nhi<s> there\nThe following string should be properly encoded: Hello.\nBut ird and ปี ird ด\nHey how are you doing"
def test_pretokenized_inputs(self, *args, **kwargs):
# It's very difficult to mix/test pretokenization with byte-level tokenizers
# The issue is that when you have a sequence with leading spaces, splitting it
# with .split() loses the leading spaces, so the tokenization results differ
pass

View File

@@ -0,0 +1,553 @@
# Copyright 2021, The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch BlenderbotSmall model."""
import tempfile
import unittest
from functools import cached_property
from transformers import BlenderbotSmallConfig, is_torch_available
from transformers.testing_utils import (
require_torch,
require_torch_fp16,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel, BlenderbotSmallTokenizer
from transformers.models.blenderbot_small.modeling_blenderbot_small import (
BlenderbotSmallDecoder,
BlenderbotSmallEncoder,
BlenderbotSmallForCausalLM,
)
def prepare_blenderbot_small_inputs_dict(
config,
input_ids,
decoder_input_ids,
attention_mask=None,
decoder_attention_mask=None,
):
if attention_mask is None:
attention_mask = input_ids.ne(config.pad_token_id)
if decoder_attention_mask is None:
decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
return {
"input_ids": input_ids,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"decoder_attention_mask": attention_mask,
}
class BlenderbotSmallModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_labels=False,
vocab_size=99,
hidden_size=16,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=4,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=50,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
3,
)
input_ids[:, -1] = self.eos_token_id # Eos Token
decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
config = self.get_config()
inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids)
return config, inputs_dict
def get_config(self):
return BlenderbotSmallConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_attention_heads=self.num_attention_heads,
decoder_attention_heads=self.num_attention_heads,
encoder_ffn_dim=self.intermediate_size,
decoder_ffn_dim=self.intermediate_size,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
)
def prepare_config_and_inputs_for_common(self):
config, inputs_dict = self.prepare_config_and_inputs()
return config, inputs_dict
def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
model = BlenderbotSmallModel(config=config).get_decoder().to(torch_device).eval()
input_ids = inputs_dict["input_ids"]
attention_mask = inputs_dict["attention_mask"]
# first forward pass
outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
output, past_key_values = outputs.to_tuple()
# create hypothetical multiple next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_attn_mask = ids_tensor((self.batch_size, 3), 2)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
"last_hidden_state"
]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def check_encoder_decoder_model_standalone(self, config, inputs_dict):
model = BlenderbotSmallModel(config=config).to(torch_device).eval()
outputs = model(**inputs_dict)
encoder_last_hidden_state = outputs.encoder_last_hidden_state
last_hidden_state = outputs.last_hidden_state
with tempfile.TemporaryDirectory() as tmpdirname:
encoder = model.get_encoder()
encoder.save_pretrained(tmpdirname)
encoder = BlenderbotSmallEncoder.from_pretrained(tmpdirname).to(torch_device)
encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[
0
]
self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
with tempfile.TemporaryDirectory() as tmpdirname:
decoder = model.get_decoder()
decoder.save_pretrained(tmpdirname)
decoder = BlenderbotSmallDecoder.from_pretrained(tmpdirname).to(torch_device)
last_hidden_state_2 = decoder(
input_ids=inputs_dict["decoder_input_ids"],
attention_mask=inputs_dict["decoder_attention_mask"],
encoder_hidden_states=encoder_last_hidden_state,
encoder_attention_mask=inputs_dict["attention_mask"],
)[0]
self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
@require_torch
class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotSmallModel, BlenderbotSmallForConditionalGeneration) if is_torch_available() else ()
pipeline_model_mapping = (
{
"feature-extraction": BlenderbotSmallModel,
"text-generation": BlenderbotSmallForCausalLM,
}
if is_torch_available()
else {}
)
is_encoder_decoder = True
test_missing_keys = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self,
pipeline_test_case_name,
config_class,
model_architecture,
tokenizer_name,
image_processor_name,
feature_extractor_name,
processor_name,
):
return pipeline_test_case_name == "TextGenerationPipelineTests"
def setUp(self):
self.model_tester = BlenderbotSmallModelTester(self)
self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_save_load_strict(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
self.assertEqual(info["missing_keys"], set())
def test_decoder_model_past_with_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
def test_encoder_decoder_model_standalone(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
@require_torch_fp16
def test_generate_fp16(self):
config, input_dict = self.model_tester.prepare_config_and_inputs()
input_ids = input_dict["input_ids"]
attention_mask = input_ids.ne(1).to(torch_device)
model = BlenderbotSmallForConditionalGeneration(config).eval().to(torch_device)
model.half()
model.generate(input_ids, attention_mask=attention_mask)
model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
"""If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
if a is None and b is None:
return True
try:
if torch.allclose(a, b, atol=atol):
return True
raise Exception
except Exception:
pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item()
if a.numel() > 100:
msg = f"tensor values are {pct_different:.1%} percent different."
else:
msg = f"{a} != {b}"
if prefix:
msg = prefix + ": " + msg
raise AssertionError(msg)
@require_torch
class Blenderbot90MIntegrationTests(unittest.TestCase):
ckpt = "facebook/blenderbot-90M"
@cached_property
def model(self):
model = BlenderbotSmallForConditionalGeneration.from_pretrained(self.ckpt).to(torch_device)
if torch_device == "cuda":
model = model.half()
return model
@cached_property
def tokenizer(self):
return BlenderbotSmallTokenizer.from_pretrained(self.ckpt)
@slow
def test_90_generation_from_long_input(self):
src_text = [
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel"
" like i'm going to throw up.\nand why is that?"
]
model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device)
assert isinstance(self.tokenizer, BlenderbotSmallTokenizer)
generated_ids = self.model.generate(**model_inputs)[0]
reply = self.tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
assert reply in (
"i don't know. i just feel like i'm going to throw up. it's not fun.",
"i'm not sure. i just feel like i've been feeling like i have to be in a certain place",
)
@slow
def test_90_generation_from_short_input(self):
model_inputs = self.tokenizer(["sam"], return_tensors="pt").to(torch_device)
generated_utterances = self.model.generate(**model_inputs)
clean_txt = self.tokenizer.decode(
generated_utterances[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
)
assert clean_txt in (
"have you ever been to a sam club? it's a great club in the south.",
"have you ever heard of sam harris? he's an american singer, songwriter, and actor.",
)
class BlenderbotSmallStandaloneDecoderModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
d_model=16,
decoder_seq_length=7,
is_training=True,
is_decoder=True,
use_attention_mask=True,
use_cache=False,
use_labels=True,
decoder_start_token_id=2,
decoder_ffn_dim=32,
decoder_layers=2,
encoder_attention_heads=4,
decoder_attention_heads=4,
max_position_embeddings=50,
is_encoder_decoder=False,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.d_model = d_model
self.hidden_size = d_model
self.num_hidden_layers = decoder_layers
self.decoder_layers = decoder_layers
self.decoder_ffn_dim = decoder_ffn_dim
self.encoder_attention_heads = encoder_attention_heads
self.decoder_attention_heads = decoder_attention_heads
self.num_attention_heads = decoder_attention_heads
self.eos_token_id = eos_token_id
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.decoder_start_token_id = decoder_start_token_id
self.use_cache = use_cache
self.max_position_embeddings = max_position_embeddings
self.is_encoder_decoder = is_encoder_decoder
self.scope = None
self.decoder_key_length = decoder_seq_length
self.base_model_out_len = 2
self.decoder_attention_idx = 1
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
lm_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
config = BlenderbotSmallConfig(
vocab_size=self.vocab_size,
d_model=self.d_model,
decoder_layers=self.decoder_layers,
num_hidden_layers=self.decoder_layers,
decoder_ffn_dim=self.decoder_ffn_dim,
encoder_attention_heads=self.encoder_attention_heads,
decoder_attention_heads=self.decoder_attention_heads,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
use_cache=self.use_cache,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
max_position_embeddings=self.max_position_embeddings,
is_encoder_decoder=self.is_encoder_decoder,
)
return (
config,
input_ids,
attention_mask,
lm_labels,
)
def create_and_check_decoder_model_past(
self,
config,
input_ids,
attention_mask,
lm_labels,
):
config.use_cache = True
model = BlenderbotSmallDecoder(config=config).to(torch_device).eval()
# first forward pass
outputs = model(input_ids, use_cache=True)
outputs_use_cache_conf = model(input_ids)
outputs_no_past = model(input_ids, use_cache=False)
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
past_key_values = outputs["past_key_values"]
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
output_from_no_past = model(next_input_ids)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
def create_and_check_decoder_model_attention_mask_past(
self,
config,
input_ids,
attention_mask,
lm_labels,
):
model = BlenderbotSmallDecoder(config=config).to(torch_device).eval()
# create attention mask
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
half_seq_length = input_ids.shape[-1] // 2
attn_mask[:, half_seq_length:] = 0
# first forward pass
past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"]
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# change a random masked slice from input_ids
random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
# append to next input_ids and attn_mask
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
attn_mask = torch.cat(
[attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
dim=1,
)
# get two different outputs
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(
next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, use_cache=True
)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
attention_mask,
lm_labels,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class BlenderbotSmallStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (BlenderbotSmallDecoder, BlenderbotSmallForCausalLM) if is_torch_available() else ()
is_encoder_decoder = False
def setUp(
self,
):
self.model_tester = BlenderbotSmallStandaloneDecoderModelTester(self, is_training=False)
self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig)
def test_config(self):
self.config_tester.run_common_tests()
def test_decoder_model_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
def test_decoder_model_attn_mask_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs)
@unittest.skip(reason="decoder cannot keep gradients")
def test_retain_grad_hidden_states_attentions(self):
return
@unittest.skip(reason="Decoder cannot keep gradients")
def test_flex_attention_with_grads():
return

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the Blenderbot small tokenizer."""
import json
import os
import shutil
import tempfile
import unittest
from transformers.models.blenderbot_small.tokenization_blenderbot_small import (
VOCAB_FILES_NAMES,
BlenderbotSmallTokenizer,
)
from ...test_tokenization_common import TokenizerTesterMixin
class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "facebook/blenderbot_small-90M"
tokenizer_class = BlenderbotSmallTokenizer
test_rust_tokenizer = False
def test_full_blenderbot_small_tokenizer(self):
# Create temporary directory for vocab files
tmpdirname = tempfile.mkdtemp()
try:
vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "a p", "t e</w>", "ap t</w>", "a d", "ad apt</w>", "a c", "ac t</w>", ""]
special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
tokenizer = BlenderbotSmallTokenizer(vocab_file, merges_file, **special_tokens_map)
text = "adapt act apte"
bpe_tokens = ["adapt", "act", "ap@@", "te"]
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token]
input_bpe_tokens = [0, 1, 2, 3, 4, 5]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
finally:
shutil.rmtree(tmpdirname)
def test_special_tokens_small_tok(self):
tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
assert tok("sam").input_ids == [1384]
src_text = "I am a small frog."
encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
assert src_text != decoded # I wish it did!
assert decoded == "i am a small frog ."
def test_empty_word_small_tok(self):
tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
src_text = "I am a small frog ."
src_text_dot = "."
encoded = tok(src_text)["input_ids"]
encoded_dot = tok(src_text_dot)["input_ids"]
assert encoded[-1] == encoded_dot[0]

View File

View File

@@ -0,0 +1,122 @@
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_torch, require_vision
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
class BlipImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_normalize=True,
do_pad=False,
image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[0.26862954, 0.26130258, 0.27577711],
do_convert_rgb=True,
):
size = size if size is not None else {"height": 20, "width": 20}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_pad = do_pad
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
"do_pad": self.do_pad,
}
def expected_output_image_shape(self, images):
return self.num_channels, self.size["height"], self.size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class BlipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.image_processor_tester = BlipImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processing_classes.values():
image_processor = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processor, "do_resize"))
self.assertTrue(hasattr(image_processor, "size"))
self.assertTrue(hasattr(image_processor, "do_normalize"))
self.assertTrue(hasattr(image_processor, "image_mean"))
self.assertTrue(hasattr(image_processor, "image_std"))
self.assertTrue(hasattr(image_processor, "do_convert_rgb"))
@require_torch
@require_vision
class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.image_processor_tester = BlipImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processing_classes.values():
image_processor = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processor, "do_resize"))
self.assertTrue(hasattr(image_processor, "size"))
self.assertTrue(hasattr(image_processor, "do_normalize"))
self.assertTrue(hasattr(image_processor, "image_mean"))
self.assertTrue(hasattr(image_processor, "image_std"))
self.assertTrue(hasattr(image_processor, "do_convert_rgb"))

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,152 @@
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Blip model."""
import unittest
import numpy as np
from transformers import BlipTextConfig
from transformers.testing_utils import require_torch, slow, torch_device
from transformers.utils import is_torch_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
if is_torch_available():
import torch
from transformers import BlipTextModel
class BlipTextModelTester:
def __init__(
self,
parent,
batch_size=12,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
projection_dim=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=512,
initializer_range=0.02,
bos_token_id=0,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.scope = scope
self.bos_token_id = bos_token_id
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
if input_mask is not None:
batch_size, seq_length = input_mask.shape
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
for batch_idx, start_index in enumerate(rnd_start_indices):
input_mask[batch_idx, :start_index] = 1
input_mask[batch_idx, start_index:] = 0
config = self.get_config()
return config, input_ids, input_mask
def get_config(self):
return BlipTextConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
bos_token_id=self.bos_token_id,
)
def create_and_check_model(self, config, input_ids, input_mask):
model = BlipTextModel(config=config)
model.to(torch_device)
model.eval()
with torch.no_grad():
result = model(input_ids, attention_mask=input_mask)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, input_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class BlipTextModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipTextModel,) if is_torch_available() else ()
def setUp(self):
self.model_tester = BlipTextModelTester(self)
self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip
def test_training(self):
pass
@unittest.skip(reason="Blip does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@slow
def test_model_from_pretrained(self):
model_name = "Salesforce/blip-vqa-base"
model = BlipTextModel.from_pretrained(model_name)
self.assertIsNotNone(model)

View File

@@ -0,0 +1,33 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import BlipProcessor
@require_vision
class BlipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = BlipProcessor
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
return tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-BertModel")

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,42 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import Blip2Processor
@require_vision
class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = Blip2Processor
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
return tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-GPT2Model")
@classmethod
def _setup_image_processor(cls):
image_processor_class = cls._get_component_class_from_processor("image_processor")
return image_processor_class.from_pretrained("hf-internal-testing/tiny-random-ViTModel")
@staticmethod
def prepare_processor_dict():
return {"num_query_tokens": 1}

View File

View File

@@ -0,0 +1,621 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import math
import unittest
from transformers import is_torch_available
from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
from ...test_modeling_common import ids_tensor
if is_torch_available():
import torch
from transformers import (
AutoTokenizer,
BloomForCausalLM,
BloomModel,
)
@require_torch
class BloomModelTester(CausalLMModelTester):
if is_torch_available():
base_model_class = BloomModel
def create_and_check_bloom_model_past(self, config, *args):
input_ids, _, input_mask, _, _, _ = args
model = BloomModel(config=config)
model.to(torch_device)
model.eval()
# first forward pass
outputs = model(input_ids, attention_mask=torch.ones_like(input_ids), use_cache=True)
outputs_use_cache_conf = model(input_ids, attention_mask=torch.ones_like(input_ids))
outputs_no_past = model(input_ids, use_cache=False, attention_mask=torch.ones_like(input_ids))
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
past = outputs["past_key_values"]
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# append to next input_ids and token_type_ids
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
output_from_no_past = model(next_input_ids)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_bloom_model_attention_mask_past(self, config, *args):
input_ids, _, input_mask, _, _, _ = args
model = BloomModel(config=config)
model.to(torch_device)
model.eval()
# create attention mask
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
half_seq_length = self.seq_length // 2
attn_mask[:, half_seq_length:] = 0
# first forward pass
output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# change a random masked slice from input_ids
random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
# append to next input_ids and attn_mask
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
attn_mask = torch.cat(
[attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
dim=1,
)
# get two different outputs
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_bloom_model_past_large_inputs(self, config, *args):
input_ids, _, input_mask, _, _, _ = args
model = BloomModel(config=config)
model.to(torch_device)
model.eval()
# first forward pass
outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
output, past = outputs.to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
# append to next input_ids and token_type_ids
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past)[
"last_hidden_state"
]
self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_lm_head_model(self, config, *args):
input_ids, _, input_mask, _, _, _ = args
model = BloomForCausalLM(config)
model.to(torch_device)
model.eval()
result = model(input_ids, labels=input_ids)
self.parent.assertEqual(result.loss.shape, ())
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_bloom_weight_initialization(self, config, *args):
model = BloomModel(config)
model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
for key in model.state_dict():
if "c_proj" in key and "weight" in key:
self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
@require_torch
class BloomModelTest(CausalLMModelTest, unittest.TestCase):
model_tester_class = BloomModelTester
test_missing_keys = False
def test_bloom_model_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bloom_model_past(*config_and_inputs)
def test_bloom_model_att_mask_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bloom_model_attention_mask_past(*config_and_inputs)
def test_bloom_model_past_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bloom_model_past_large_inputs(*config_and_inputs)
def test_bloom_lm_head_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
def test_bloom_weight_initialization(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bloom_weight_initialization(*config_and_inputs)
@unittest.skip("Bloom needs a 2D attention for alibi")
def test_custom_4d_attention_mask(self):
pass
@require_torch
class BloomIntegrationTest(unittest.TestCase):
def setUp(self):
super().setUp()
self.path_bigscience_model = "bigscience/bigscience-small-testing"
@require_torch
def test_embeddings(self):
"""
The goal here is to compare the embeddings generated by the model trained
using Megatron-LM with the one from the transformers library, with a small GPT2-like model
to ensure that the conversion from Megatron-LM to transformers has been done successfully.
The script compares the logits of the embedding layer and the transformer layers.
WARNING: It is expected that these logits will not have exactly the same statistics when running
the code on CPU or GPU. For more info, please visit:
- https://github.com/pytorch/pytorch/issues/76052#issuecomment-1103193548
- https://discuss.pytorch.org/t/reproducibility-issue-between-intel-and-amd-cpus/144779/9
You need to install tokenizers following this readme:
- https://huggingface.co/bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles
Tokenizer used during training:
- https://huggingface.co/bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles
# TODO change the script (or just add skip) when building the env with tokenizers 0.12.0
"""
# The config in this checkpoint has `bfloat16` as `dtype` -> model in `bfloat16`
model = BloomForCausalLM.from_pretrained(self.path_bigscience_model, use_safetensors=False, dtype="auto")
model.eval()
EMBEDDINGS_DS_BEFORE_LN_BF_16_MEAN = {
3478: 0.0002307891845703125,
368: -0.000568389892578125,
109586: -0.0003910064697265625,
35433: -0.000194549560546875,
2: 0.0004138946533203125,
77: 0.000659942626953125,
132619: -0.00031280517578125,
2175: 0.000457763671875,
23714: 0.000263214111328125,
73173: -0.000286102294921875,
144252: 0.00052642822265625,
}
EMBEDDINGS_DS_BEFORE_LN_BF_16_MIN = {
3478: -0.00921630859375,
368: -0.010009765625,
109586: -0.01031494140625,
35433: -0.01177978515625,
2: -0.0074462890625,
77: -0.00848388671875,
132619: -0.009521484375,
2175: -0.0074462890625,
23714: -0.0145263671875,
73173: -0.007415771484375,
144252: -0.01007080078125,
}
EMBEDDINGS_DS_BEFORE_LN_BF_16_MAX = {
3478: 0.0128173828125,
368: 0.01214599609375,
109586: 0.0111083984375,
35433: 0.01019287109375,
2: 0.0157470703125,
77: 0.0174560546875,
132619: 0.0078125,
2175: 0.0113525390625,
23714: 0.0146484375,
73173: 0.01116943359375,
144252: 0.01141357421875,
}
EMBEDDINGS_DS_BEFORE_LN_BF_16_SUM = {"value": 0.08203125}
EMBEDDINGS_DS_BEFORE_LN_F_16_MEAN = {
132619: -0.00031256675720214844,
3478: 0.00023090839385986328,
368: -0.0005702972412109375,
109586: -0.00039124488830566406,
35433: -0.000194549560546875,
2: 0.0004146099090576172,
2175: 0.0004572868347167969,
23714: 0.00026416778564453125,
73173: -0.0002865791320800781,
144252: 0.0005254745483398438,
77: 0.0006618499755859375,
}
EMBEDDINGS_DS_BEFORE_LN_F_16_MIN = {
3478: -0.00921630859375,
368: -0.010009765625,
109586: -0.01031494140625,
35433: -0.01177978515625,
2: -0.0074462890625,
77: -0.00848388671875,
132619: -0.009521484375,
2175: -0.0074462890625,
23714: -0.0145263671875,
73173: -0.007415771484375,
144252: -0.01007080078125,
}
EMBEDDINGS_DS_BEFORE_LN_F_16_MAX = {
3478: 0.0128173828125,
368: 0.01214599609375,
109586: 0.0111083984375,
35433: 0.01019287109375,
2: 0.0157470703125,
77: 0.0174560546875,
132619: 0.0078125,
2175: 0.0113525390625,
23714: 0.0146484375,
73173: 0.01116943359375,
144252: 0.01141357421875,
}
EMBEDDINGS_DS_BEFORE_LN_F_16_SUM = {"value": 0.0821533203125}
EMBEDDINGS_DS_BEFORE_LN_F_32_MEAN = {
132619: -0.00031267106533050537,
3478: 0.00023087859153747559,
368: -0.0005701072514057159,
109586: -0.0003911703824996948,
35433: -0.0001944899559020996,
2: 0.0004146844148635864,
2175: 0.00045740045607089996,
23714: 0.0002641640603542328,
73173: -0.0002864748239517212,
144252: 0.0005256589502096176,
77: 0.0006617321632802486,
}
EMBEDDINGS_DS_BEFORE_LN_F_32_MIN = {
3478: -0.00921630859375,
368: -0.010009765625,
109586: -0.01031494140625,
35433: -0.01177978515625,
2: -0.0074462890625,
77: -0.00848388671875,
132619: -0.009521484375,
2175: -0.0074462890625,
23714: -0.0145263671875,
73173: -0.007415771484375,
144252: -0.01007080078125,
}
EMBEDDINGS_DS_BEFORE_LN_F_32_MAX = {
3478: 0.0128173828125,
368: 0.01214599609375,
109586: 0.0111083984375,
35433: 0.01019287109375,
2: 0.0157470703125,
77: 0.0174560546875,
132619: 0.0078125,
2175: 0.0113525390625,
23714: 0.0146484375,
73173: 0.01116943359375,
144252: 0.01141357421875,
}
EMBEDDINGS_DS_BEFORE_LN_F_32_SUM = {"value": 0.08217757940292358}
TEST_EMBEDDINGS = {
"torch.bfloat16": {
"mean": EMBEDDINGS_DS_BEFORE_LN_BF_16_MEAN,
"max": EMBEDDINGS_DS_BEFORE_LN_BF_16_MAX,
"min": EMBEDDINGS_DS_BEFORE_LN_BF_16_MIN,
"sum": EMBEDDINGS_DS_BEFORE_LN_BF_16_SUM,
},
"torch.float32": {
"mean": EMBEDDINGS_DS_BEFORE_LN_F_32_MEAN,
"max": EMBEDDINGS_DS_BEFORE_LN_F_32_MAX,
"min": EMBEDDINGS_DS_BEFORE_LN_F_32_MIN,
"sum": EMBEDDINGS_DS_BEFORE_LN_F_32_SUM,
},
"torch.float": {
"mean": EMBEDDINGS_DS_BEFORE_LN_F_32_MEAN,
"max": EMBEDDINGS_DS_BEFORE_LN_F_32_MAX,
"min": EMBEDDINGS_DS_BEFORE_LN_F_32_MIN,
"sum": EMBEDDINGS_DS_BEFORE_LN_F_32_SUM,
},
"torch.float16": {
"mean": EMBEDDINGS_DS_BEFORE_LN_F_16_MEAN,
"max": EMBEDDINGS_DS_BEFORE_LN_F_16_MAX,
"min": EMBEDDINGS_DS_BEFORE_LN_F_16_MIN,
"sum": EMBEDDINGS_DS_BEFORE_LN_F_16_SUM,
},
}
EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip
EMBEDDINGS_DS_AFTER_LN_MEAN = {
3478: -6.580352783203125e-05,
368: 0.0001316070556640625,
109586: -0.00030517578125,
35433: 4.00543212890625e-05,
2: -7.2479248046875e-05,
77: -8.96453857421875e-05,
132619: 0.0001583099365234375,
2175: 2.1219253540039062e-05,
23714: -0.000247955322265625,
73173: -0.00021839141845703125,
144252: -0.0001430511474609375,
}
EMBEDDINGS_DS_AFTER_LN_MIN = {
3478: -1.6953125,
368: -1.6875,
109586: -1.6875,
35433: -2.125,
2: -1.390625,
77: -1.5390625,
132619: -1.875,
2175: -1.4609375,
23714: -2.296875,
73173: -1.3515625,
144252: -1.78125,
}
EMBEDDINGS_DS_AFTER_LN_MAX = {
3478: 2.265625,
368: 2.28125,
109586: 1.953125,
35433: 1.90625,
2: 2.703125,
77: 2.828125,
132619: 1.65625,
2175: 2.015625,
23714: 2.234375,
73173: 2.171875,
144252: 1.828125,
}
EMBEDDINGS_DS_AFTER_LN = {
"mean": EMBEDDINGS_DS_AFTER_LN_MEAN,
"min": EMBEDDINGS_DS_AFTER_LN_MIN,
"max": EMBEDDINGS_DS_AFTER_LN_MAX,
}
tensor_ids = torch.LongTensor([EXAMPLE_IDS])
with torch.no_grad():
embeddings = model.transformer.word_embeddings(tensor_ids)
embeddings_ln = model.transformer.word_embeddings_layernorm(embeddings)
# first check the embeddings before LN
output_dict = {"min": {}, "max": {}, "mean": {}, "sum": {"value": embeddings.sum().item()}}
for i, idx in enumerate(EXAMPLE_IDS):
output_dict["min"][idx] = embeddings.min(dim=-1).values[0][i].item()
output_dict["max"][idx] = embeddings.max(dim=-1).values[0][i].item()
output_dict["mean"][idx] = embeddings.mean(dim=-1)[0][i].item()
for key in TEST_EMBEDDINGS[str(model.dtype)]:
self.assertDictEqual(TEST_EMBEDDINGS[str(model.dtype)][key], output_dict[key])
output_dict_norm = {"min": {}, "max": {}, "mean": {}}
for i, idx in enumerate(EXAMPLE_IDS):
output_dict_norm["min"][idx] = embeddings_ln.min(dim=-1).values[0][i].item()
output_dict_norm["max"][idx] = embeddings_ln.max(dim=-1).values[0][i].item()
output_dict_norm["mean"][idx] = embeddings_ln.mean(dim=-1)[0][i].item()
# This test does not pass when places = 2
for i, key in enumerate(output_dict_norm.keys()):
for j, idx in enumerate(output_dict[key].keys()):
self.assertAlmostEqual(EMBEDDINGS_DS_AFTER_LN[key][idx], output_dict_norm[key][idx], places=1)
@require_torch
def test_hidden_states_transformers(self):
model = BloomModel.from_pretrained(
self.path_bigscience_model, use_safetensors=False, use_cache=False, dtype="auto"
).to(torch_device)
model.eval()
EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip
MEAN_VALUE_LAST_LM = -4.3392181396484375e-05
MIN_MAX_DICT = {"min": -2.0625, "max": 2.75}
tensor_ids = torch.LongTensor([EXAMPLE_IDS])
with torch.no_grad():
logits = model(tensor_ids.to(torch_device))
output_dict = {
"min": logits.last_hidden_state.min(dim=-1).values[0][0].item(),
"max": logits.last_hidden_state.max(dim=-1).values[0][0].item(),
}
if torch_device == "cuda":
self.assertAlmostEqual(MEAN_VALUE_LAST_LM, logits.last_hidden_state.mean().item(), places=4)
else:
self.assertAlmostEqual(MEAN_VALUE_LAST_LM, logits.last_hidden_state.mean().item(), places=3)
self.assertDictEqual(MIN_MAX_DICT, output_dict)
@require_torch
def test_logits(self):
model = BloomForCausalLM.from_pretrained(
self.path_bigscience_model, use_safetensors=False, use_cache=False, dtype="auto"
).to(torch_device) # load in bf16
model.eval()
EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip
MEAN_LOGITS_GPU_1 = -1.823902130126953e-05
MEAN_LOGITS_GPU_2 = 1.9431114196777344e-05
tensor_ids = torch.LongTensor([EXAMPLE_IDS]).to(torch_device)
with torch.no_grad():
output = model(tensor_ids).logits
output_gpu_1, output_gpu_2 = output.split(125440, dim=-1)
self.assertAlmostEqual(output_gpu_1.mean().item(), MEAN_LOGITS_GPU_1, places=6)
self.assertAlmostEqual(output_gpu_2.mean().item(), MEAN_LOGITS_GPU_2, places=6)
@slow
@require_torch_accelerator
def test_simple_generation(self):
# This test is a bit flaky. For some GPU architectures, pytorch sets by default allow_fp16_reduced_precision_reduction = True and some operations
# do not give the same results under this configuration, especially torch.baddmm and torch.bmm. https://pytorch.org/docs/stable/notes/numerical_accuracy.html#fp16-on-mi200
# As we leave the default value (True) for allow_fp16_reduced_precision_reduction, the tests failed when running in half-precision with smaller models (560m)
# Please see: https://pytorch.org/docs/stable/notes/cuda.html#reduced-precision-reduction-in-fp16-gemms
# This discrepancy is observed only when using small models and seems to be stable for larger models.
# Our conclusion is that these operations are flaky for small inputs but seems to be stable for larger inputs (for the functions `baddmm` and `bmm`), and therefore for larger models.
# Here is a summary of an ablation study of our observations
# EXPECTED_OUTPUT = "I enjoy walking with my cute dog, and I love to watch the kids play. I am a very active person, and I am a very good listener. I am a very good person, and I am a very good person. I am a"
# 560m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS
# 560m + allow_fp16_reduced_precision_reduction = False + torch.baddm ==> PASS
# 560m + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS
# 560m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> FAIL
# EXPECTED_OUTPUT = "I enjoy walking with my cute dog, but I also enjoy hiking, biking, and swimming. I love to cook and bake. I love to cook and bake. I love to cook and bake. I love to cook and bake. I love"
# >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS (for use_cache=True and use_cache=False)
# >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> PASS
# >=1b1 + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS
path_560m = "bigscience/bloom-560m"
model = BloomForCausalLM.from_pretrained(
path_560m, use_safetensors=False, use_cache=True, revision="gs555750"
).to(torch_device)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path_560m)
input_sentence = "I enjoy walking with my cute dog"
# This output has been obtained using fp32 model on the huggingface DGX workstation - NVIDIA A100 GPU
EXPECTED_OUTPUT = (
"I enjoy walking with my cute dog, and I love to watch the kids play with the kids. I am a very "
"active person, and I enjoy working out, and I am a very active person. I am a very active person, and I"
)
input_ids = tokenizer.encode(input_sentence, return_tensors="pt")
greedy_output = model.generate(input_ids.to(torch_device), max_length=50)
self.assertEqual(tokenizer.decode(greedy_output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
@slow
@require_torch_accelerator
def test_batch_generation(self):
path_560m = "bigscience/bloom-560m"
model = BloomForCausalLM.from_pretrained(
path_560m, use_safetensors=False, use_cache=True, revision="gs555750"
).to(torch_device)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path_560m, padding_side="left")
input_sentence = ["I enjoy walking with my cute dog", "I enjoy walking with my cute dog"]
inputs = tokenizer(input_sentence, return_tensors="pt", padding=True)
input_ids = inputs["input_ids"].to(torch_device)
attention_mask = inputs["attention_mask"]
greedy_output = model.generate(input_ids, attention_mask=attention_mask, max_length=50, do_sample=False)
self.assertEqual(
tokenizer.decode(greedy_output[0], skip_special_tokens=True),
tokenizer.decode(greedy_output[1], skip_special_tokens=True),
)
@slow
@require_torch_accelerator
def test_batch_generation_padding(self):
path_560m = "bigscience/bloom-560m"
model = BloomForCausalLM.from_pretrained(
path_560m, use_safetensors=False, use_cache=True, revision="gs555750"
).to(torch_device)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path_560m, padding_side="left")
input_sentence = ["I enjoy walking with my cute dog", "Hello my name is"]
input_sentence_without_pad = "Hello my name is"
input_ids = tokenizer(input_sentence, return_tensors="pt", padding=True)
input_ids_without_pad = tokenizer.encode(input_sentence_without_pad, return_tensors="pt")
input_ids, attention_mask = input_ids["input_ids"].to(torch_device), input_ids["attention_mask"]
greedy_output = model.generate(input_ids, attention_mask=attention_mask, max_length=50, do_sample=False)
greedy_output_without_pad = model.generate(
input_ids_without_pad.to(torch_device), max_length=50, do_sample=False
)
# test token values
self.assertEqual(greedy_output[-1, 3:].tolist(), greedy_output_without_pad[0, :-3].tolist())
# test reconstructions
self.assertEqual(
tokenizer.decode(greedy_output[-1, 3:], skip_special_tokens=True),
tokenizer.decode(greedy_output_without_pad[0, :-3], skip_special_tokens=True),
)
@slow
@require_torch_accelerator
def test_batch_generated_text(self):
path_560m = "bigscience/bloom-560m"
model = BloomForCausalLM.from_pretrained(
path_560m, use_safetensors=False, use_cache=True, revision="gs555750"
).to(torch_device)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path_560m, padding_side="left")
input_sentences = [
"Hello what is",
"Running a quick test with the",
]
inputs = tokenizer(input_sentences, return_tensors="pt", padding=True, truncation=True)
generated_ids = model.generate(
inputs["input_ids"].to(torch_device), attention_mask=inputs["attention_mask"], max_length=20
)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
# these generations match those of the PyTorch model
EXPECTED_GENERATIONS = [
"Hello what is the best way to get the data from the server? I have tried",
"Running a quick test with the following command:\nsudo apt-get install python3\nsudo apt-get install python2",
]
self.assertListEqual(generated_text, EXPECTED_GENERATIONS)

View File

@@ -0,0 +1,134 @@
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from datasets import load_dataset
from transformers import TokenizersBackend
from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "bigscience/tokenizer"
slow_tokenizer_class = None
rust_tokenizer_class = TokenizersBackend
tokenizer_class = TokenizersBackend
test_slow_tokenizer = False
from_pretrained_vocab_key = "tokenizer_file"
special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
# Integration test data - expected outputs for the default input string
integration_expected_tokens = ['This', 'Ġis', 'Ġa', 'Ġtest', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '', 'çĶŁæ´»çļĦ', '羣', 'è°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', 'Ġ', 'ĠHello', 'Ċ', 'Hi', 'ĠĠ', 'ĠHello', 'ĊĊ', 'ĠĊ', 'ĠĠĊ', 'ĠHello', 'Ċ', '<s>', 'Ċ', 'hi', '<s>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed:', 'ĠHello', '', 'But', 'Ġir', 'd', 'Ġand', 'Ġà¸', 'Ľ', 'ี', 'ĠĠ', 'Ġir', 'd', 'ĠĠ', 'Ġà¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing'] # fmt: skip
integration_expected_token_ids = [6168, 632, 267, 4006, 189, 44, 1620, 34181, 361, 1575, 14739, 15, 530, 1119, 632, 31684, 311, 336, 71167, 4137, 1927, 239, 644, 189, 30050, 210, 86153, 189, 30050, 250, 86153, 603, 5306, 33249, 86153, 189, 1, 189, 2807, 1, 51596, 189, 2175, 6747, 5148, 3403, 722, 34975, 2681, 532, 29315, 86153, 336, 6475, 2881, 71, 530, 44381, 239, 105442, 250, 2881, 71, 250, 44381, 232, 189, 40440, 4143, 1306, 1152, 12491] # fmt: skip
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = TokenizersBackend.from_pretrained("bigscience/tokenizer")
tokenizer.save_pretrained(cls.tmpdirname)
cls.tokenizers_list = [(cls.rust_tokenizer_class, cls.tmpdirname, {})]
def test_encodings_from_sample_data(self):
"""
Assert that the created tokens are the same than the hard-coded ones
"""
tokenizer = self.get_tokenizer()
INPUT_SENTENCES = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]
TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
computed_tokens = tokenizer(INPUT_SENTENCES)["input_ids"]
self.assertListEqual(TARGET_TOKENS, computed_tokens)
decoded_tokens = tokenizer.decode(computed_tokens)
self.assertListEqual(decoded_tokens, INPUT_SENTENCES)
def test_padding(self, max_length=6):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.get_tokenizer(pretrained_name, **kwargs)
# tokenizer_r.pad_token = None # Hotfixing padding = None
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input 1", "This is a simple input 2"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
]
# Simple input tests
try:
tokenizer_r.encode(s, max_length=max_length)
tokenizer_r(s, max_length=max_length)
tokenizer_r(s2, max_length=max_length)
tokenizer_r.encode(p, max_length=max_length)
tokenizer_r(p2, max_length=max_length)
except ValueError:
self.fail("Bloom Tokenizer should be able to deal with padding")
tokenizer_r.pad_token = None # Hotfixing padding = None
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(ValueError, tokenizer_r, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(
ValueError,
tokenizer_r,
s2,
max_length=max_length,
padding="max_length",
)
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(ValueError, tokenizer_r, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(
ValueError,
tokenizer_r,
p2,
max_length=max_length,
padding="max_length",
)
def test_encodings_from_xnli_dataset(self):
"""
Tests the tokenizer downloaded from here:
- https://huggingface.co/bigscience/tokenizer/
"""
tokenizer = self.get_tokenizer()
ds = load_dataset("facebook/xnli", "all_languages", split="test", streaming=True)
sample_data = next(iter(ds))["premise"] # pick up one data
input_text = list(sample_data.values())
output_tokens = list(map(tokenizer.encode, input_text))
predicted_text = [tokenizer.decode(x, clean_up_tokenization_spaces=False) for x in output_tokens]
self.assertListEqual(predicted_text, input_text)
@slow
def test_save_and_load_tokenizer(self):
return super().test_save_and_load_tokenizer()

View File

View File

@@ -0,0 +1,423 @@
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Blt model."""
import unittest
import pytest
from parameterized import parameterized
from transformers import AutoTokenizer, is_torch_available
from transformers.testing_utils import (
Expectations,
cleanup,
require_torch,
require_torch_accelerator,
require_torch_bf16,
slow,
torch_device,
)
from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester
from ...test_modeling_common import (
TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
_test_eager_matches_sdpa_inference,
)
if is_torch_available():
import torch
from transformers import BltConfig, BltForCausalLM, BltModel
class BltModelTester(CausalLMModelTester):
if is_torch_available():
base_model_class = BltModel
def __init__(
self,
parent,
ignore_index=-100,
seq_length=7,
is_training=True,
):
super().__init__(parent)
self.parent = parent
self.ignore_index = ignore_index
self.seq_length = seq_length
self.is_training = is_training
self.batch_size = 3
# Common parameters for all configs
self.hidden_size = 16
self.num_hidden_layers = 1
self.num_attention_heads = 2
self.num_key_value_heads = 2
self.intermediate_size = 32
self.hidden_act = "silu"
self.max_position_embeddings = 32
self.vocab_size = 32
self.rope_theta = 500000.0
self.rope_parameters = {"rope_type": "default"}
self.rms_norm_eps = 1e-5
self.dropout = 0.0
self.encoder_hash_byte_group_size = [2, 3]
self.encoder_hash_byte_group_vocab = 64
self.encoder_hash_byte_group_nb_functions = 1
# Common parameters for all configs
self.patcher_config = {
"hidden_size": self.hidden_size,
"num_hidden_layers": self.num_hidden_layers,
"num_attention_heads": self.num_attention_heads,
"num_key_value_heads": self.num_key_value_heads,
"intermediate_size": self.intermediate_size,
"max_position_embeddings": self.max_position_embeddings,
"rope_theta": self.rope_theta,
"rope_parameters": self.rope_parameters,
"hidden_act": self.hidden_act,
"rms_norm_eps": self.rms_norm_eps,
"dropout": self.dropout,
}
self.encoder_config = {
"hidden_size": self.hidden_size,
"num_hidden_layers": self.num_hidden_layers,
"num_attention_heads": self.num_attention_heads,
"num_key_value_heads": self.num_key_value_heads,
"intermediate_size": self.intermediate_size,
"max_position_embeddings": self.max_position_embeddings,
"rope_theta": self.rope_theta,
"rope_parameters": self.rope_parameters,
"hidden_act": self.hidden_act,
"rms_norm_eps": self.rms_norm_eps,
"dropout": self.dropout,
}
self.decoder_config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"hidden_size_global": self.hidden_size * 2, # Must match global transformer output size
"num_hidden_layers": self.num_hidden_layers,
"num_attention_heads": self.num_attention_heads,
"num_key_value_heads": self.num_key_value_heads,
"intermediate_size": self.intermediate_size,
"max_position_embeddings": self.max_position_embeddings,
"rope_theta": self.rope_theta,
"rope_parameters": self.rope_parameters,
"hidden_act": self.hidden_act,
"rms_norm_eps": self.rms_norm_eps,
"dropout": self.dropout,
}
self.global_config = {
"hidden_size": self.hidden_size * 2, # Double the hidden size for global transformer
"num_hidden_layers": self.num_hidden_layers,
"num_attention_heads": self.num_attention_heads,
"num_key_value_heads": self.num_key_value_heads,
"intermediate_size": self.intermediate_size,
"max_position_embeddings": self.max_position_embeddings,
"rope_theta": self.rope_theta,
"rope_parameters": self.rope_parameters,
"hidden_act": self.hidden_act,
"rms_norm_eps": self.rms_norm_eps,
"dropout": self.dropout,
}
self.num_hidden_layers = self.encoder_config["num_hidden_layers"]
def get_config(self):
config = BltConfig(
vocab_size=self.vocab_size,
max_position_embeddings=self.max_position_embeddings,
patch_in_forward=False, # Disable patching for tests
patch_size=4,
patching_mode="entropy",
patching_threshold=1.335442066192627,
patching_batch_size=1,
max_patch_length=None,
cross_attn_k=2,
encoder_hash_byte_group_size=self.encoder_hash_byte_group_size,
encoder_hash_byte_group_vocab=self.encoder_hash_byte_group_vocab,
encoder_hash_byte_group_nb_functions=self.encoder_hash_byte_group_nb_functions,
patcher_config=self.patcher_config,
encoder_config=self.encoder_config,
decoder_config=self.decoder_config,
global_config=self.global_config,
rope_parameters=self.rope_parameters,
tie_word_embeddings=False,
)
config.num_attention_heads = config.decoder_config.num_attention_heads
config.num_hidden_layers = config.encoder_config.num_hidden_layers
config.hidden_size = config.decoder_config.hidden_size
return config
@require_torch
class BltModelTest(CausalLMModelTest, unittest.TestCase):
model_tester_class = BltModelTester
# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
model_split_percents = [0.5, 0.7, 0.8]
# used in `test_torch_compile_for_training`
_torch_compile_train_cls = BltForCausalLM if is_torch_available() else None
@pytest.mark.generate
@parameterized.expand([("greedy", 1), ("beam search", 2)])
@unittest.skip(
"Blt requires real token IDs for its hash-based embedding computation, making inputs_embeds generation incompatible with identical outputs"
)
def test_generate_from_inputs_embeds(self, _, num_beams):
pass
@pytest.mark.generate
def test_generate_with_quant_cache(self):
self.skipTest("BLT uses EncoderDecoderCache internally and does not support quantized cache")
@pytest.mark.generate
@unittest.skip(
"Blt requires real token IDs for its hash-based embedding computation, making inputs_embeds generation incompatible with identical outputs"
)
def test_inputs_embeds_matches_input_ids(self):
pass
@parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
def test_eager_matches_sdpa_inference(
self,
name,
torch_dtype,
padding_side,
use_attention_mask,
output_attentions,
enable_kernels,
):
"We need to relax a bit the `atols` for fp32 here due to the altup projections"
atols = {
("cpu", False, torch.float32): 2e-2, # this was relaxed
("cpu", False, torch.float16): 5e-3,
("cpu", False, torch.bfloat16): 1e-2,
("cpu", True, torch.float32): 2e-2, # this was relaxed
("cpu", True, torch.float16): 5e-3,
("cpu", True, torch.bfloat16): 1e-2,
("cuda", False, torch.float32): 2e-2, # this was relaxed
("cuda", False, torch.bfloat16): 1e-2,
("cuda", False, torch.float16): 5e-3,
("cuda", True, torch.float32): 2e-2, # this was relaxed
("cuda", True, torch.bfloat16): 1e-2,
("cuda", True, torch.float16): 5e-3,
}
_test_eager_matches_sdpa_inference(
self, name, torch_dtype, padding_side, use_attention_mask, output_attentions, enable_kernels, atols=atols
)
@require_torch_accelerator
@slow
def test_sdpa_can_dispatch_on_flash(self):
self.skipTest("BLT always has an attention_mask input")
@require_torch_accelerator
class BltIntegrationTest(unittest.TestCase):
def setup(self):
cleanup(torch_device, gc_collect=True)
def tearDown(self):
# TODO (joao): automatic compilation, i.e. compilation when `cache_implementation="static"` is used, leaves
# some memory allocated in the cache, which means some object is not being released properly. This causes some
# unoptimal memory usage, e.g. after certain tests a 7B model in FP16 no longer fits in a 24GB GPU.
# Investigate the root cause.
cleanup(torch_device, gc_collect=True)
@slow
def test_model(self):
NUM_TOKENS_TO_GENERATE = 200
EXPECTED_TEXT = "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s"
prompt = "my name is"
model = BltForCausalLM.from_pretrained("itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
)
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXT)
@slow
def test_model_logits(self):
# fmt: off
EXPECTED_OUTPUT = Expectations(
{
(None, None): torch.tensor(
[
[-10.5000, -10.6875, -6.2500, -10.5625, -10.3125, -9.1875, -8.5000, -8.5625, -9.1875, -9.6250, -9.3750, -8.5000, -9.1250, -3.3906, 2.9688, -10.3125, -6.4688, -6.0312, -9.7500, -9.1875, -8.8125, -9.8750, -9.8125, -9.5000, -9.8125, -9.5000, -9.0625, -9.8125, -9.5000, -9.3750],
[-13.2500, -13.1250, -5.6875, -13.1875, -13.3750, -8.6875, -6.9688, -6.9375, -10.0625, -10.3125, -9.8125, -7.7188, -8.8125, -5.2188, -3.5000, -12.4375, -9.0625, -6.6250, -10.3125, -9.1875, -10.6250, -11.4375, -11.1250, -10.8750, -10.5000, -10.8750, -11.0000, -11.3125, -10.5000, -9.8750],
]
),
("xpu", None): torch.tensor(
[
[-10.4375, -10.6875, -6.1875, -10.5000, -10.3125, -9.1250, -8.4375, -8.6250, -9.1875, -9.5625, -9.3125, -8.4375, -9.0625, -3.4375, 2.9531, -10.2500, -6.4062, -6.0000, -9.6875, -9.1875, -8.8125, -9.8125, -9.7500, -9.4375, -9.7500, -9.4375, -9.0000, -9.8125, -9.4375, -9.3125],
[-13.3125, -13.2500, -5.5938, -13.3125, -13.5000, -8.7500, -7.0625, -7.0312, -10.1875, -10.3750, -9.9375, -7.8438, -8.8750, -5.3438, -3.5938, -12.5625, -9.2500, -6.8125, -10.3750, -9.3125, -10.6875, -11.5625, -11.3125, -11.0000, -10.6250, -10.9375, -11.0625, -11.3750, -10.5625, -10.0000],
]
),
}
).get_expectation()
EXPECTED_OUTPUT = EXPECTED_OUTPUT.to(torch_device)
# fmt: on
input_ids = [1, 42, 21, 12, 43, 23, 1, 4]
model = BltForCausalLM.from_pretrained("itazap/blt-1b-hf", attn_implementation="sdpa", device_map="auto")
with torch.no_grad():
output = model(torch.tensor([input_ids]).to(torch_device))[0]
torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30].to(torch_device), rtol=1e-3, atol=1e-3)
@slow
@require_torch_bf16
def test_model_bf16(self):
"""Test Blt model with bfloat16 precision."""
NUM_TOKENS_TO_GENERATE = 200
# fmt: off
EXPECTED_TEXT = Expectations(
{
(None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
}
)
# fmt: on
prompt = "my name is"
model = BltForCausalLM.from_pretrained(
"itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
)
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())
@slow
@require_torch_bf16
def test_model_logits_bf16(self):
"""Test Blt model logits with bfloat16 precision."""
# fmt: off
EXPECTED_OUTPUT = Expectations(
{
(None, None): torch.tensor(
[
[-10.5000, -10.6875, -6.2500, -10.5625, -10.3125, -9.1875, -8.5000, -8.5625, -9.1875, -9.6250, -9.3750, -8.5000, -9.1250, -3.3906, 2.9688, -10.3125, -6.4688, -6.0312, -9.7500, -9.1875, -8.8125, -9.8750, -9.8125, -9.5000, -9.8125, -9.5000, -9.0625, -9.8125, -9.5000, -9.3750],
[-13.2500, -13.1250, -5.6875, -13.1875, -13.3750, -8.6875, -6.9688, -6.9375, -10.0625, -10.3125, -9.8125, -7.7188, -8.8125, -5.2188, -3.5000, -12.4375, -9.0625, -6.6250, -10.3125, -9.1875, -10.6250, -11.4375, -11.1250, -10.8750, -10.5000, -10.8750, -11.0000, -11.3125, -10.5000, -9.8750],
]
),
("xpu", None): torch.tensor(
[
[-10.4375, -10.6875, -6.1875, -10.5000, -10.3125, -9.1250, -8.4375, -8.6250, -9.1875, -9.5625, -9.3125, -8.4375, -9.0625, -3.4375, 2.9531, -10.2500, -6.4062, -6.0000, -9.6875, -9.1875, -8.8125, -9.8125, -9.7500, -9.4375, -9.7500, -9.4375, -9.0000, -9.8125, -9.4375, -9.3125],
[-13.3125, -13.2500, -5.5938, -13.3125, -13.5000, -8.7500, -7.0625, -7.0312, -10.1875, -10.3750, -9.9375, -7.8438, -8.8750, -5.3438, -3.5938, -12.5625, -9.2500, -6.8125, -10.3750, -9.3125, -10.6875, -11.5625, -11.3125, -11.0000, -10.6250, -10.9375, -11.0625, -11.3750, -10.5625, -10.0000],
]
),
}
).get_expectation()
EXPECTED_OUTPUT = EXPECTED_OUTPUT.to(torch_device)
# fmt: on
input_ids = [1, 42, 21, 12, 43, 23, 1, 4]
model = BltForCausalLM.from_pretrained(
"itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.bfloat16
)
with torch.no_grad():
output = model(torch.tensor([input_ids]).to(torch_device))[0]
torch.testing.assert_close(EXPECTED_OUTPUT, output[0, :2, :30].to(torch_device), rtol=1e-3, atol=1e-3)
@slow
def test_model_eager(self):
"""Test Blt model with bfloat16 precision using eager attention implementation."""
NUM_TOKENS_TO_GENERATE = 200
# fmt: off
EXPECTED_TEXT = Expectations(
{
(None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
("xpu", None): "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m",
}
)
# fmt: on
prompt = "my name is"
model = BltForCausalLM.from_pretrained("itazap/blt-1b-hf", device_map="auto", attn_implementation="eager")
tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
)
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())
@slow
@require_torch_bf16
def test_model_bf16_static_cache(self):
"""Test Blt model with bfloat16 precision and static cache."""
NUM_TOKENS_TO_GENERATE = 200
# fmt: off
EXPECTED_TEXT = Expectations(
{
(None, None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s",
}
)
# fmt: on
prompt = "my name is"
model = BltForCausalLM.from_pretrained(
"itazap/blt-1b-hf", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.bfloat16
)
model.generation_config.cache_implementation = "static"
tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(
**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, use_cache=False
)
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(output_text, EXPECTED_TEXT.get_expectation())

View File

View File

@@ -0,0 +1,155 @@
# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.image_utils import load_image
from transformers.testing_utils import require_torch, require_vision
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
from ...test_processing_common import url_to_local_path
class BridgeTowerImageProcessingTester:
def __init__(
self,
parent,
do_resize: bool = True,
size: dict[str, int] | None = None,
size_divisor: int = 32,
do_rescale: bool = True,
rescale_factor: int | float = 1 / 255,
do_normalize: bool = True,
do_center_crop: bool = True,
image_mean: float | list[float] | None = [0.48145466, 0.4578275, 0.40821073],
image_std: float | list[float] | None = [0.26862954, 0.26130258, 0.27577711],
do_pad: bool = True,
batch_size=7,
min_resolution=30,
max_resolution=400,
num_channels=3,
):
self.parent = parent
self.do_resize = do_resize
self.size = size if size is not None else {"shortest_edge": 288}
self.size_divisor = size_divisor
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_center_crop = do_center_crop
self.image_mean = image_mean
self.image_std = image_std
self.do_pad = do_pad
self.batch_size = batch_size
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
def prepare_image_processor_dict(self):
return {
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_normalize": self.do_normalize,
"do_resize": self.do_resize,
"size": self.size,
"size_divisor": self.size_divisor,
}
def get_expected_values(self, image_inputs, batched=False):
return self.size["shortest_edge"], self.size["shortest_edge"]
def expected_output_image_shape(self, images):
height, width = self.get_expected_values(images, batched=True)
return self.num_channels, height, width
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.image_processor_tester = BridgeTowerImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "size_divisor"))
@require_vision
@require_torch
def test_backends_equivalence(self):
if len(self.image_processing_classes) < 2:
self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
dummy_image = load_image(url_to_local_path("http://images.cocodataset.org/val2017/000000039769.jpg"))
encodings = {}
for backend_name, image_processing_class in self.image_processing_classes.items():
image_processor = image_processing_class(**self.image_processor_dict)
encodings[backend_name] = image_processor(dummy_image, return_tensors="pt")
backend_names = list(encodings.keys())
reference_backend = backend_names[0]
reference_pixel_values = encodings[reference_backend].pixel_values
reference_pixel_mask = encodings[reference_backend].pixel_mask.float()
for backend_name in backend_names[1:]:
self._assert_tensors_equivalence(reference_pixel_values, encodings[backend_name].pixel_values)
self._assert_tensors_equivalence(reference_pixel_mask, encodings[backend_name].pixel_mask.float())
@require_vision
@require_torch
def test_slow_fast_equivalence_batched(self):
if len(self.image_processing_classes) < 2:
self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
encodings = {}
for backend_name, image_processing_class in self.image_processing_classes.items():
image_processor = image_processing_class(**self.image_processor_dict)
encodings[backend_name] = image_processor(dummy_images, return_tensors="pt")
backend_names = list(encodings.keys())
reference_backend = backend_names[0]
reference_pixel_values = encodings[reference_backend].pixel_values
reference_pixel_mask = encodings[reference_backend].pixel_mask.float()
for backend_name in backend_names[1:]:
self._assert_tensors_equivalence(reference_pixel_values, encodings[backend_name].pixel_values)
self._assert_tensors_equivalence(reference_pixel_mask, encodings[backend_name].pixel_mask.float())

View File

@@ -0,0 +1,629 @@
# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch BridgeTower model."""
import unittest
from functools import cached_property
from transformers import (
BridgeTowerConfig,
BridgeTowerTextConfig,
BridgeTowerVisionConfig,
is_torch_available,
is_vision_available,
)
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
BridgeTowerForContrastiveLearning,
BridgeTowerForImageAndTextRetrieval,
BridgeTowerForMaskedLM,
BridgeTowerModel,
)
if is_vision_available():
from PIL import Image
from transformers import BridgeTowerProcessor
class BridgeTowerTextModelTester:
def __init__(
self,
parent,
hidden_act="gelu",
hidden_size=64,
initializer_factor=1e-10,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=2,
intermediate_size=128,
tie_word_embeddings=False,
output_hidden_states=False,
):
self.parent = parent
self.hidden_act = hidden_act
self.hidden_size = hidden_size
self.initializer_factor = initializer_factor
self.layer_norm_eps = layer_norm_eps
self.num_attention_heads = num_attention_heads
self.num_hidden_layers = num_hidden_layers
self.intermediate_size = intermediate_size
self.tie_word_embeddings = tie_word_embeddings
self.vocab_size = 99
self.seq_length = 4
self.batch_size = 1
self.is_training = False
self.output_hidden_states = output_hidden_states
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
attention_mask = random_attention_mask([self.batch_size, self.seq_length])
config = self.get_config()
return config, input_ids, attention_mask
def get_config(self):
return BridgeTowerTextConfig(
hidden_act=self.hidden_act,
hidden_size=self.hidden_size,
initializer_factor=self.initializer_factor,
layer_norm_eps=self.layer_norm_eps,
num_attention_heads=self.num_attention_heads,
num_hidden_layers=self.num_hidden_layers,
intermediate_size=self.intermediate_size,
tie_word_embeddings=self.tie_word_embeddings,
output_hidden_states=self.output_hidden_states,
vocab_size=self.vocab_size,
)
class BridgeTowerImageModelTester:
def __init__(
self,
parent,
hidden_size=64,
initializer_factor=1e-10,
layer_norm_eps=1e-05,
num_hidden_layers=2,
init_layernorm_from_vision_encoder=False,
output_hidden_states=False,
image_size=64,
):
self.parent = parent
self.hidden_size = hidden_size
self.initializer_factor = initializer_factor
self.layer_norm_eps = layer_norm_eps
self.num_hidden_layers = num_hidden_layers
self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder
self.num_channels = 3
self.num_image_features = 17
self.batch_size = 1
self.image_size = image_size
self.is_training = False
self.output_hidden_states = output_hidden_states
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
pixel_mask = random_attention_mask([self.batch_size, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values, pixel_mask
def get_config(self):
return BridgeTowerVisionConfig(
hidden_size=self.hidden_size,
initializer_factor=self.initializer_factor,
layer_norm_eps=self.layer_norm_eps,
num_hidden_layers=self.num_hidden_layers,
init_layernorm_from_vision_encoder=self.init_layernorm_from_vision_encoder,
num_channels=self.num_channels,
num_image_features=self.num_image_features,
batch_size=self.batch_size,
image_size=self.image_size,
is_training=self.is_training,
output_hidden_states=self.output_hidden_states,
)
class BridgeTowerModelTester:
def __init__(
self,
parent,
text_kwargs=None,
vision_kwargs=None,
share_cross_modal_transformer_layers=True,
share_link_tower_layers=False,
link_tower_type="add",
init_layernorm_from_vision_encoder=False,
contrastive_hidden_size=512,
logit_scale_init_value=2.6592,
hidden_size=64,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=128,
):
if text_kwargs is None:
text_kwargs = {}
if vision_kwargs is None:
vision_kwargs = {}
self.parent = parent
self.text_model_tester = BridgeTowerTextModelTester(parent, **text_kwargs)
self.vision_model_tester = BridgeTowerImageModelTester(parent, **vision_kwargs)
self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers
self.share_link_tower_layers = share_link_tower_layers
self.link_tower_type = link_tower_type
self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder
self.contrastive_hidden_size = contrastive_hidden_size
self.logit_scale_init_value = logit_scale_init_value
self.batch_size = 1
self.expected_num_hidden_layers = 8
self.is_training = False
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values, pixel_mask = self.vision_model_tester.prepare_config_and_inputs()
config = self.get_config()
return (config, input_ids, attention_mask, pixel_values, pixel_mask)
def get_config(self):
return BridgeTowerConfig(
text_config=self.text_model_tester.get_config().to_dict(),
vision_config=self.vision_model_tester.get_config().to_dict(),
share_cross_modal_transformer_layers=self.share_cross_modal_transformer_layers,
share_link_tower_layers=self.share_link_tower_layers,
link_tower_type=self.link_tower_type,
init_layernorm_from_vision_encoder=self.init_layernorm_from_vision_encoder,
contrastive_hidden_size=self.contrastive_hidden_size,
logit_scale_init_value=self.logit_scale_init_value,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
)
def create_and_check_model(
self,
config,
input_ids,
attention_mask,
pixel_values,
pixel_mask,
):
model = BridgeTowerModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask)
result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
self.parent.assertEqual(
result["text_features"].shape,
(self.batch_size, self.text_model_tester.seq_length, self.text_model_tester.hidden_size),
)
self.parent.assertEqual(
result["image_features"].shape,
(self.batch_size, self.vision_model_tester.num_image_features, self.vision_model_tester.hidden_size),
)
self.parent.assertEqual(
result["pooler_output"].shape,
(self.batch_size, self.text_model_tester.hidden_size + self.vision_model_tester.hidden_size),
)
def create_and_check_for_image_and_text_retrieval(
self,
config,
input_ids,
attention_mask,
pixel_values,
pixel_mask,
):
bridgetower_itm_output_last_dimension = 2
model = BridgeTowerForImageAndTextRetrieval(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask)
result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, bridgetower_itm_output_last_dimension))
def create_and_check_for_masked_language_modeling(
self,
config,
input_ids,
attention_mask,
pixel_values,
pixel_mask,
):
model = BridgeTowerForMaskedLM(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask)
result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
self.parent.assertEqual(
result.logits.shape,
(self.batch_size, self.text_model_tester.seq_length, self.text_model_tester.vocab_size),
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, attention_mask, pixel_values, pixel_mask) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"pixel_values": pixel_values,
"pixel_mask": pixel_mask,
}
return config, inputs_dict
@require_torch
class BridgeTowerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
BridgeTowerModel,
BridgeTowerForImageAndTextRetrieval,
BridgeTowerForMaskedLM,
BridgeTowerForContrastiveLearning,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"feature-extraction": BridgeTowerModel} if is_torch_available() else {}
is_training = False
test_torch_exportable = False
test_resize_embeddings = False
has_attentions = False
@unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
def test_cpu_offload(self):
pass
@unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
def test_disk_offload(self):
pass
@unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
def test_model_parallelism(self):
pass
# function to extract meaningful tensor from output per different model_class
def extract_output(self, outputs, model_class):
return outputs["pooler_output"] if model_class == "BridgeTowerModel" else outputs["logits"]
def setUp(self):
self.model_tester = BridgeTowerModelTester(self)
self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=32, vocab_size=99)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_image_and_text_retrieval(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_and_text_retrieval(*config_and_inputs)
def test_for_masked_language_modeling(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_language_modeling(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model_name = "BridgeTower/bridgetower-base"
model = BridgeTowerModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# Override this as `hidden states output` is different for BridgeTower
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states_text, hidden_states_vision, hidden_states_cross = (
outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
)
expected_num_layers = self.model_tester.expected_num_hidden_layers
self.assertEqual(
sum((len(hidden_states_text), len(hidden_states_vision), len(hidden_states_cross))),
expected_num_layers,
)
seq_length = self.model_tester.text_model_tester.seq_length
num_image_features = self.model_tester.vision_model_tester.num_image_features
self.assertListEqual(
list(hidden_states_text[0].shape[-2:]),
[seq_length, self.model_tester.text_model_tester.hidden_size],
)
self.assertListEqual(
list(hidden_states_vision[0].shape),
[num_image_features, 1, self.model_tester.vision_model_tester.hidden_size],
)
self.assertListEqual(
list(hidden_states_cross[0][0].shape[-2:]),
[seq_length, self.model_tester.text_model_tester.hidden_size],
)
self.assertListEqual(
list(hidden_states_cross[0][1].shape[-2:]),
[num_image_features, self.model_tester.vision_model_tester.hidden_size],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
# Override as `hidden states output` is different for BridgeTower
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True
config.output_attentions = self.has_attentions
# no need to test all models as different heads yield the same functionality
model_class = self.all_model_classes[0]
model = model_class(config)
model.to(torch_device)
inputs = self._prepare_for_class(inputs_dict, model_class)
outputs = model(**inputs)
output = outputs[0]
# Encoder-/Decoder-only models
hidden_states = outputs.hidden_states[0][0]
hidden_states.retain_grad()
if self.has_attentions:
attentions = outputs.attentions[0][0]
attentions.retain_grad()
output.flatten()[0].backward(retain_graph=True)
self.assertIsNotNone(hidden_states.grad)
if self.has_attentions:
self.assertIsNotNone(attentions.grad)
@unittest.skip(reason="""Bridge Tower does not have input/output embeddings. So this test is not applicable.""")
def test_model_get_set_embeddings(self):
pass
@unittest.skip(reason="""Bridge Tower does not have input/output embeddings. Thus this test is not applicable.""")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Bridge Tower does not use inputs_embeds")
def test_inputs_embeds_matches_input_ids(self):
pass
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torch
@require_vision
class BridgeTowerModelIntegrationTest(unittest.TestCase):
@cached_property
def default_processor(self):
return (
BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
if is_vision_available()
else None
)
@slow
def test_image_and_text_retrieval(self):
model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm").to(
torch_device
)
model.eval()
processor = self.default_processor
image = prepare_img()
text = "a bunch of cats laying on a tower."
inputs = processor(image, text, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size([1, 2])
self.assertEqual(outputs.logits.shape, expected_shape)
self.assertTrue(outputs.logits[0, 1].item() > outputs.logits[0, 0].item())
# verify loss
inputs["labels"] = torch.ones(1, dtype=torch.long, device=torch_device)
inputs = inputs.to(torch_device)
with torch.no_grad():
outputs = model(**inputs)
self.assertAlmostEqual(outputs.loss.item(), 0.5108, places=4)
@slow
def test_masked_language_modeling(self):
model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm").to(torch_device)
model.eval()
processor = self.default_processor
image = prepare_img()
text = "a bunch of <mask> laying on a tower."
inputs = processor(image, text, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size([1, 11, 50265])
self.assertEqual(outputs.logits.shape, expected_shape)
# verify predicted word
predicted_id = outputs.logits.argmax(dim=-1).squeeze(0).tolist()[4]
self.assertTrue(processor.decode([predicted_id]) == " cats")
# verify loss
inputs["labels"] = inputs["input_ids"].clone()
inputs = inputs.to(torch_device)
with torch.no_grad():
outputs = model(**inputs)
self.assertAlmostEqual(outputs.loss.item(), 5.7373, places=4)
@slow
def test_constrastive_learning(self):
model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc").to(
torch_device
)
model.eval()
processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
image = prepare_img()
text = "a bunch of cats laying on a tower."
inputs = processor(image, text, padding=True, return_tensors="pt").to(torch_device)
with torch.no_grad():
outputs = model(**inputs, output_hidden_states=True, return_loss=True)
# verify the logits
expected_shape = torch.Size([1, 3, 512])
self.assertEqual(outputs.logits.shape, expected_shape)
@slow
@require_torch
class BridgeTowerModelTrainingTest(unittest.TestCase):
all_training_supported_model_classes = (
(BridgeTowerForImageAndTextRetrieval, BridgeTowerForMaskedLM, BridgeTowerForContrastiveLearning)
if is_torch_available()
else ()
)
def setUp(self):
self.model_tester = BridgeTowerModelTester(self)
self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=32, vocab_size=99)
def _prepare_inputs_for_training(self, model_class):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
if model_class == BridgeTowerForMaskedLM:
inputs_dict["labels"] = inputs_dict["input_ids"]
elif model_class == BridgeTowerForImageAndTextRetrieval:
inputs_dict["labels"] = ids_tensor([1], 2)
elif model_class == BridgeTowerForContrastiveLearning:
inputs_dict["return_loss"] = True
return config, inputs_dict
def _get_non_used_layer_names(self, model_class):
non_used_layer_names = ["text_model.pooler"]
if model_class == BridgeTowerForMaskedLM:
non_used_layer_names = non_used_layer_names + [
# This number `1` actually depends on the number of layers in `cross_modal_image_layers` (by minus 1)
"cross_modal_image_layers.1",
"cross_modal_image_pooler",
"cross_modal_text_pooler",
]
return non_used_layer_names
def _is_layer_used(self, model_class, layer_name):
non_used_layer_names = self._get_non_used_layer_names(model_class)
for non_used_layer_name in non_used_layer_names:
if non_used_layer_name in layer_name:
return False
return True
def test_training(self):
for model_class in self.all_training_supported_model_classes:
config, inputs_dict = self._prepare_inputs_for_training(model_class)
model = model_class(config)
model.to(torch_device)
model.train()
loss = model(**inputs_dict).loss
loss.backward()
# verify the gradients of used layers' weight are not None
for name, param in model.named_parameters():
if self._is_layer_used(model_class, name):
self.assertIsNotNone(param.grad, f"Gradients should not be None - got {param.grad} for {name}")
@slow
def test_inference_interpolate_pos_encoding(self):
# ViT models have an `interpolate_pos_encoding` argument in their forward method,
# allowing to interpolate the pre-trained position embeddings in order to use
# the model on higher resolutions. The DINO model by Facebook AI leverages this
# to visualize self-attention on higher resolution images.
model_name = "BridgeTower/bridgetower-base"
model = BridgeTowerModel.from_pretrained(model_name).to(torch_device)
image_processor = BridgeTowerProcessor.from_pretrained(model_name, size={"shortest_edge": 180})
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
# interpolate_pos_encodiung false should return value error
with self.assertRaises(ValueError, msg="doesn't match model"):
with torch.no_grad():
model(**inputs, interpolate_pos_encoding=False)
# forward pass
with torch.no_grad():
outputs = model(**inputs, interpolate_pos_encoding=True)
# verify the logits
expected_shape = torch.Size((1, 122, 768))
self.assertEqual(outputs.image_features.shape, expected_shape)
expected_slice = torch.tensor(
[[-0.6518, 0.4978, -0.4544], [-2.6672, -0.0843, -0.4210], [-2.4510, -0.1002, -0.3458]]
).to(torch_device)
torch.testing.assert_close(outputs.image_features[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)

Some files were not shown because too many files have changed in this diff Show More