first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,163 @@
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
import transformers.models.auto
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
from transformers.models.bert.configuration_bert import BertConfig
from transformers.models.roberta.configuration_roberta import RobertaConfig
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir
# Extend the import path with the `utils` directory reached via four `.parent` hops from this file.
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
# Imported after the path tweak above (hence the E402 suppression).
# NOTE(review): presumably `test_module` lives under that `utils` directory - confirm.
from test_module.custom_configuration import CustomConfig # noqa E402
# Path to the JSON fixture used by the "load from local file" test below.
SAMPLE_ROBERTA_CONFIG = get_tests_dir("fixtures/dummy-config.json")
class AutoConfigTest(unittest.TestCase):
    """Tests for `AutoConfig`: class resolution, registration and dynamic (remote-code) loading."""

    def setUp(self):
        # Zero timeout: per the dynamic-loading test below, the "load remote code?" question
        # then times out instead of waiting for an answer.
        transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0

    def test_module_spec(self):
        """The auto sub-package exposes a module spec, both directly and through importlib."""
        self.assertIsNotNone(transformers.models.auto.__spec__)
        self.assertIsNotNone(importlib.util.find_spec("transformers.models.auto"))

    def test_config_from_model_shortcut(self):
        """A BERT model identifier resolves to `BertConfig`."""
        self.assertIsInstance(AutoConfig.from_pretrained("google-bert/bert-base-uncased"), BertConfig)

    def test_config_model_type_from_local_file(self):
        """The sample JSON fixture resolves to `RobertaConfig`."""
        self.assertIsInstance(AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG), RobertaConfig)

    def test_config_model_type_from_model_identifier(self):
        """The dummy identifier resolves to `RobertaConfig`."""
        self.assertIsInstance(AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER), RobertaConfig)

    def test_config_for_model_str(self):
        """`for_model` maps the "roberta" model type to `RobertaConfig`."""
        self.assertIsInstance(AutoConfig.for_model("roberta"), RobertaConfig)

    def test_new_config_registration(self):
        """A custom config can be registered and then round-tripped through the auto-API."""
        try:
            AutoConfig.register("custom", CustomConfig)

            # Wrong model type will raise an error
            with self.assertRaises(ValueError):
                AutoConfig.register("model", CustomConfig)

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoConfig.register("bert", BertConfig)

            # Now that the config is registered, it can be used as any other config with the auto-API
            custom_config = CustomConfig()
            with tempfile.TemporaryDirectory() as save_dir:
                custom_config.save_pretrained(save_dir)
                restored = AutoConfig.from_pretrained(save_dir)
                self.assertIsInstance(restored, CustomConfig)
        finally:
            # Always undo the registration so the mapping is left as it was found.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]

    def test_repo_not_found(self):
        """An unknown repository name yields an `EnvironmentError` naming the identifier."""
        expected = "bert-base is not a local folder and is not a valid model identifier"
        with self.assertRaisesRegex(EnvironmentError, expected):
            AutoConfig.from_pretrained("bert-base")

    def test_revision_not_found(self):
        """An unknown revision yields an `EnvironmentError` naming the revision."""
        expected = r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
        with self.assertRaisesRegex(EnvironmentError, expected):
            AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")

    def test_from_pretrained_dynamic_config(self):
        """A config defined by remote code loads only when `trust_remote_code=True`."""
        repo_id = "hf-internal-testing/test_dynamic_model"

        # If remote code is not set, we will time out when asking whether to load the model.
        with self.assertRaises(ValueError):
            AutoConfig.from_pretrained(repo_id)

        # If remote code is disabled, we can't load this config.
        with self.assertRaises(ValueError):
            AutoConfig.from_pretrained(repo_id, trust_remote_code=False)

        config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
        self.assertEqual(config.__class__.__name__, "NewModelConfig")

        # Test the dynamic module is loaded only once.
        reloaded_config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
        self.assertIs(config.__class__, reloaded_config.__class__)

        # Test config can be reloaded.
        with tempfile.TemporaryDirectory() as save_dir:
            config.save_pretrained(save_dir)
            reloaded_config = AutoConfig.from_pretrained(save_dir, trust_remote_code=True)
            # Assert we saved config code
            saved_code = os.path.join(save_dir, "configuration.py")
            self.assertTrue(os.path.exists(saved_code))
            # Assert we're pointing at local code and not another remote repo
            self.assertEqual(reloaded_config.auto_map["AutoConfig"], "configuration.NewModelConfig")
            self.assertEqual(reloaded_config.__class__.__name__, "NewModelConfig")

    def test_from_pretrained_dynamic_config_conflict(self):
        """Check which class wins when a locally registered config clashes with a remote one."""
        repo_id = "hf-internal-testing/test_dynamic_model"

        class NewModelConfigLocal(BertConfig):
            model_type = "new-model"

            def __init__(self, **kwargs):
                super().__init__(**kwargs)

        try:
            AutoConfig.register("new-model", NewModelConfigLocal)

            # If remote code is not set, the default is to use local
            config = AutoConfig.from_pretrained(repo_id)
            self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")

            # If remote code is disabled, we load the local one.
            config = AutoConfig.from_pretrained(repo_id, trust_remote_code=False)
            self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")

            # If remote code is enabled but the user explicitly registered the local one, we load the local one.
            config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
            self.assertEqual(config.__class__.__name__, "NewModelConfigLocal")

            # If remote code is enabled but local code originated from transformers, we load the remote one.
            NewModelConfigLocal.__module__ = "transformers.models.new_model.configuration_new_model"
            config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
            self.assertEqual(config.__class__.__name__, "NewModelConfig")
        finally:
            # Always undo the registration so the mapping is left as it was found.
            if "new-model" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["new-model"]

    def test_config_missing_model_type(self):
        """A `config.json` without a `model_type` key is rejected with a `ValueError`."""
        payload = {
            "hidden_size": 768,
            "num_attention_heads": 12,
            "num_hidden_layers": 12,
        }
        with tempfile.TemporaryDirectory() as save_dir:
            with open(os.path.join(save_dir, "config.json"), "w") as handle:
                json.dump(payload, handle)

            with self.assertRaisesRegex(ValueError, "Should have a `model_type` key"):
                AutoConfig.from_pretrained(save_dir)

View File

@@ -0,0 +1,196 @@
# Copyright 2021 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
AutoConfig,
AutoFeatureExtractor,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir
# Extend the import path with the `utils` directory reached via four `.parent` hops from this file.
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
# Imported after the path tweak above (hence the E402 suppressions).
# NOTE(review): presumably `test_module` lives under that `utils` directory - confirm.
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402
# Fixture locations: the fixtures directory, a feature-extractor JSON file inside it, and a model config JSON file.
SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = get_tests_dir("fixtures")
SAMPLE_FEATURE_EXTRACTION_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json")
SAMPLE_CONFIG = get_tests_dir("fixtures/dummy-config.json")
class AutoFeatureExtractorTest(unittest.TestCase):
    """Tests for `AutoFeatureExtractor`: class resolution, registration and dynamic loading."""

    def setUp(self):
        # Zero timeout: per the dynamic-loading test below, the "load remote code?" question
        # then times out instead of waiting for an answer.
        transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0

    def test_feature_extractor_from_model_shortcut(self):
        """A wav2vec2 model identifier resolves to `Wav2Vec2FeatureExtractor`."""
        extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
        self.assertIsInstance(extractor, Wav2Vec2FeatureExtractor)

    def test_feature_extractor_from_local_directory_from_key(self):
        """The fixtures directory resolves to `Wav2Vec2FeatureExtractor`."""
        extractor = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
        self.assertIsInstance(extractor, Wav2Vec2FeatureExtractor)

    def test_feature_extractor_from_local_directory_from_config(self):
        """The class is still resolved after `feature_extractor_type` is dropped, given a saved model config."""
        with tempfile.TemporaryDirectory() as save_dir:
            model_config = Wav2Vec2Config()

            # remove feature_extractor_type to make sure config.json alone is enough to load feature processor locally
            extractor_kwargs = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR).to_dict()
            extractor_kwargs.pop("feature_extractor_type")
            extractor = Wav2Vec2FeatureExtractor(**extractor_kwargs)

            # save in new folder
            model_config.save_pretrained(save_dir)
            extractor.save_pretrained(save_dir)

            extractor = AutoFeatureExtractor.from_pretrained(save_dir)

            # make sure private variable is not incorrectly saved
            serialized = json.loads(extractor.to_json_string())
            self.assertNotIn("_processor_class", serialized)

        self.assertIsInstance(extractor, Wav2Vec2FeatureExtractor)

    def test_feature_extractor_from_local_file(self):
        """A path to a single feature-extractor JSON file resolves to `Wav2Vec2FeatureExtractor`."""
        extractor = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG)
        self.assertIsInstance(extractor, Wav2Vec2FeatureExtractor)

    def test_repo_not_found(self):
        """An unknown repository name yields an `EnvironmentError` naming the identifier."""
        expected = "bert-base is not a local folder and is not a valid model identifier"
        with self.assertRaisesRegex(EnvironmentError, expected):
            AutoFeatureExtractor.from_pretrained("bert-base")

    def test_revision_not_found(self):
        """An unknown revision yields an `EnvironmentError` naming the revision."""
        expected = r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
        with self.assertRaisesRegex(EnvironmentError, expected):
            AutoFeatureExtractor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")

    def test_feature_extractor_not_found(self):
        """Loading from this repository fails with a "Can't load feature extractor" error."""
        expected = "Can't load feature extractor for 'hf-internal-testing/config-no-model'."
        with self.assertRaisesRegex(EnvironmentError, expected):
            AutoFeatureExtractor.from_pretrained("hf-internal-testing/config-no-model")

    def test_from_pretrained_dynamic_feature_extractor(self):
        """A feature extractor defined by remote code loads only when `trust_remote_code=True`."""
        repo_id = "hf-internal-testing/test_dynamic_feature_extractor"

        # If remote code is not set, we will time out when asking whether to load the model.
        with self.assertRaises(ValueError):
            AutoFeatureExtractor.from_pretrained(repo_id)

        # If remote code is disabled, we can't load this config.
        with self.assertRaises(ValueError):
            AutoFeatureExtractor.from_pretrained(repo_id, trust_remote_code=False)

        feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id, trust_remote_code=True)
        self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")

        # Test the dynamic module is loaded only once.
        reloaded = AutoFeatureExtractor.from_pretrained(repo_id, trust_remote_code=True)
        self.assertIs(feature_extractor.__class__, reloaded.__class__)

        # Test feature extractor can be reloaded.
        with tempfile.TemporaryDirectory() as save_dir:
            feature_extractor.save_pretrained(save_dir)
            reloaded = AutoFeatureExtractor.from_pretrained(save_dir, trust_remote_code=True)
            # Assert we saved code
            saved_code = os.path.join(save_dir, "feature_extractor.py")
            self.assertTrue(os.path.exists(saved_code))
            self.assertEqual(reloaded.auto_map["AutoFeatureExtractor"], "feature_extractor.NewFeatureExtractor")
            self.assertEqual(reloaded.__class__.__name__, "NewFeatureExtractor")

    def test_new_feature_extractor_registration(self):
        """A custom feature extractor can be registered and then round-tripped through the auto-API."""
        try:
            AutoConfig.register("custom", CustomConfig)
            AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)

            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoFeatureExtractor.register(Wav2Vec2Config, Wav2Vec2FeatureExtractor)

            # Now that the config is registered, it can be used as any other config with the auto-API
            custom_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
            with tempfile.TemporaryDirectory() as save_dir:
                custom_extractor.save_pretrained(save_dir)
                restored = AutoFeatureExtractor.from_pretrained(save_dir)
                self.assertIsInstance(restored, CustomFeatureExtractor)
        finally:
            # Always undo both registrations so the mappings are left as they were found.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
                del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]

    def test_from_pretrained_dynamic_feature_extractor_conflict(self):
        """Check which class wins when a locally registered extractor clashes with a remote one."""
        repo_id = "hf-internal-testing/test_dynamic_feature_extractor"

        class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
            is_local = True

        try:
            AutoConfig.register("custom", CustomConfig)
            AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)

            # The local class is expected in all three cases, in this order:
            #   - remote code not set: the default is to use local
            #   - remote code disabled: we load the local one
            #   - remote code enabled but the user explicitly registered the local one: we load the local one
            for load_kwargs in ({}, {"trust_remote_code": False}, {"trust_remote_code": True}):
                feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id, **load_kwargs)
                self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
                self.assertTrue(feature_extractor.is_local)

            # If remote code is enabled but local code originated from transformers, we load the remote one.
            NewFeatureExtractor.__module__ = "transformers.models.custom.configuration_custom"
            feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id, trust_remote_code=True)
            self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")
            self.assertFalse(hasattr(feature_extractor, "is_local"))
        finally:
            # Always undo both registrations so the mappings are left as they were found.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
                del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]

View File

@@ -0,0 +1,371 @@
# Copyright 2021 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
IMAGE_PROCESSOR_MAPPING,
AutoConfig,
AutoImageProcessor,
CLIPConfig,
CLIPImageProcessor,
ViTImageProcessor,
ViTImageProcessorPil,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torchvision, require_vision
# Extend the import path with the `utils` directory reached via four `.parent` hops from this file.
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
# Imported after the path tweak above (hence the E402 suppressions).
# NOTE(review): presumably `test_module` lives under that `utils` directory - confirm.
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_image_processing import CustomImageProcessor # noqa E402
class AutoImageProcessorTest(unittest.TestCase):
def setUp(self):
transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
@require_torchvision
def test_image_processor_from_model_shortcut(self):
config = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_directory_from_key(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
config = AutoImageProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_directory_from_feature_extractor_key(self):
# Ensure we can load the image processor from the feature extractor config
# Though we don't have any `CLIPFeatureExtractor` class, we can't be sure that
# there are no models in the hub serialized with `processor_type=CLIPFeatureExtractor`
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"feature_extractor_type": "CLIPFeatureExtractor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
config = AutoImageProcessor.from_pretrained(tmpdirname)
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_new_filename(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
config = AutoImageProcessor.from_pretrained(tmpdirname)
# Now loading fast image processor by default
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_directory_from_config(self):
with tempfile.TemporaryDirectory() as tmpdirname:
model_config = CLIPConfig()
# Create a dummy config file with image_processor_type
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
# remove image_processor_type to make sure config.json alone is enough to load image processor locally
config_dict = AutoImageProcessor.from_pretrained(tmpdirname).to_dict()
config_dict.pop("image_processor_type")
config = CLIPImageProcessor(**config_dict)
# save in new folder
model_config.save_pretrained(tmpdirname)
config.save_pretrained(tmpdirname)
config = AutoImageProcessor.from_pretrained(tmpdirname)
# make sure private variable is not incorrectly saved
dict_as_saved = json.loads(config.to_json_string())
self.assertTrue("_processor_class" not in dict_as_saved)
self.assertIsInstance(config, CLIPImageProcessor)
@require_torchvision
def test_image_processor_from_local_file(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump(
{"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
config = AutoImageProcessor.from_pretrained(processor_tmpfile)
self.assertIsInstance(config, CLIPImageProcessor)
def test_repo_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, "clip-base is not a local folder and is not a valid model identifier"
):
_ = AutoImageProcessor.from_pretrained("clip-base")
def test_revision_not_found(self):
with self.assertRaisesRegex(
EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
):
_ = AutoImageProcessor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_image_processor_not_found(self):
with self.assertRaisesRegex(
EnvironmentError,
"Can't load image processor for 'hf-internal-testing/config-no-model'.",
):
_ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
@require_vision
@require_torchvision
def test_use_fast_selection(self):
checkpoint = "hf-internal-testing/tiny-random-vit"
# Fast image processor is selected by default
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
self.assertIsInstance(image_processor, ViTImageProcessor)
# Fast image processor is selected when use_fast=True
image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=True)
self.assertIsInstance(image_processor, ViTImageProcessor)
# Slow image processor is selected when use_fast=False
image_processor = AutoImageProcessor.from_pretrained(checkpoint, use_fast=False)
self.assertIsInstance(image_processor, ViTImageProcessorPil)
def test_from_pretrained_dynamic_image_processor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor")
# If remote code is disabled, we can't load this config.
with self.assertRaises(ValueError):
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False
)
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
# Test the dynamic module is loaded only once.
reloaded_image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertIs(image_processor.__class__, reloaded_image_processor.__class__)
# Test image processor can be reloaded.
with tempfile.TemporaryDirectory() as tmp_dir:
image_processor.save_pretrained(tmp_dir)
reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir, trust_remote_code=True)
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "image_processor.py"))) # Assert we saved custom code
self.assertEqual(
reloaded_image_processor.auto_map["AutoImageProcessor"], "image_processor.NewImageProcessor"
)
self.assertEqual(reloaded_image_processor.__class__.__name__, "NewImageProcessor")
# Test the dynamic module is reloaded if we force it.
reloaded_image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True, force_download=True
)
self.assertIsNot(image_processor.__class__, reloaded_image_processor.__class__)
def test_new_image_processor_registration(self):
try:
AutoConfig.register("custom", CustomConfig)
AutoImageProcessor.register(CustomConfig, CustomImageProcessor)
# Trying to register something existing in the Transformers library will raise an error
with self.assertRaises(ValueError):
AutoImageProcessor.register(CLIPConfig, CLIPImageProcessor)
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
config_tmpfile = Path(tmpdirname) / "config.json"
json.dump(
{"feature_extractor_type": "CLIPFeatureExtractor", "processor_class": "CLIPProcessor"},
open(processor_tmpfile, "w"),
)
json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))
image_processor = CustomImageProcessor.from_pretrained(tmpdirname)
# Now that the config is registered, it can be used as any other config with the auto-API
with tempfile.TemporaryDirectory() as tmp_dir:
image_processor.save_pretrained(tmp_dir)
new_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
self.assertIsInstance(new_image_processor, CustomImageProcessor)
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_image_processor_conflict(self):
class NewImageProcessor(CLIPImageProcessor):
is_local = True
try:
AutoConfig.register("custom", CustomConfig)
AutoImageProcessor.register(CustomConfig, NewImageProcessor)
# If remote code is not set, the default is to use local
image_processor = AutoImageProcessor.from_pretrained("hf-internal-testing/test_dynamic_image_processor")
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote code is disabled, we load the local one.
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=False
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote code is enabled but the user explicitly registered the local one, we load the local one.
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(image_processor.is_local)
# If remote code is enabled but local code originated from transformers, we load the remote one.
NewImageProcessor.__module__ = "transformers.models.custom.configuration_custom"
image_processor = AutoImageProcessor.from_pretrained(
"hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
)
self.assertEqual(image_processor.__class__.__name__, "NewImageProcessor")
self.assertTrue(not hasattr(image_processor, "is_local"))
finally:
if "custom" in CONFIG_MAPPING._extra_content:
del CONFIG_MAPPING._extra_content["custom"]
if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
@require_vision
def test_backend_kwarg_pil(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "ViTImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="pil")
self.assertIsInstance(image_processor, ViTImageProcessorPil)
@require_torchvision
def test_backend_kwarg_torchvision(self):
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "ViTImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="torchvision")
self.assertIsInstance(image_processor, ViTImageProcessor)
@require_torchvision
def test_default_to_pil_backend_for_lanczos_processors(self):
# Even when torchvision is available, processors that rely on Lanczos interpolation
# (listed in DEFAULT_TO_PIL_BACKEND_IMAGE_PROCESSORS) must default to the PIL backend
# when backend='auto'.
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "FlavaImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname)
self.assertEqual(type(image_processor).__name__, "FlavaImageProcessorPil")
@require_torchvision
def test_explicit_backend_overrides_lanczos_default(self):
# An explicit backend="torchvision" must bypass the DEFAULT_TO_PIL_BACKEND_IMAGE_PROCESSORS
# override; only the auto-resolved backend is affected by the list.
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "FlavaImageProcessor"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="torchvision")
self.assertEqual(type(image_processor).__name__, "FlavaImageProcessor")
@require_torchvision
def test_legacy_fast_class_name_in_config(self):
# Checkpoints saved before the rename used names like "ViTImageProcessorFast".
# The *Fast suffix must be stripped and the correct backend variant returned.
with tempfile.TemporaryDirectory() as tmpdirname:
processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
json.dump({"image_processor_type": "ViTImageProcessorFast"}, open(processor_tmpfile, "w"))
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="torchvision")
self.assertIsInstance(image_processor, ViTImageProcessor)
image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="pil")
self.assertIsInstance(image_processor, ViTImageProcessorPil)
@require_vision
def test_register_with_image_processor_classes_dict(self):
    # Exercises the image_processor_classes={} dict form of register().
    try:
        AutoImageProcessor.register(CustomConfig, image_processor_classes={"pil": CustomImageProcessor})
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Write through a context manager so the handle is closed before loading.
            with open(Path(tmp_dir) / "preprocessor_config.json", "w") as config_file:
                json.dump({"image_processor_type": "CustomImageProcessor"}, config_file)
            image_processor = AutoImageProcessor.from_pretrained(tmp_dir, backend="pil")
            self.assertIsInstance(image_processor, CustomImageProcessor)
    finally:
        # Undo the registration so it does not leak into other tests.
        if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
            del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
@require_vision
def test_register_legacy_slow_fast_params(self):
    # slow_image_processor_class= and fast_image_processor_class= are deprecated but
    # must still work; they map to the "pil" and "torchvision" backends respectively.
    # Only the slow -> "pil" path is exercised here.
    try:
        AutoImageProcessor.register(CustomConfig, slow_image_processor_class=CustomImageProcessor)
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Write through a context manager so the handle is closed before loading.
            with open(Path(tmp_dir) / "preprocessor_config.json", "w") as config_file:
                json.dump({"image_processor_type": "CustomImageProcessor"}, config_file)
            image_processor = AutoImageProcessor.from_pretrained(tmp_dir, backend="pil")
            self.assertIsInstance(image_processor, CustomImageProcessor)
    finally:
        # Undo the registration so it does not leak into other tests.
        if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
            del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]

View File

@@ -0,0 +1,599 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import os
import sys
import tempfile
import unittest
from collections import OrderedDict
from pathlib import Path
import pytest
import transformers
from transformers import BertConfig, GPT2Model, is_torch_available
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
from transformers.testing_utils import (
DUMMY_UNKNOWN_IDENTIFIER,
RequestCounter,
require_peft,
require_torch,
slow,
)
from transformers.utils import ADAPTER_CONFIG_NAME
from ..bert.test_modeling_bert import BertModelTester
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
if is_torch_available():
import torch
from test_module.custom_modeling import CustomModel
from transformers import (
AutoBackbone,
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForMaskedLM,
AutoModelForPreTraining,
AutoModelForQuestionAnswering,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoModelForTableQuestionAnswering,
AutoModelForTokenClassification,
BertForMaskedLM,
BertForPreTraining,
BertForQuestionAnswering,
BertForSequenceClassification,
BertForTokenClassification,
BertModel,
FunnelBaseModel,
FunnelModel,
GenerationMixin,
GPT2Config,
GPT2LMHeadModel,
ResNetBackbone,
T5Config,
T5ForConditionalGeneration,
TapasConfig,
TapasForQuestionAnswering,
TimmBackbone,
)
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
MODEL_FOR_PRETRAINING_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_MAPPING,
)
@require_torch
class AutoModelTest(unittest.TestCase):
def setUp(self):
    # Zero out the remote-code timeout before every test.
    # NOTE(review): presumably this makes the trust_remote_code prompt time out immediately — confirm.
    dynamic_utils = transformers.dynamic_module_utils
    dynamic_utils.TIME_OUT_REMOTE_CODE = 0
@slow
def test_model_from_pretrained(self):
    # BERT must resolve to BertConfig / BertModel through the generic auto classes,
    # and the loading report must contain the expected number of entries per bucket.
    checkpoint = "google-bert/bert-base-uncased"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, BertConfig)

    # Load once plainly, then once more requesting the loading report.
    bert = AutoModel.from_pretrained(checkpoint)
    bert, report = AutoModel.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(bert)
    self.assertIsInstance(bert, BertModel)

    # When using PyTorch checkpoint, the expected number of unexpected keys is `8`. With
    # `safetensors` checkpoint (if it is installed), the expected value becomes `7`.
    expected_counts = {
        "missing_keys": 0,
        "unexpected_keys": 7,
        "mismatched_keys": 0,
        "error_msgs": 0,
    }
    for bucket, expected in expected_counts.items():
        self.assertEqual(len(report[bucket]), expected)
@slow
def test_model_for_pretraining_from_pretrained(self):
    # BERT must resolve to BertConfig / BertForPreTraining through the pretraining auto class.
    checkpoint = "google-bert/bert-base-uncased"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, BertConfig)

    # Load once plainly, then once more requesting the loading report.
    pretraining_model = AutoModelForPreTraining.from_pretrained(checkpoint)
    pretraining_model, report = AutoModelForPreTraining.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(pretraining_model)
    self.assertIsInstance(pretraining_model, BertForPreTraining)

    # Every bucket of the loading report is expected to be empty.
    for bucket in report:
        self.assertEqual(len(report[bucket]), 0)
@slow
def test_model_for_causal_lm(self):
    # GPT-2 must resolve to GPT2Config / GPT2LMHeadModel through the causal-LM auto class.
    checkpoint = "openai-community/gpt2"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, GPT2Config)

    # Load once plainly, then once more requesting the loading report (not inspected here).
    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint)
    causal_lm, _report = AutoModelForCausalLM.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(causal_lm)
    self.assertIsInstance(causal_lm, GPT2LMHeadModel)
@slow
def test_model_for_masked_lm(self):
    # BERT must resolve to BertConfig / BertForMaskedLM through the masked-LM auto class.
    checkpoint = "google-bert/bert-base-uncased"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, BertConfig)

    # Load once plainly, then once more requesting the loading report (not inspected here).
    masked_lm = AutoModelForMaskedLM.from_pretrained(checkpoint)
    masked_lm, _report = AutoModelForMaskedLM.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(masked_lm)
    self.assertIsInstance(masked_lm, BertForMaskedLM)
@slow
def test_model_for_encoder_decoder_lm(self):
    # T5 must resolve to T5Config / T5ForConditionalGeneration through the seq2seq auto class.
    checkpoint = "google-t5/t5-base"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, T5Config)

    # Load once plainly, then once more requesting the loading report (not inspected here).
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    seq2seq, _report = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(seq2seq)
    self.assertIsInstance(seq2seq, T5ForConditionalGeneration)
@slow
def test_sequence_classification_model_from_pretrained(self):
    # BERT must resolve to BertConfig / BertForSequenceClassification through the
    # sequence-classification auto class.
    checkpoint = "google-bert/bert-base-uncased"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, BertConfig)

    # Load once plainly, then once more requesting the loading report (not inspected here).
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    classifier, _report = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, output_loading_info=True
    )
    self.assertIsNotNone(classifier)
    self.assertIsInstance(classifier, BertForSequenceClassification)
@slow
def test_question_answering_model_from_pretrained(self):
    # BERT must resolve to BertConfig / BertForQuestionAnswering through the QA auto class.
    checkpoint = "google-bert/bert-base-uncased"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, BertConfig)

    # Load once plainly, then once more requesting the loading report (not inspected here).
    qa_model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    qa_model, _report = AutoModelForQuestionAnswering.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(qa_model)
    self.assertIsInstance(qa_model, BertForQuestionAnswering)
@slow
def test_table_question_answering_model_from_pretrained(self):
    # TAPAS must resolve to TapasConfig / TapasForQuestionAnswering through the table-QA auto class.
    checkpoint = "google/tapas-base"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, TapasConfig)

    # Load once plainly, then once more requesting the loading report (not inspected here).
    table_qa = AutoModelForTableQuestionAnswering.from_pretrained(checkpoint)
    table_qa, _report = AutoModelForTableQuestionAnswering.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(table_qa)
    self.assertIsInstance(table_qa, TapasForQuestionAnswering)
@slow
def test_token_classification_model_from_pretrained(self):
    # BERT must resolve to BertConfig / BertForTokenClassification through the
    # token-classification auto class.
    checkpoint = "google-bert/bert-base-uncased"

    resolved_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsNotNone(resolved_config)
    self.assertIsInstance(resolved_config, BertConfig)

    # Load once plainly, then once more requesting the loading report (not inspected here).
    tagger = AutoModelForTokenClassification.from_pretrained(checkpoint)
    tagger, _report = AutoModelForTokenClassification.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(tagger)
    self.assertIsInstance(tagger, BertForTokenClassification)
@slow
def test_auto_backbone_timm_model_from_pretrained(self):
    # Loads a timm backbone through AutoBackbone and checks which kwargs are accepted.
    # Configs can't be loaded for timm models
    model = AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True)

    # We can't pass output_loading_info=True as we're loading from timm.
    # (Uses self.assertRaises like the rest of this test, rather than mixing in pytest.raises.)
    with self.assertRaises(ValueError):
        AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True, output_loading_info=True)

    self.assertIsNotNone(model)
    self.assertIsInstance(model, TimmBackbone)

    # Check kwargs are correctly passed to the backbone
    model = AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True, out_indices=(-2, -1))
    self.assertEqual(model.out_indices, [-2, -1])

    # Check out_features cannot be passed to Timm backbones
    with self.assertRaises(ValueError):
        _ = AutoBackbone.from_pretrained("resnet18", use_timm_backbone=True, out_features=["stage1"])
@slow
def test_auto_backbone_from_pretrained(self):
    # Loads a ResNet backbone through AutoBackbone and checks out_indices / out_features handling.
    checkpoint = "microsoft/resnet-18"

    # Load once plainly, then once more requesting the loading report (not inspected here).
    backbone = AutoBackbone.from_pretrained(checkpoint)
    backbone, _report = AutoBackbone.from_pretrained(checkpoint, output_loading_info=True)
    self.assertIsNotNone(backbone)
    self.assertIsInstance(backbone, ResNetBackbone)

    # Check kwargs are correctly passed to the backbone: selecting by index ...
    by_index = AutoBackbone.from_pretrained(checkpoint, out_indices=[-2, -1])
    self.assertEqual(by_index.out_indices, [-2, -1])
    self.assertEqual(by_index.out_features, ["stage3", "stage4"])

    # ... and selecting by stage name.
    by_name = AutoBackbone.from_pretrained(checkpoint, out_features=["stage2", "stage4"])
    self.assertEqual(by_name.out_indices, [2, 4])
    self.assertEqual(by_name.out_features, ["stage2", "stage4"])
def test_from_pretrained_with_tuple_values(self):
    # In the auto model mapping, FunnelConfig points at two classes: FunnelModel and FunnelBaseModel.
    funnel = AutoModel.from_pretrained("sgugger/funnel-random-tiny")
    self.assertIsInstance(funnel, FunnelModel)

    # Switching `architectures` on a copy of the config must select the other class.
    base_config = copy.deepcopy(funnel.config)
    base_config.architectures = ["FunnelBaseModel"]
    funnel_base = AutoModel.from_config(base_config)
    self.assertIsInstance(funnel_base, FunnelBaseModel)

    # The choice must survive a save / reload round trip.
    with tempfile.TemporaryDirectory() as tmp_dir:
        funnel_base.save_pretrained(tmp_dir)
        reloaded = AutoModel.from_pretrained(tmp_dir)
        self.assertIsInstance(reloaded, FunnelBaseModel)
def test_from_pretrained_dynamic_model_local(self):
    # A locally registered custom config/model pair must round-trip through save / AutoModel reload
    # with identical parameters.
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoModel.register(CustomConfig, CustomModel)

        original = CustomModel(CustomConfig(hidden_size=32))

        with tempfile.TemporaryDirectory() as tmp_dir:
            original.save_pretrained(tmp_dir)
            restored = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
            for saved_param, loaded_param in zip(original.parameters(), restored.parameters()):
                self.assertTrue(torch.equal(saved_param, loaded_param))
    finally:
        # Drop both registrations so they do not leak into other tests.
        for registry, key in ((CONFIG_MAPPING, "custom"), (MODEL_MAPPING, CustomConfig)):
            if key in registry._extra_content:
                del registry._extra_content[key]
def test_from_pretrained_dynamic_model_distant(self):
    # Loads remote-code models from the Hub and checks class caching, save / reload,
    # and forced re-download, for two repos.
    plain_repo = "hf-internal-testing/test_dynamic_model"
    # This one uses a relative import to a util file, this checks it is downloaded and used properly.
    util_repo = "hf-internal-testing/test_dynamic_model_with_util"

    # If remote code is not set, we will time out when asking whether to load the model.
    with self.assertRaises(ValueError):
        model = AutoModel.from_pretrained(plain_repo)
    # If remote code is disabled, we can't load this config.
    with self.assertRaises(ValueError):
        model = AutoModel.from_pretrained(plain_repo, trust_remote_code=False)

    for repo_id in (plain_repo, util_repo):
        model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
        self.assertEqual(model.__class__.__name__, "NewModel")

        # Test the dynamic module is loaded only once.
        cached_load = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
        self.assertIs(model.__class__, cached_load.__class__)

        # Test model can be reloaded.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir)
            round_tripped = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
            self.assertEqual(round_tripped.__class__.__name__, "NewModel")
            for saved_param, loaded_param in zip(model.parameters(), round_tripped.parameters()):
                self.assertTrue(torch.equal(saved_param, loaded_param))

        # Test the dynamic module is reloaded if we force it.
        forced_load = AutoModel.from_pretrained(repo_id, trust_remote_code=True, force_download=True)
        self.assertIsNot(model.__class__, forced_load.__class__)
def test_from_pretrained_dynamic_model_distant_with_ref(self):
    # Loads remote-code models through the "ref_to_*" Hub repos and checks they can be
    # saved and reloaded with identical parameters.
    repo_ids = (
        "hf-internal-testing/ref_to_test_dynamic_model",
        # This one uses a relative import to a util file, this checks it is downloaded and used properly.
        "hf-internal-testing/ref_to_test_dynamic_model_with_util",
    )
    for repo_id in repo_ids:
        model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
        self.assertEqual(model.__class__.__name__, "NewModel")

        # Test model can be reloaded.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir)
            round_tripped = AutoModel.from_pretrained(tmp_dir, trust_remote_code=True)
            self.assertEqual(round_tripped.__class__.__name__, "NewModel")
            for saved_param, loaded_param in zip(model.parameters(), round_tripped.parameters()):
                self.assertTrue(torch.equal(saved_param, loaded_param))
def test_from_pretrained_dynamic_model_with_period(self):
    # We used to have issues where repos with "." in the name would cause issues because the Python
    # import machinery would treat that as a directory separator, so we test that case

    # Imported locally: `import unittest` alone does not guarantee that the `unittest.mock`
    # submodule has been loaded, so `unittest.mock.patch` would only work by accident.
    from unittest import mock

    repo_id = "hf-internal-testing/test_dynamic_model_v1.0"

    # If remote code is not set, we will time out when asking whether to load the model.
    with self.assertRaises(ValueError):
        model = AutoModel.from_pretrained(repo_id)
    # If remote code is disabled, we can't load this config.
    with self.assertRaises(ValueError):
        model = AutoModel.from_pretrained(repo_id, trust_remote_code=False)

    model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
    self.assertEqual(model.__class__.__name__, "NewModel")

    # Test that it works with a custom cache dir too
    with tempfile.TemporaryDirectory() as tmp_dir:
        # HF_XET_CACHE is pointed at the same temporary directory for the duration of the load.
        with mock.patch.dict(os.environ, {"HF_XET_CACHE": tmp_dir}):
            model = AutoModel.from_pretrained(repo_id, trust_remote_code=True, cache_dir=tmp_dir)
            self.assertEqual(model.__class__.__name__, "NewModel")
def test_new_model_registration(self):
    # Registers CustomConfig / CustomModel with every listed auto class and checks the
    # registration guards, from_config, and a save / from_pretrained round trip.
    AutoConfig.register("custom", CustomConfig)

    auto_classes = [
        AutoModel,
        AutoModelForCausalLM,
        AutoModelForMaskedLM,
        AutoModelForPreTraining,
        AutoModelForQuestionAnswering,
        AutoModelForSequenceClassification,
        AutoModelForTokenClassification,
    ]
    # Mappings that may hold a CustomConfig entry once the loop below has run.
    touched_mappings = (
        MODEL_MAPPING,
        MODEL_FOR_PRETRAINING_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        MODEL_FOR_CAUSAL_LM_MAPPING,
        MODEL_FOR_MASKED_LM_MAPPING,
    )

    try:
        for auto_class in auto_classes:
            with self.subTest(auto_class.__name__):
                # Wrong config class will raise an error
                with self.assertRaises(ValueError):
                    auto_class.register(BertConfig, CustomModel)
                auto_class.register(CustomConfig, CustomModel)
                # Trying to register something existing in the Transformers library will raise an error
                with self.assertRaises(ValueError):
                    auto_class.register(BertConfig, BertModel)

                # Now that the config is registered, it can be used as any other config with the auto-API
                bert_tiny_config = BertModelTester(self).get_config()
                custom_config = CustomConfig(**bert_tiny_config.to_dict())
                built = auto_class.from_config(custom_config)
                self.assertIsInstance(built, CustomModel)

                with tempfile.TemporaryDirectory() as tmp_dir:
                    built.save_pretrained(tmp_dir)
                    reloaded = auto_class.from_pretrained(tmp_dir)
                    # The model is a CustomModel but from the new dynamically imported class.
                    self.assertIsInstance(reloaded, CustomModel)
    finally:
        # Remove every registration made above so other tests see clean mappings.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        for mapping in touched_mappings:
            if CustomConfig in mapping._extra_content:
                del mapping._extra_content[CustomConfig]
def test_from_pretrained_dynamic_model_conflict(self):
    # Checks which implementation wins when a Hub repo ships remote code for a model type
    # ("new-model") that is also registered locally.
    class NewModelConfigLocal(BertConfig):
        model_type = "new-model"

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

    class NewModel(BertModel):
        config_class = NewModelConfigLocal

    repo_id = "hf-internal-testing/test_dynamic_model"

    try:
        AutoConfig.register("new-model", NewModelConfigLocal)
        AutoModel.register(NewModelConfigLocal, NewModel)

        # The local class is used in all three cases:
        #   - remote code not set: the default is to use local
        #   - remote code disabled: we load the local one
        #   - remote code enabled but the user explicitly registered the local one
        for load_kwargs in ({}, {"trust_remote_code": False}, {"trust_remote_code": True}):
            model = AutoModel.from_pretrained(repo_id, **load_kwargs)
            self.assertEqual(model.config.__class__.__name__, "NewModelConfigLocal")

        # If remote code is enabled but local code originated from transformers, we load the remote one.
        NewModelConfigLocal.__module__ = "transformers.models.new_model.configuration_new_model"
        NewModel.__module__ = "transformers.models.new_model.modeling_new_model"
        model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
        self.assertEqual(model.config.__class__.__name__, "NewModelConfig")
    finally:
        # Drop both registrations so they do not leak into other tests.
        for registry, key in ((CONFIG_MAPPING, "new-model"), (MODEL_MAPPING, NewModelConfigLocal)):
            if key in registry._extra_content:
                del registry._extra_content[key]
def test_repo_not_found(self):
    # An identifier that is neither a local folder nor a Hub repo must raise with this message.
    expected_message = "bert-base is not a local folder and is not a valid model identifier"
    with self.assertRaisesRegex(EnvironmentError, expected_message):
        _ = AutoModel.from_pretrained("bert-base")
def test_revision_not_found(self):
    # An unknown revision must raise with a message naming the bad git identifier.
    expected_message = r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
    with self.assertRaisesRegex(EnvironmentError, expected_message):
        _ = AutoModel.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
@unittest.skip("Failing on main")
def test_cached_model_has_minimum_calls_to_head(self):
    # Once a model is cached, loading it again should cost exactly one HEAD request and no GET.
    repo_ids = (
        "hf-internal-testing/tiny-random-bert",
        # With a sharded checkpoint
        "hf-internal-testing/tiny-random-bert-sharded",
    )
    for repo_id in repo_ids:
        # Make sure we have cached the model.
        _ = AutoModel.from_pretrained(repo_id)
        with RequestCounter() as counter:
            _ = AutoModel.from_pretrained(repo_id)
        self.assertEqual(counter["GET"], 0)
        self.assertEqual(counter["HEAD"], 1)
        self.assertEqual(counter.total_calls, 1)
def test_attr_not_existing(self):
    # A lazy auto mapping must raise a clear error for an unknown class name, and otherwise
    # resolve whatever class name it was given for the "bert" model type.
    from transformers.models.auto.auto_factory import _LazyAutoMapping

    config_names = OrderedDict([("bert", "BertConfig")])

    def build_mapping(model_class_name):
        # Fresh mapping whose only entry maps "bert" to the given model class name.
        return _LazyAutoMapping(config_names, OrderedDict([("bert", model_class_name)]))

    ghost_mapping = build_mapping("GhostModel")
    with pytest.raises(ValueError, match=r"Could not find GhostModel neither in .* nor in .*!"):
        ghost_mapping[BertConfig]

    bert_mapping = build_mapping("BertModel")
    self.assertEqual(bert_mapping[BertConfig], BertModel)

    gpt2_mapping = build_mapping("GPT2Model")
    self.assertEqual(gpt2_mapping[BertConfig], GPT2Model)
def test_custom_model_patched_generation_inheritance(self):
    """
    Tests that our inheritance patching for generate-compatible models works as expected. Without this feature,
    old Hub models lose the ability to call `generate`.
    """
    model = AutoModelForCausalLM.from_pretrained(
        "hf-internal-testing/test_dynamic_model_generation", trust_remote_code=True
    )
    # Dedicated assert methods are used so a failure reports the offending values.
    self.assertEqual(model.__class__.__name__, "NewModelForCausalLM")

    # It inherits from GenerationMixin. This means it can `generate`. Because `PreTrainedModel` is scheduled to
    # stop inheriting from `GenerationMixin` in v4.50, this check will fail if patching is not present.
    self.assertIsInstance(model, GenerationMixin)
    # More precisely, it directly inherits from GenerationMixin. This check would fail prior to v4.45 (inheritance
    # patching was added in v4.45)
    self.assertIn("GenerationMixin", str(model.__class__.__bases__))
@unittest.skip("@Cyril: add the post_init() on the hub repo")
def test_model_with_dotted_name_and_relative_imports(self):
    """
    Regression test for issue #40496: AutoModel.from_pretrained() failed for models whose name
    contains a '.' when the remote code uses a relative import.

    Without the fix, this raises: ModuleNotFoundError:
    No module named 'transformers_modules.hf-internal-testing.remote_code_model_with_dots_v1'
    """
    loaded = AutoModel.from_pretrained(
        "hf-internal-testing/remote_code_model_with_dots_v1.0", trust_remote_code=True
    )
    self.assertIsNotNone(loaded)
@require_peft
def test_adapter_path_not_overwritten_for_complete_model(self):
    """
    Regression test for issue #43746: only overwrite pretrained_model_name_or_path with the
    adapter's base model when that is actually needed.

    When the given path is a directory holding both a base model and an embedded adapter, the
    path must NOT be replaced by the hub model name stored in the adapter's config. The bug was
    that the path was overwritten unconditionally, which broke loading of models whose adapter
    lives in the same directory as the base model.
    """
    adapter_repo = "peft-internal-testing/tiny-OPTForCausalLM-lora"
    base_repo = "hf-internal-testing/tiny-random-OPTForCausalLM"

    # Create a temporary directory with a complete adapter model structure
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_dir = Path(tmp_dir)

        # Save the model and adapter locally
        base_config = AutoConfig.from_pretrained(base_repo)
        base_model = AutoModel.from_pretrained(base_repo)
        adapter_model = AutoModel.from_pretrained(adapter_repo)
        for artifact in (base_config, base_model, adapter_model):
            artifact.save_pretrained(model_dir)

        # Overwrite the base_model_name_or_path to an invalid value that
        # would cause the load to fail later
        adapter_config_path = model_dir / ADAPTER_CONFIG_NAME
        adapter_config = json.loads(adapter_config_path.read_text())
        adapter_config["base_model_name_or_path"] = "some/model/that/does/not/exist"
        adapter_config_path.write_text(json.dumps(adapter_config))

        # Load from the saved path and make sure it actually loads despite
        # the invalid adapter config path
        AutoModel.from_pretrained(model_dir)

View File

@@ -0,0 +1,673 @@
# Copyright 2021 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
from shutil import copyfile
from huggingface_hub import snapshot_download, upload_folder
import transformers
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
PROCESSOR_MAPPING,
TOKENIZER_MAPPING,
AutoConfig,
AutoFeatureExtractor,
AutoProcessor,
AutoTokenizer,
BaseVideoProcessor,
BertTokenizer,
CLIPImageProcessor,
FeatureExtractionMixin,
ImageProcessingMixin,
LlamaTokenizer,
LlavaOnevisionVideoProcessor,
LlavaProcessor,
ProcessorMixin,
SiglipImageProcessor,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
)
from transformers.models.auto.feature_extraction_auto import get_feature_extractor_config
from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.tokenization_auto import REGISTERED_TOKENIZER_CLASSES
from transformers.models.auto.video_processing_auto import get_video_processor_config
from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
from transformers.tokenization_python import TOKENIZER_CONFIG_FILE
from transformers.utils import (
FEATURE_EXTRACTOR_NAME,
PROCESSOR_NAME,
)
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_feature_extraction import CustomFeatureExtractor # noqa E402
from test_module.custom_processing import CustomProcessor # noqa E402
from test_module.custom_tokenization import CustomTokenizer # noqa E402
SAMPLE_PROCESSOR_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json")
SAMPLE_VOCAB_LLAMA = get_tests_dir("fixtures/test_sentencepiece.model")
SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
SAMPLE_CONFIG = get_tests_dir("fixtures/config.json")
SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures")
class AutoFeatureExtractorTest(unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
def setUp(self):
    # Zero out the remote-code timeout before every test.
    # NOTE(review): presumably this makes the trust_remote_code prompt time out immediately — confirm.
    dynamic_utils = transformers.dynamic_module_utils
    dynamic_utils.TIME_OUT_REMOTE_CODE = 0
def test_processor_from_model_shortcut(self):
    # A Hub model id must resolve directly to the matching processor class.
    checkpoint = "facebook/wav2vec2-base-960h"
    loaded = AutoProcessor.from_pretrained(checkpoint)
    self.assertIsInstance(loaded, Wav2Vec2Processor)
def test_processor_from_local_directory_from_repo(self):
    # A processor saved next to a model config in a local folder must reload through AutoProcessor.
    with tempfile.TemporaryDirectory() as tmpdirname:
        wav2vec2_config = Wav2Vec2Config()
        hub_processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

        # save in new folder
        wav2vec2_config.save_pretrained(tmpdirname)
        hub_processor.save_pretrained(tmpdirname)

        local_processor = AutoProcessor.from_pretrained(tmpdirname)
        self.assertIsInstance(local_processor, Wav2Vec2Processor)
def test_processor_from_local_subfolder_from_repo(self):
    # A processor saved into a subfolder must reload when that subfolder is passed via `subfolder=`.
    with tempfile.TemporaryDirectory() as tmpdirname:
        hub_processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
        hub_processor.save_pretrained(f"{tmpdirname}/processor_subfolder")

        reloaded = Wav2Vec2Processor.from_pretrained(tmpdirname, subfolder="processor_subfolder")
        self.assertIsInstance(reloaded, Wav2Vec2Processor)
def test_processor_from_local_directory_from_extractor_config(self):
    # A folder holding only a feature-extractor config, a vocab and a model config must
    # still resolve to the right processor class.
    with tempfile.TemporaryDirectory() as tmpdirname:
        # copy relevant files
        fixtures = (
            (SAMPLE_PROCESSOR_CONFIG, FEATURE_EXTRACTOR_NAME),
            (SAMPLE_VOCAB, "vocab.json"),
            (SAMPLE_CONFIG, "config.json"),
        )
        for source_path, target_name in fixtures:
            copyfile(source_path, os.path.join(tmpdirname, target_name))

        loaded = AutoProcessor.from_pretrained(tmpdirname)
        self.assertIsInstance(loaded, Wav2Vec2Processor)
def test_subcomponent_get_config_dict_saved_as_nested_config(self):
    """
    Checks that the config dict of each processor subcomponent can be retrieved,
    even when it was saved as a nested dict in `processor_config.json`.

    For every subcomponent, the module-level `get_*_config` helper and the class-level
    `get_*_dict` method must both yield a usable config, and the two must be equal.
    """
    # Test feature extractor first
    with tempfile.TemporaryDirectory() as tmpdirname:
        AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h").save_pretrained(tmpdirname)

        helper_config = get_feature_extractor_config(tmpdirname)
        from_helper = Wav2Vec2FeatureExtractor(**helper_config)
        self.assertIsInstance(from_helper, Wav2Vec2FeatureExtractor)

        mixin_config, _ = FeatureExtractionMixin.get_feature_extractor_dict(tmpdirname)
        from_mixin = Wav2Vec2FeatureExtractor(**mixin_config)
        self.assertIsInstance(from_mixin, Wav2Vec2FeatureExtractor)

        self.assertEqual(helper_config, mixin_config)

    # Test image and video processors next
    with tempfile.TemporaryDirectory() as tmpdirname:
        AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf").save_pretrained(tmpdirname)

        # Image processor
        helper_config = get_image_processor_config(tmpdirname)
        from_helper = SiglipImageProcessor(**helper_config)
        self.assertIsInstance(from_helper, SiglipImageProcessor)

        mixin_config, _ = ImageProcessingMixin.get_image_processor_dict(tmpdirname)
        from_mixin = SiglipImageProcessor(**mixin_config)
        self.assertIsInstance(from_mixin, SiglipImageProcessor)

        self.assertEqual(helper_config, mixin_config)

        # Video processor
        helper_config = get_video_processor_config(tmpdirname)
        from_helper = LlavaOnevisionVideoProcessor(**helper_config)
        self.assertIsInstance(from_helper, LlavaOnevisionVideoProcessor)

        mixin_config, _ = BaseVideoProcessor.get_video_processor_dict(tmpdirname)
        from_mixin = LlavaOnevisionVideoProcessor(**mixin_config)
        self.assertIsInstance(from_mixin, LlavaOnevisionVideoProcessor)

        self.assertEqual(helper_config, mixin_config)
def test_processor_from_processor_class(self):
    # With `processor_class` removed from the tokenizer config, the processor config file
    # must be enough for AutoProcessor to pick the right class.
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor_config_path = os.path.join(tmpdirname, PROCESSOR_NAME)
        tokenizer_config_path = os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE)

        extractor = Wav2Vec2FeatureExtractor()
        hub_tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
        built = Wav2Vec2Processor(extractor, hub_tokenizer)

        # save in new folder
        built.save_pretrained(tmpdirname)

        if not os.path.isfile(processor_config_path):
            # create one manually in order to perform this test's objective
            with open(processor_config_path, "w") as fp:
                json.dump({"processor_class": "Wav2Vec2Processor"}, fp)

        # drop `processor_class` in tokenizer config
        with open(tokenizer_config_path) as f:
            tokenizer_config = json.load(f)
        del tokenizer_config["processor_class"]
        with open(tokenizer_config_path, "w") as f:
            json.dump(tokenizer_config, f)

        reloaded = AutoProcessor.from_pretrained(tmpdirname)
        self.assertIsInstance(reloaded, Wav2Vec2Processor)
def test_processor_from_tokenizer_processor_class(self):
    """Load through `AutoProcessor` after `processor_class` is removed from the processor config."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor_config_path = os.path.join(tmpdirname, PROCESSOR_NAME)

        extractor = Wav2Vec2FeatureExtractor()
        wav2vec2_tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
        saved_processor = Wav2Vec2Processor(extractor, wav2vec2_tokenizer)

        # Save in a new folder.
        saved_processor.save_pretrained(tmpdirname)

        # Drop `processor_class` from the processor config and write the result back.
        with open(processor_config_path) as handle:
            processor_config = json.load(handle)
        processor_config.pop("processor_class")
        with open(processor_config_path, "w") as handle:
            handle.write(json.dumps(processor_config))

        reloaded = AutoProcessor.from_pretrained(tmpdirname)

    self.assertIsInstance(reloaded, Wav2Vec2Processor)
def test_processor_from_local_directory_from_model_config(self):
    """Load through `AutoProcessor` from a folder whose model config sets `processor_class`."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        Wav2Vec2Config(processor_class="Wav2Vec2Processor").save_pretrained(tmpdirname)

        # Copy the relevant files.
        vocab_destination = os.path.join(tmpdirname, "vocab.json")
        copyfile(SAMPLE_VOCAB, vocab_destination)

        # Create an empty sample feature extractor config.
        feature_extractor_path = os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME)
        with open(feature_extractor_path, "w") as handle:
            handle.write("{}")

        loaded = AutoProcessor.from_pretrained(tmpdirname)

    self.assertIsInstance(loaded, Wav2Vec2Processor)
def test_from_pretrained_dynamic_processor(self):
    """Load `hf-internal-testing/test_dynamic_processor_updated` under each `trust_remote_code` setting."""
    updated_repo = "hf-internal-testing/test_dynamic_processor_updated"

    # If remote code is not set, we will time out when asking whether to load the model.
    with self.assertRaises(ValueError):
        AutoProcessor.from_pretrained(updated_repo)
    # If remote code is disabled, we can't load this config.
    with self.assertRaises(ValueError):
        AutoProcessor.from_pretrained(updated_repo, trust_remote_code=False)

    processor = AutoProcessor.from_pretrained(updated_repo, trust_remote_code=True)
    self.assertTrue(processor.special_attribute_present)
    self.assertEqual(processor.__class__.__name__, "NewProcessor")

    # Both sub-components must carry the marker attribute and the expected class name.
    for attribute_name, expected_class_name in (
        ("feature_extractor", "NewFeatureExtractor"),
        ("tokenizer", "NewTokenizerFast"),
    ):
        component = getattr(processor, attribute_name)
        self.assertTrue(component.special_attribute_present)
        self.assertEqual(component.__class__.__name__, expected_class_name)

    # Second repository, loaded with `use_fast=False`.
    other_processor = AutoProcessor.from_pretrained(
        "hf-internal-testing/test_dynamic_processor", trust_remote_code=True, use_fast=False
    )
    other_tokenizer = other_processor.tokenizer
    self.assertTrue(other_tokenizer.special_attribute_present)
    self.assertEqual(other_tokenizer.__class__.__name__, "NewTokenizerFast")
def test_new_processor_registration(self):
    """Register custom classes with the auto-API, then save and reload a `CustomProcessor`."""
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        AutoProcessor.register(CustomConfig, CustomProcessor)

        # Trying to register something existing in the Transformers library will raise an error
        with self.assertRaises(ValueError):
            AutoProcessor.register(Wav2Vec2Config, Wav2Vec2Processor)

        # Now that the config is registered, it can be used as any other config with the auto-API
        custom_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

        with tempfile.TemporaryDirectory() as vocab_dir:
            vocab_path = os.path.join(vocab_dir, "vocab.txt")
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                vocab_handle.write("".join(f"{token}\n" for token in self.vocab_tokens))
            custom_tokenizer = CustomTokenizer(vocab_path)

        custom_processor = CustomProcessor(custom_extractor, custom_tokenizer)

        with tempfile.TemporaryDirectory() as save_dir:
            custom_processor.save_pretrained(save_dir)
            reloaded = AutoProcessor.from_pretrained(save_dir)
            self.assertIsInstance(reloaded, CustomProcessor)
    finally:
        # Undo every registration made above so other tests see clean mappings.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        for mapping in (
            FEATURE_EXTRACTOR_MAPPING,
            TOKENIZER_MAPPING,
            PROCESSOR_MAPPING,
            MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
        ):
            if CustomConfig in mapping._extra_content:
                del mapping._extra_content[CustomConfig]
        REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None)
def test_from_pretrained_dynamic_processor_conflict(self):
    """Check which classes are loaded when local registrations and Hub code share the same names."""

    class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
        special_attribute_present = False

    class NewTokenizer(BertTokenizer):
        special_attribute_present = False

    class NewProcessor(ProcessorMixin):
        special_attribute_present = False

        def __init__(self, feature_extractor, tokenizer):
            super().__init__(feature_extractor, tokenizer)

    updated_repo = "hf-internal-testing/test_dynamic_processor_updated"

    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
        AutoProcessor.register(CustomConfig, NewProcessor)

        # The local classes (marker attribute False) must win in all three situations:
        #   1. remote code is not set: the default is to use local classes;
        #   2. remote code is disabled: we load the local ones;
        #   3. remote code is enabled but the user explicitly registered the local
        #      one: we load the local one.
        for load_kwargs in ({}, {"trust_remote_code": False}, {"trust_remote_code": True}):
            loaded = AutoProcessor.from_pretrained(updated_repo, **load_kwargs)
            self.assertEqual(loaded.__class__.__name__, "NewProcessor")
            self.assertFalse(loaded.special_attribute_present)
            self.assertFalse(loaded.feature_extractor.special_attribute_present)
            self.assertFalse(loaded.tokenizer.special_attribute_present)

        # If remote code is enabled but local code originated from transformers, we load the remote one.
        NewFeatureExtractor.__module__ = "transformers.models.custom.feature_extraction_custom"
        NewTokenizer.__module__ = "transformers.models.custom.tokenization_custom"
        NewProcessor.__module__ = "transformers.models.custom.configuration_custom"
        loaded = AutoProcessor.from_pretrained(updated_repo, trust_remote_code=True)
        self.assertEqual(loaded.__class__.__name__, "NewProcessor")
        self.assertTrue(loaded.special_attribute_present)
        self.assertTrue(loaded.feature_extractor.special_attribute_present)
        self.assertTrue(loaded.tokenizer.special_attribute_present)
    finally:
        # Undo every registration made above so other tests see clean mappings.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        for mapping in (
            FEATURE_EXTRACTOR_MAPPING,
            TOKENIZER_MAPPING,
            PROCESSOR_MAPPING,
            MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
        ):
            if CustomConfig in mapping._extra_content:
                del mapping._extra_content[CustomConfig]
        REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
    """Pass an extra processor keyword through `AutoProcessor.from_pretrained` to a local class."""

    class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
        pass

    class NewTokenizer(BertTokenizer):
        pass

    class NewProcessor(ProcessorMixin):
        def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
            super().__init__(feature_extractor, tokenizer)

            self.processor_attr_1 = processor_attr_1
            self.processor_attr_2 = processor_attr_2

    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
        AutoProcessor.register(CustomConfig, NewProcessor)

        # If remote code is not set, the default is to use local classes.
        loaded = AutoProcessor.from_pretrained(
            "hf-internal-testing/test_dynamic_processor_updated", processor_attr_2=False
        )
        self.assertEqual(loaded.__class__.__name__, "NewProcessor")
        # `processor_attr_1` keeps its default; `processor_attr_2` takes the value passed above.
        self.assertEqual(loaded.processor_attr_1, 1)
        self.assertEqual(loaded.processor_attr_2, False)
    finally:
        # Undo every registration made above so other tests see clean mappings.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        for mapping in (
            FEATURE_EXTRACTOR_MAPPING,
            TOKENIZER_MAPPING,
            PROCESSOR_MAPPING,
            MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
        ):
            if CustomConfig in mapping._extra_content:
                del mapping._extra_content[CustomConfig]
        REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_dynamic_processor_with_specific_dynamic_subcomponents(self):
    """Load the Hub repo with locally registered processor, tokenizer and feature extractor classes."""

    class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
        pass

    class NewTokenizer(BertTokenizer):
        pass

    class NewProcessor(ProcessorMixin):
        def __init__(self, feature_extractor, tokenizer):
            super().__init__(feature_extractor, tokenizer)

    try:
        AutoConfig.register("custom", CustomConfig)
        AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
        AutoProcessor.register(CustomConfig, NewProcessor)

        # If remote code is not set, the default is to use local classes.
        loaded = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor_updated")
        self.assertEqual(loaded.__class__.__name__, "NewProcessor")
    finally:
        # Undo every registration made above so other tests see clean mappings.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        for mapping in (
            FEATURE_EXTRACTOR_MAPPING,
            TOKENIZER_MAPPING,
            PROCESSOR_MAPPING,
            MODEL_FOR_AUDIO_TOKENIZATION_MAPPING,
        ):
            if CustomConfig in mapping._extra_content:
                del mapping._extra_content[CustomConfig]
        REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_auto_processor_creates_tokenizer(self):
    """`AutoProcessor` on `tiny-random-bert` must return an object whose class is named `BertTokenizer`."""
    loaded = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
    loaded_class_name = loaded.__class__.__name__
    self.assertEqual(loaded_class_name, "BertTokenizer")
def test_auto_processor_creates_image_processor(self):
    """`AutoProcessor` on `tiny-random-convnext` must return a `ConvNextImageProcessor`-named object."""
    loaded = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-convnext")
    loaded_class_name = loaded.__class__.__name__
    self.assertEqual(loaded_class_name, "ConvNextImageProcessor")
def test_auto_processor_save_load(self):
    """Saving and reloading through `AutoProcessor` must yield the same processor class name."""
    original = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
    with tempfile.TemporaryDirectory() as save_dir:
        original.save_pretrained(save_dir)
        reloaded = AutoProcessor.from_pretrained(save_dir)
        self.assertEqual(reloaded.__class__.__name__, original.__class__.__name__)
def test_processor_with_multiple_tokenizers_save_load(self):
    """Test that processors with multiple tokenizers save and load correctly."""

    class DualTokenizerProcessor(ProcessorMixin):
        """A processor with two tokenizers and an image processor."""

        def __init__(self, tokenizer, decoder_tokenizer, image_processor):
            super().__init__(tokenizer, decoder_tokenizer, image_processor)

    # Build the three components, then the processor that bundles them.
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
    decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    image_processor = SiglipImageProcessor()
    processor = DualTokenizerProcessor(
        tokenizer=tokenizer,
        decoder_tokenizer=decoder_tokenizer,
        image_processor=image_processor,
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        processor.save_pretrained(tmp_dir)

        # Directory layout: primary tokenizer in the root, the additional one in a subfolder.
        decoder_dir = os.path.join(tmp_dir, "decoder_tokenizer")
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, "tokenizer_config.json")))
        self.assertTrue(os.path.isdir(decoder_dir))
        self.assertTrue(os.path.exists(os.path.join(decoder_dir, "tokenizer_config.json")))

        # processor_config.json must contain the image processor but neither tokenizer.
        with open(os.path.join(tmp_dir, "processor_config.json")) as config_file:
            saved_config = json.load(config_file)
        self.assertIn("image_processor", saved_config)
        for absent_key in ("tokenizer", "decoder_tokenizer"):
            self.assertNotIn(absent_key, saved_config)

        # Reload the full processor and check that every attribute is present.
        reloaded = DualTokenizerProcessor.from_pretrained(tmp_dir)
        for attribute_name in ("tokenizer", "decoder_tokenizer", "image_processor"):
            self.assertTrue(hasattr(reloaded, attribute_name))

        # Tokenizers loaded correctly.
        self.assertEqual(reloaded.tokenizer.vocab_size, tokenizer.vocab_size)
        self.assertEqual(reloaded.decoder_tokenizer.vocab_size, decoder_tokenizer.vocab_size)

        # Image processor loaded correctly.
        self.assertEqual(reloaded.image_processor.size, image_processor.size)
def test_processor_with_multiple_image_processors_save_load(self):
    """Test that processors with multiple image processors save and load correctly."""

    class DualImageProcessorProcessor(ProcessorMixin):
        """A processor with two image processors and a tokenizer."""

        def __init__(self, tokenizer, image_processor, encoder_image_processor):
            super().__init__(tokenizer, image_processor, encoder_image_processor)

    # Build the three components, then the processor that bundles them.
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
    image_processor = SiglipImageProcessor(size={"height": 224, "width": 224})
    encoder_image_processor = CLIPImageProcessor(size={"height": 384, "width": 384})
    processor = DualImageProcessorProcessor(
        tokenizer=tokenizer,
        image_processor=image_processor,
        encoder_image_processor=encoder_image_processor,
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        processor.save_pretrained(tmp_dir)

        # processor_config.json must contain both image processors but not the tokenizer.
        with open(os.path.join(tmp_dir, "processor_config.json")) as config_file:
            saved_config = json.load(config_file)
        self.assertIn("image_processor", saved_config)
        self.assertIn("encoder_image_processor", saved_config)
        self.assertNotIn("tokenizer", saved_config)

        main_entry = saved_config["image_processor"]
        encoder_entry = saved_config["encoder_image_processor"]

        # Both image processors carry the type key needed for instantiation.
        self.assertIn("image_processor_type", main_entry)
        self.assertIn("image_processor_type", encoder_entry)
        self.assertEqual(main_entry["image_processor_type"], "SiglipImageProcessor")
        self.assertEqual(encoder_entry["image_processor_type"], "CLIPImageProcessor")

        # The sizes differ, which shows the two configs are stored separately.
        self.assertEqual(main_entry["size"], {"height": 224, "width": 224})
        self.assertEqual(encoder_entry["size"], {"height": 384, "width": 384})

        # Reload the full processor and check that every attribute is present.
        reloaded = DualImageProcessorProcessor.from_pretrained(tmp_dir)
        for attribute_name in ("tokenizer", "image_processor", "encoder_image_processor"):
            self.assertTrue(hasattr(reloaded, attribute_name))

        # Tokenizer loaded correctly.
        self.assertEqual(reloaded.tokenizer.vocab_size, tokenizer.vocab_size)

        # Image processors loaded correctly with their distinct sizes.
        self.assertEqual(reloaded.image_processor.size, {"height": 224, "width": 224})
        self.assertEqual(reloaded.encoder_image_processor.size, {"height": 384, "width": 384})

        # They are different types.
        self.assertIsInstance(reloaded.image_processor, SiglipImageProcessor)
        self.assertIsInstance(reloaded.encoder_image_processor, CLIPImageProcessor)
@is_staging_test
class ProcessorPushToHubTester(unittest.TestCase):
    """Tests that push processors to temporary Hub repositories and load them back."""

    # Tokens written, one per line, into the vocab file of the custom tokenizer.
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]

    @classmethod
    def setUpClass(cls):
        """Store the Hub token used by every test in this class."""
        cls._token = TOKEN

    def _check_push_via_save_pretrained(self, **repo_kwargs):
        """Push the sample processor with `save_pretrained(push_to_hub=True)` and compare the reload.

        `repo_kwargs` are forwarded to `TemporaryHubRepo` in addition to the token.
        """
        with TemporaryHubRepo(token=self._token, **repo_kwargs) as tmp_repo:
            original = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

            # Push to hub via save_pretrained
            with tempfile.TemporaryDirectory() as tmp_dir:
                original.save_pretrained(
                    tmp_dir,
                    repo_id=tmp_repo.repo_id,
                    push_to_hub=True,
                    token=self._token,
                )

            reloaded = Wav2Vec2Processor.from_pretrained(tmp_repo.repo_id)
            # Every instance attribute of the feature extractor must survive the round trip.
            for attr_name, attr_value in original.feature_extractor.__dict__.items():
                self.assertEqual(attr_value, getattr(reloaded.feature_extractor, attr_name))
            self.assertDictEqual(reloaded.tokenizer.get_vocab(), original.tokenizer.get_vocab())

    def test_push_to_hub_via_save_pretrained(self):
        """Round-trip the sample processor through a temporary repository."""
        self._check_push_via_save_pretrained()

    def test_push_to_hub_in_organization_via_save_pretrained(self):
        """Round-trip the sample processor through a temporary repository in the `valid_org` namespace."""
        self._check_push_via_save_pretrained(namespace="valid_org")

    def test_push_to_hub_dynamic_processor(self):
        """Save a processor built from custom classes, upload it, and reload it with remote code."""
        with TemporaryHubRepo(token=self._token) as tmp_repo:
            for custom_class in (CustomFeatureExtractor, CustomTokenizer, CustomProcessor):
                custom_class.register_for_auto_class()

            custom_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

            with tempfile.TemporaryDirectory() as vocab_dir:
                vocab_path = os.path.join(vocab_dir, "vocab.txt")
                with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                    vocab_handle.write("".join(f"{token}\n" for token in self.vocab_tokens))
                custom_tokenizer = CustomTokenizer(vocab_path)

            processor = CustomProcessor(custom_extractor, custom_tokenizer)

            with tempfile.TemporaryDirectory() as tmp_dir:
                snapshot_download(tmp_repo.repo_id, token=self._token)
                processor.save_pretrained(tmp_dir)

                # This has added the proper auto_map field to the feature extractor config
                expected_extractor_map = {
                    "AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor",
                    "AutoProcessor": "custom_processing.CustomProcessor",
                }
                self.assertDictEqual(processor.feature_extractor.auto_map, expected_extractor_map)

                # This has added the proper auto_map field to the tokenizer config
                with open(os.path.join(tmp_dir, "tokenizer_config.json")) as config_file:
                    tokenizer_config = json.load(config_file)
                expected_tokenizer_map = {
                    "AutoTokenizer": ["custom_tokenization.CustomTokenizer", None],
                    "AutoProcessor": "custom_processing.CustomProcessor",
                }
                self.assertDictEqual(tokenizer_config["auto_map"], expected_tokenizer_map)

                # The code has been copied from fixtures
                for module_file in (
                    "custom_feature_extraction.py",
                    "custom_tokenization.py",
                    "custom_processing.py",
                ):
                    self.assertTrue(os.path.isfile(os.path.join(tmp_dir, module_file)))

                upload_folder(repo_id=tmp_repo.repo_id, folder_path=tmp_dir, token=self._token)

            reloaded = AutoProcessor.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
            # Can't make an isinstance check because the new_processor is from the CustomProcessor class of a dynamic module
            self.assertEqual(reloaded.__class__.__name__, "CustomProcessor")

    def test_push_to_hub_with_chat_templates(self):
        """Push a processor with a string and then a dict chat template; compare after reload."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            llama_tokenizer = LlamaTokenizer.from_pretrained(SAMPLE_VOCAB_LLAMA)
            siglip_image_processor = SiglipImageProcessor()
            chat_template = "default dummy template for testing purposes only"
            processor = LlavaProcessor(
                tokenizer=llama_tokenizer,
                image_processor=siglip_image_processor,
                chat_template=chat_template,
            )
            self.assertEqual(processor.chat_template, chat_template)

            # First push: single string template.
            with TemporaryHubRepo(token=self._token) as tmp_repo:
                processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, token=self._token, push_to_hub=True)
                reloaded = LlavaProcessor.from_pretrained(tmp_repo.repo_id)
                self.assertEqual(processor.chat_template, reloaded.chat_template)
                # When we save as single files, tokenizers and processors share a chat template, which means
                # the reloaded tokenizer should get the chat template as well
                self.assertEqual(reloaded.chat_template, reloaded.tokenizer.chat_template)

            # Second push: dict of named templates.
            with TemporaryHubRepo(token=self._token) as tmp_repo:
                processor.chat_template = {"default": "a", "secondary": "b"}
                processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, token=self._token, push_to_hub=True)
                reloaded = LlavaProcessor.from_pretrained(tmp_repo.repo_id)
                self.assertEqual(processor.chat_template, reloaded.chat_template)
                # When we save as single files, tokenizers and processors share a chat template, which means
                # the reloaded tokenizer should get the chat template as well
                self.assertEqual(reloaded.chat_template, reloaded.tokenizer.chat_template)

View File

@@ -0,0 +1,810 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import sys
import tempfile
import unittest
from pathlib import Path
from unittest import mock
import pytest
import transformers
from transformers import (
AutoTokenizer,
BertConfig,
BertTokenizer,
BertTokenizerFast,
CTRLTokenizer,
GPT2Tokenizer,
HerbertTokenizer,
PreTrainedTokenizerFast,
PythonBackend,
Qwen2Tokenizer,
Qwen2TokenizerFast,
Qwen3MoeConfig,
RobertaTokenizer,
TokenizersBackend,
is_tokenizers_available,
logging,
)
from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
from transformers.models.auto.tokenization_auto import (
REGISTERED_FAST_ALIASES,
REGISTERED_TOKENIZER_CLASSES,
TOKENIZER_MAPPING,
TOKENIZER_MAPPING_NAMES,
get_tokenizer_config,
tokenizer_class_from_name,
)
from transformers.models.roberta.configuration_roberta import RobertaConfig
from transformers.testing_utils import (
DUMMY_DIFF_TOKENIZER_IDENTIFIER,
DUMMY_UNKNOWN_IDENTIFIER,
SMALL_MODEL_IDENTIFIER,
CaptureLogger,
RequestCounter,
require_sentencepiece,
require_tokenizers,
slow,
)
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_tokenization import CustomTokenizer # noqa E402
if is_tokenizers_available():
from test_module.custom_tokenization_fast import CustomTokenizerFast
class AutoTokenizerTest(unittest.TestCase):
def setUp(self):
    """Set `TIME_OUT_REMOTE_CODE` to 0 before each test."""
    dynamic_utils = transformers.dynamic_module_utils
    dynamic_utils.TIME_OUT_REMOTE_CODE = 0
@slow
def test_tokenizer_from_pretrained(self):
    """Each listed checkpoint must load as a non-empty tokenizer of the expected class."""
    checkpoints_by_class = (
        (BertTokenizer, ("google-bert/bert-base-uncased", "google-bert/bert-base-cased")),
        (GPT2Tokenizer, ("openai-community/gpt2", "openai-community/gpt2-medium")),
    )
    for expected_class, checkpoint_names in checkpoints_by_class:
        for checkpoint in checkpoint_names:
            loaded = AutoTokenizer.from_pretrained(checkpoint)
            self.assertIsNotNone(loaded)
            self.assertIsInstance(loaded, expected_class)
            self.assertGreater(len(loaded), 0)
def test_tokenizer_from_pretrained_identifier(self):
    """`SMALL_MODEL_IDENTIFIER` must load as a `BertTokenizer` with a vocabulary of 12."""
    loaded = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
    self.assertIsInstance(loaded, BertTokenizer)
    self.assertEqual(loaded.vocab_size, 12)
def test_tokenizer_from_model_type(self):
    """`DUMMY_UNKNOWN_IDENTIFIER` must load as a `RobertaTokenizer` with a vocabulary of 20."""
    loaded = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
    self.assertIsInstance(loaded, RobertaTokenizer)
    self.assertEqual(loaded.vocab_size, 20)
def test_tokenizer_from_tokenizer_class(self):
    """A repo with a Roberta config must still load a `BertTokenizer` when passed that config."""
    roberta_config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
    self.assertIsInstance(roberta_config, RobertaConfig)

    # Check that tokenizer_type ≠ model_type
    loaded = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=roberta_config)
    self.assertIsInstance(loaded, BertTokenizer)
    self.assertEqual(loaded.vocab_size, 12)
def test_tokenizer_from_type(self):
    """Load slow tokenizers from bare vocabulary files by passing `tokenizer_type` explicitly.

    The fixture files are located relative to this test file rather than the
    current working directory, so the test no longer depends on where the test
    runner is started from.
    """
    # Four levels above this file is the directory that this module already treats
    # as the repository root when it extends `sys.path` with `<root>/utils`.
    fixtures_dir = Path(__file__).resolve().parents[3] / "tests" / "fixtures"

    with tempfile.TemporaryDirectory() as tmp_dir:
        shutil.copy(fixtures_dir / "vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

        tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert", use_fast=False)
        self.assertIsInstance(tokenizer, BertTokenizer)

    with tempfile.TemporaryDirectory() as tmp_dir:
        shutil.copy(fixtures_dir / "vocab.json", os.path.join(tmp_dir, "vocab.json"))
        shutil.copy(fixtures_dir / "merges.txt", os.path.join(tmp_dir, "merges.txt"))

        tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2", use_fast=False)
        self.assertIsInstance(tokenizer, GPT2Tokenizer)
@require_tokenizers
def test_tokenizer_from_type_fast(self):
    """Load tokenizers from bare vocabulary files by `tokenizer_type`, without `use_fast=False`.

    The fixture files are located relative to this test file rather than the
    current working directory, so the test no longer depends on where the test
    runner is started from.
    """
    # Four levels above this file is the directory that this module already treats
    # as the repository root when it extends `sys.path` with `<root>/utils`.
    fixtures_dir = Path(__file__).resolve().parents[3] / "tests" / "fixtures"

    with tempfile.TemporaryDirectory() as tmp_dir:
        shutil.copy(fixtures_dir / "vocab.txt", os.path.join(tmp_dir, "vocab.txt"))

        tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert")
        self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)

    with tempfile.TemporaryDirectory() as tmp_dir:
        shutil.copy(fixtures_dir / "vocab.json", os.path.join(tmp_dir, "vocab.json"))
        shutil.copy(fixtures_dir / "merges.txt", os.path.join(tmp_dir, "merges.txt"))

        tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2")
        self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
def test_tokenizer_from_type_incorrect_name(self):
    """Passing the `tokenizer_type` value "xxx" must raise `ValueError`."""
    unknown_type = "xxx"
    with pytest.raises(ValueError):
        AutoTokenizer.from_pretrained("./", tokenizer_type=unknown_type)
@require_tokenizers
def test_tokenizer_identifier_with_correct_config(self):
    """Both loaders must return a cased `BertTokenizer` with a max length of 512 for the Dutch checkpoint."""
    for loader in (BertTokenizer, AutoTokenizer):
        loaded = loader.from_pretrained("wietsedv/bert-base-dutch-cased")
        self.assertIsInstance(loaded, BertTokenizer)
        self.assertEqual(loaded.do_lower_case, False)
        self.assertEqual(loaded.model_max_length, 512)
@require_tokenizers
def test_tokenizer_identifier_non_existent(self):
    """Both loaders must raise `EnvironmentError` with the expected message for a missing repo."""
    missing_repo = "julien-c/herlolip-not-exists"
    expected_message = (
        "julien-c/herlolip-not-exists is not a local folder and is not a valid model identifier"
    )
    for loader in (BertTokenizer, AutoTokenizer):
        with self.assertRaisesRegex(EnvironmentError, expected_message):
            _ = loader.from_pretrained(missing_repo)
def test_model_name_edge_cases_in_mappings(self):
    """Look up every tokenizer class in `TOKENIZER_MAPPING` by its class name."""
    # tests: https://github.com/huggingface/transformers/pull/13251
    # 1. models with `-`, e.g. xlm-roberta -> xlm_roberta
    # 2. models that don't remap 1-1 from model-name to model file, e.g., openai-gpt -> openai
    # A mapping value is either a single class or a tuple of classes; `None` members are skipped.
    class_names = [
        candidate.__name__
        for entry in TOKENIZER_MAPPING.values()
        for candidate in (entry if isinstance(entry, tuple) else (entry,))
        if candidate is not None
    ]

    for class_name in class_names:
        # must find the right class
        # NOTE(review): the returned value is not asserted on, so this only fails
        # if the lookup raises - confirm whether that is the intent.
        tokenizer_class_from_name(class_name)
def test_tokenizer_mapping_names_use_single_entries(self):
    """No value in `TOKENIZER_MAPPING_NAMES` may be a tuple or a list."""
    # this is just to ensure tokenizer mapping names are correct and map to strings!
    invalid_entries = []
    for model_name, tokenizer_entry in TOKENIZER_MAPPING_NAMES.items():
        if isinstance(tokenizer_entry, (tuple, list)):
            invalid_entries.append(model_name)

    failure_message = (
        "TOKENIZER_MAPPING_NAMES should map model types to single tokenizer class names. "
        f"Found invalid mappings for: {invalid_entries}"
    )
    self.assertListEqual(invalid_entries, [], msg=failure_message)
@require_tokenizers
def test_from_pretrained_use_fast_toggle(self):
    """Check the class returned for `bert-base-cased` with `use_fast=False` and with the default."""
    checkpoint = "google-bert/bert-base-cased"

    with_use_fast_false = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
    self.assertIsInstance(with_use_fast_false, BertTokenizer)

    with_default = AutoTokenizer.from_pretrained(checkpoint)
    self.assertIsInstance(with_default, BertTokenizerFast)
@require_tokenizers
@slow
def test_custom_tokenizer_from_hub(self):
    """A tokenizer loaded with remote code must come from a `transformers_modules.` module."""
    loaded = AutoTokenizer.from_pretrained(
        "openbmb/MiniCPM-Llama3-V-2_5",
        trust_remote_code=True,
        revision="fd7f352fac0e06d0d818b23f98e3ec8c64267a57",
    )
    defining_module = loaded.__class__.__module__
    self.assertTrue(defining_module.startswith("transformers_modules."))
@require_tokenizers
@slow
def test_remote_code_imports_removed_fast_submodule(self):
    """The pinned remote-code tokenizer must load and encode a short string to a non-empty id list."""
    # BC v5: remote tokenizer code may import from a deprecated tokenization_*_fast
    loaded = AutoTokenizer.from_pretrained(
        "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        trust_remote_code=True,
        revision="a9af15a6372d7d6b25e9fb07c2ccb9e1fe645644",
    )
    encoded_ids = loaded("hello world")["input_ids"]
    self.assertGreater(len(encoded_ids), 0)
@require_tokenizers
def test_voxtral_tokenizer_converts_from_tekken(self):
    """Load the Voxtral tokenizer with the `voxtral` mapping entry patched to `TokenizersBackend`."""
    # Test that voxtral tokenizer loads correctly when falling back to TokenizersBackend
    # (i.e., when MistralCommonBackend is not available)
    voxtral_repo = "mistralai/Voxtral-Mini-3B-2507"

    # Simulate the fallback path by temporarily changing the mapping for voxtral
    # from MistralCommonBackend to TokenizersBackend
    fallback_mapping = {"voxtral": "TokenizersBackend"}
    with mock.patch.dict(TOKENIZER_MAPPING_NAMES, fallback_mapping):
        loaded = AutoTokenizer.from_pretrained(voxtral_repo)

    self.assertIsInstance(loaded, PreTrainedTokenizerFast)
    self.assertTrue(loaded.is_fast)
    encoded_ids = loaded("Voxtral")["input_ids"]
    self.assertGreater(len(encoded_ids), 0)
@require_tokenizers
def test_do_lower_case(self):
    """With `do_lower_case=False`, the first token of the sample must be `[UNK]` for both checkpoints."""
    sample = "Hello, world. How are you?"
    for checkpoint in ("distilbert/distilbert-base-uncased", "microsoft/mpnet-base"):
        loaded = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=False)
        first_token = loaded.tokenize(sample)[0]
        self.assertEqual("[UNK]", first_token)
@require_tokenizers
def test_PreTrainedTokenizerFast_from_pretrained(self):
    """The dummy fast tokenizer must load as exactly `PreTrainedTokenizerFast` with the expected settings."""
    loaded = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config")
    # Exact type match, not just an instance check.
    self.assertEqual(type(loaded), PreTrainedTokenizerFast)

    expected_settings = (
        ("model_max_length", 512),
        ("vocab_size", 30000),
        ("unk_token", "[UNK]"),
        ("padding_side", "right"),
        ("truncation_side", "right"),
    )
    for attribute_name, expected_value in expected_settings:
        self.assertEqual(getattr(loaded, attribute_name), expected_value)
def test_auto_tokenizer_from_local_folder(self):
    """A tokenizer saved to a local folder must reload with the same class and a vocabulary of 12."""
    original = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
    self.assertIsInstance(original, BertTokenizer)

    with tempfile.TemporaryDirectory() as save_dir:
        original.save_pretrained(save_dir)
        reloaded = AutoTokenizer.from_pretrained(save_dir)

    self.assertIsInstance(reloaded, original.__class__)
    self.assertEqual(reloaded.vocab_size, 12)
def test_auto_tokenizer_from_local_folder_mistral_detection(self):
    """See #42374 and #45444 for reference, ensuring proper mistral detection on local tokenizers

    A Qwen tokenizer is saved to a temporary folder and reloaded several times while
    the neighbouring `config.json` is rewritten with different `model_type` and
    `transformers_version` values. Each reload checks whether the mistral regex
    warning shows up in the captured log output.
    """
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-235B-A22B-Thinking-2507")
    config = Qwen3MoeConfig.from_pretrained("Qwen/Qwen3-235B-A22B-Thinking-2507")
    self.assertIsInstance(tokenizer, (Qwen2Tokenizer, Qwen2TokenizerFast))

    # Fragment of the warning text that is searched for in the captured logs.
    mistral_warning = (
        "with an incorrect regex pattern: "
        "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84"
        "#69121093e8b480e709447d5e"
    )
    logger = logging.get_logger("transformers.tokenization_utils_tokenizers")

    with tempfile.TemporaryDirectory() as tmp_dir:
        tokenizer.save_pretrained(tmp_dir)
        config_path = os.path.join(tmp_dir, "config.json")

        def _write_config(**overrides):
            """Write the Qwen config to `config_path`; a `None` override removes that key."""
            config_dict = config.to_diff_dict()
            for key, value in overrides.items():
                if value is None:
                    config_dict.pop(key, None)
                else:
                    config_dict[key] = value
            with open(config_path, "w", encoding="utf-8") as f:
                json.dump(config_dict, f, indent=2, sort_keys=True)

        # Case 1: Tokenizer with no config associated must not warn
        with CaptureLogger(logger) as cl:
            AutoTokenizer.from_pretrained(tmp_dir)
        self.assertNotIn(mistral_warning, cl.out)

        # Case 2: Non-mistral local config must not warn for any `transformers_version`
        for saved_version in ("4.57.2", "4.57.3", "4.57.6", "5.0.1"):
            _write_config(transformers_version=saved_version)
            with CaptureLogger(logger) as cl:
                tokenizer2 = AutoTokenizer.from_pretrained(tmp_dir)
            self.assertNotIn(
                mistral_warning,
                cl.out,
                msg=f"Unexpected mistral regex warning for non-mistral config (transformers_version={saved_version!r})",
            )

        # Case 3: Mistral-family local config saved by an affected transformers release
        # must still warn, even up to 4.57.6
        for saved_version in ("4.57.3", "4.57.6"):
            _write_config(model_type="mistral", transformers_version=saved_version)
            with CaptureLogger(logger) as cl:
                AutoTokenizer.from_pretrained(tmp_dir)
            self.assertIn(
                mistral_warning,
                cl.out,
                msg=f"Missing mistral regex warning for mistral config (transformers_version={saved_version!r})",
            )

        # Case 4: Mistral-family local config saved by a fixed transformers release must not warn
        _write_config(model_type="mistral", transformers_version="5.0.1")
        with CaptureLogger(logger) as cl:
            AutoTokenizer.from_pretrained(tmp_dir)
        self.assertNotIn(mistral_warning, cl.out)

    # `tokenizer2` is the tokenizer from the last iteration of the Case 2 loop.
    self.assertIsInstance(tokenizer2, tokenizer.__class__)
    # `assertGreater` reports both operands on failure, unlike `assertTrue(a > b)`.
    self.assertGreater(tokenizer2.vocab_size, 100_000)
def test_auto_tokenizer_from_mistral_patching(self):
    """Regression test for #43376: passing `fix_mistral_regex` explicitly must be accepted."""
    repo_id = "mistralai/Ministral-3-3B-Instruct-2512"
    # Nothing is asserted on the result; the call only has to complete without raising.
    AutoTokenizer.from_pretrained(repo_id, fix_mistral_regex=True)
@require_tokenizers
def test_auto_tokenizer_loads_bloom_repo_without_tokenizer_class(self):
    """A tiny Bloom repo must resolve to a fast `TokenizersBackend` tokenizer."""
    bloom_tok = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")
    self.assertIsInstance(bloom_tok, TokenizersBackend)
    self.assertTrue(bloom_tok.is_fast)
@require_tokenizers
def test_auto_tokenizer_loads_sentencepiece_only_repo(self):
    """The tiny mBART repo must resolve to a fast `TokenizersBackend` tokenizer."""
    mbart_tok = AutoTokenizer.from_pretrained("sshleifer/tiny-mbart")
    self.assertIsInstance(mbart_tok, TokenizersBackend)
    self.assertTrue(mbart_tok.is_fast)
def test_auto_tokenizer_fast_no_slow(self):
    """CTRL must load as `CTRLTokenizer`."""
    # There is no fast CTRL so this always gives us a slow tokenizer.
    ctrl_tok = AutoTokenizer.from_pretrained("Salesforce/ctrl")
    self.assertIsInstance(ctrl_tok, CTRLTokenizer)
def test_get_tokenizer_config(self):
    """Exercise `get_tokenizer_config` on a hub repo, a repo without tokenizer config and a local save."""
    # 1) Tokenizer config of an online model; the commit hash is dropped before comparing.
    hub_config = get_tokenizer_config("google-bert/bert-base-cased")
    hub_config.pop("_commit_hash", None)
    # If we ever update google-bert/bert-base-cased tokenizer config, this dict here will need to be updated.
    self.assertEqual(hub_config, {"do_lower_case": False, "model_max_length": 512})

    # 2) This model does not have a tokenizer_config so we get back an empty dict.
    missing_config = get_tokenizer_config(SMALL_MODEL_IDENTIFIER)
    self.assertDictEqual(missing_config, {})

    # 3) A tokenizer saved with `save_pretrained` always creates a tokenizer config.
    small_tok = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
    with tempfile.TemporaryDirectory() as save_dir:
        small_tok.save_pretrained(save_dir)
        saved_config = get_tokenizer_config(save_dir)

    # Check the class of the tokenizer was properly saved (note that it always saves the slow class).
    self.assertEqual(saved_config["tokenizer_class"], "BertTokenizer")
def test_new_tokenizer_registration(self):
    """Register a custom config with a slow tokenizer class and load it back via `AutoTokenizer`."""
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)

        # A config that the library already maps cannot be registered a second time.
        with self.assertRaises(ValueError):
            AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)

        custom_tok = CustomTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as save_dir:
            custom_tok.save_pretrained(save_dir)
            reloaded_tok = AutoTokenizer.from_pretrained(save_dir)
            self.assertIsInstance(reloaded_tok, TokenizersBackend)
    finally:
        # Undo the registrations so later tests start from the library's default mappings.
        registered_entries = (
            (CONFIG_MAPPING._extra_content, "custom"),
            (TOKENIZER_MAPPING._extra_content, CustomConfig),
        )
        for registry, key in registered_entries:
            if key in registry:
                del registry[key]
        REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None)
@require_tokenizers
def test_new_tokenizer_fast_registration(self):
    """Register slow and fast custom tokenizer classes and check which one the mapping and loader return."""
    try:
        AutoConfig.register("custom", CustomConfig)

        # Two-step registration: once the fast class is added, it is what the mapping returns.
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizer)
        AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
        self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizerFast)

        # Start again from a clean entry and register both classes in a single call.
        del TOKENIZER_MAPPING._extra_content[CustomConfig]
        AutoTokenizer.register(
            CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
        )
        self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizerFast)

        # A config that the library already maps cannot be registered a second time.
        with self.assertRaises(ValueError):
            AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)

        # We go through a fast BERT tokenizer because there is no slow-to-fast converter for our new
        # tokenizer and that model does not have a tokenizer.json
        with tempfile.TemporaryDirectory() as bert_dir:
            BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER).save_pretrained(bert_dir)
            custom_fast_tok = CustomTokenizerFast.from_pretrained(bert_dir)

        with tempfile.TemporaryDirectory() as custom_dir:
            custom_fast_tok.save_pretrained(custom_dir)

            reloaded_tok = AutoTokenizer.from_pretrained(custom_dir)
            self.assertIsInstance(reloaded_tok, CustomTokenizerFast)

            # The fast class is expected even when `use_fast=False` is passed.
            reloaded_tok = AutoTokenizer.from_pretrained(custom_dir, use_fast=False)
            self.assertIsInstance(reloaded_tok, CustomTokenizerFast)
    finally:
        # Undo every registration made above so other tests are not affected.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in TOKENIZER_MAPPING._extra_content:
            del TOKENIZER_MAPPING._extra_content[CustomConfig]
        for class_name in ("CustomTokenizer", "CustomTokenizerFast"):
            REGISTERED_TOKENIZER_CLASSES.pop(class_name, None)
        REGISTERED_FAST_ALIASES.pop("CustomTokenizer", None)
def test_from_pretrained_dynamic_tokenizer(self):
    """Load a tokenizer whose class lives in the hub repo and check trust, caching and save/reload.

    NOTE(review): the source this block was taken from had lost its indentation; the nesting below
    (in particular that the final `force_download` section runs after the if/else) is reconstructed
    and should be confirmed against the upstream file.
    """
    # If remote code is not set, we will time out when asking whether to load the model.
    with self.assertRaises(ValueError):
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
    # If remote code is disabled, we can't load this config.
    with self.assertRaises(ValueError):
        tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
        )
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True)
    self.assertTrue(tokenizer.special_attribute_present)
    # Test the dynamic module is loaded only once.
    reloaded_tokenizer = AutoTokenizer.from_pretrained(
        "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
    )
    # Identity (not equality) of the classes shows the same class object was reused.
    self.assertIs(tokenizer.__class__, reloaded_tokenizer.__class__)
    # Test tokenizer can be reloaded.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tokenizer.save_pretrained(tmp_dir)
        reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True)
    self.assertTrue(reloaded_tokenizer.special_attribute_present)
    if is_tokenizers_available():
        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
        self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")
        # Test we can also load the slow version
        tokenizer = AutoTokenizer.from_pretrained(
            "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
        )
        self.assertTrue(tokenizer.special_attribute_present)
        # The class name asserted here is the fast one even though `use_fast=False` was passed.
        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
        # Test tokenizer can be reloaded.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, trust_remote_code=True, use_fast=False)
            # These checks read files from `tmp_dir`, so they must run before the directory is removed.
            self.assertTrue(
                os.path.exists(os.path.join(tmp_dir, "tokenization.py"))
            )  # Assert we saved tokenizer code
            self.assertEqual(reloaded_tokenizer._auto_class, "AutoTokenizer")
            with open(os.path.join(tmp_dir, "tokenizer_config.json"), "r") as f:
                tokenizer_config = json.load(f)
            # Assert we're pointing at local code and not another remote repo
            self.assertEqual(
                tokenizer_config["auto_map"]["AutoTokenizer"],
                ["tokenization.NewTokenizer", "tokenization_fast.NewTokenizerFast"],
            )
        self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")
        self.assertTrue(reloaded_tokenizer.special_attribute_present)
    else:
        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
        self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
    # Test the dynamic module is reloaded if we force it.
    reloaded_tokenizer = AutoTokenizer.from_pretrained(
        "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, force_download=True
    )
    # A forced download must yield a different class object than the one loaded earlier.
    self.assertIsNot(tokenizer.__class__, reloaded_tokenizer.__class__)
    self.assertTrue(reloaded_tokenizer.special_attribute_present)
@slow
def test_custom_tokenizer_init(self):
    """Load the remote-code Qwen-VL tokenizer at a pinned revision and check it has a vocabulary."""
    pinned_revision = "0547ed36a86561e2e42fecec8fd0c4f6953e33c4"
    qwen_vl_tok = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True, revision=pinned_revision)
    self.assertIsInstance(qwen_vl_tok, PythonBackend)
    vocab = qwen_vl_tok.get_vocab()
    self.assertGreater(len(vocab), 0)
@require_tokenizers
def test_from_pretrained_dynamic_tokenizer_conflict(self):
    """Check which tokenizer wins when a locally registered class and a hub-provided class both apply."""

    class NewTokenizer(BertTokenizer):
        # `False` marks the local class; the assertions below use it to tell local from remote.
        special_attribute_present = False

    repo_id = "hf-internal-testing/test_dynamic_tokenizer"

    def _assert_local(loaded):
        # The locally registered class was used.
        self.assertEqual(loaded.__class__.__name__, "NewTokenizer")
        self.assertFalse(loaded.special_attribute_present)

    try:
        AutoConfig.register("custom", CustomConfig)
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)

        # If remote code is not set, the default is to use local
        _assert_local(AutoTokenizer.from_pretrained(repo_id, use_fast=False))

        # Explicitly disabling remote code also gives the local class.
        _assert_local(AutoTokenizer.from_pretrained(repo_id, trust_remote_code=False, use_fast=False))

        # If remote code is enabled but the user explicitly registered the local one, we load the local one.
        _assert_local(AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True, use_fast=False))

        # If remote code is enabled but local code originated from transformers, we load the remote one.
        NewTokenizer.__module__ = "transformers.models.custom.configuration_custom"
        remote_tok = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True, use_fast=False)
        self.assertEqual(remote_tok.__class__.__name__, "NewTokenizerFast")
        self.assertTrue(remote_tok.special_attribute_present)
    finally:
        # Remove the temporary registrations again.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in TOKENIZER_MAPPING._extra_content:
            del TOKENIZER_MAPPING._extra_content[CustomConfig]
        REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)
def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
    """Load the legacy-format dynamic tokenizer repo with remote code trusted."""
    legacy_repo = "hf-internal-testing/test_dynamic_tokenizer_legacy"

    tokenizer = AutoTokenizer.from_pretrained(legacy_repo, trust_remote_code=True)
    self.assertTrue(tokenizer.special_attribute_present)

    if not is_tokenizers_available():
        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
    else:
        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")

        # Test we can also load the slow version
        tokenizer = AutoTokenizer.from_pretrained(legacy_repo, trust_remote_code=True, use_fast=False)
        self.assertTrue(tokenizer.special_attribute_present)
        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
def test_repo_not_found(self):
    """An unknown repo id must raise an `EnvironmentError` with the expected message."""
    expected_message = "bert-base is not a local folder and is not a valid model identifier"
    with self.assertRaisesRegex(EnvironmentError, expected_message):
        AutoTokenizer.from_pretrained("bert-base")
def test_revision_not_found(self):
    """An unknown revision must raise an `EnvironmentError` with the expected message."""
    # Parentheses are escaped because the message is matched as a regular expression.
    expected_pattern = r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
    with self.assertRaisesRegex(EnvironmentError, expected_pattern):
        AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
@unittest.skip("This test is failing on main")  # TODO Matt/ydshieh, fix this test!
def test_cached_tokenizer_has_minimum_calls_to_head(self):
    """Loading an already cached tokenizer should issue exactly one HEAD request and no GET."""
    repo_id = "hf-internal-testing/tiny-random-bert"

    # First load: make sure we have cached the tokenizer.
    AutoTokenizer.from_pretrained(repo_id)

    # Second load: count the requests that are still made.
    with RequestCounter() as counter:
        AutoTokenizer.from_pretrained(repo_id)

    self.assertEqual(counter["GET"], 0)
    self.assertEqual(counter["HEAD"], 1)
    self.assertEqual(counter.total_calls, 1)
def test_init_tokenizer_with_trust(self):
    """Check that a local repo with custom tokenizer code loads only when `trust_remote_code=True`.

    A fake repo containing `tokenizer.py`, `config.py`, `config.json` and `tokenizer_config.json`
    is written to a temporary directory. Loading it with `trust_remote_code=True` must succeed and
    loading it with `trust_remote_code=False` must raise `ValueError`.
    """
    # The module sources are assembled line by line so the indentation of the generated Python
    # files is explicit and cannot be lost when this file is reformatted.
    nop_tokenizer_code = (
        "\n"
        "import transformers\n"
        "\n"
        "class NopTokenizer(transformers.PreTrainedTokenizer):\n"
        "    def get_vocab(self):\n"
        "        return {}\n"
    )
    nop_config_code = (
        "\n"
        "from transformers import PreTrainedConfig\n"
        "\n"
        "class NopConfig(PreTrainedConfig):\n"
        '    model_type = "test_unregistered_dynamic"\n'
        "\n"
        "    def __init__(self, **kwargs):\n"
        "        super().__init__(**kwargs)\n"
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        fake_model_id = "hf-internal-testing/test_unregistered_dynamic"
        fake_repo = os.path.join(tmp_dir, fake_model_id)
        os.makedirs(fake_repo)

        tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py")
        with open(tokenizer_src_file, "w", encoding="utf-8") as wfp:
            wfp.write(nop_tokenizer_code)

        model_config_src_file = os.path.join(fake_repo, "config.py")
        with open(model_config_src_file, "w", encoding="utf-8") as wfp:
            wfp.write(nop_config_code)

        config = {
            "model_type": "test_unregistered_dynamic",
            "auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"},
        }
        config_file = os.path.join(fake_repo, "config.json")
        with open(config_file, "w", encoding="utf-8") as wfp:
            json.dump(config, wfp, indent=2)

        # Only a slow tokenizer class is declared; the fast slot is left empty.
        tokenizer_config = {
            "auto_map": {
                "AutoTokenizer": [
                    f"{fake_model_id}--tokenizer.NopTokenizer",
                    None,
                ]
            }
        }
        tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json")
        with open(tokenizer_config_file, "w", encoding="utf-8") as wfp:
            json.dump(tokenizer_config, wfp, indent=2)

        prev_dir = os.getcwd()
        try:
            # it looks like subdir= is broken in the from_pretrained also, so this is necessary
            os.chdir(tmp_dir)

            # this should work because we trust the code
            _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True)

            # this should fail because we don't trust and we're not at a terminal for interactive response
            with self.assertRaises(
                ValueError,
                msg="AutoTokenizer.from_pretrained with trust_remote_code=False should raise ValueError",
            ):
                _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=False)
        finally:
            # Always restore the working directory, even when an assertion above fails.
            os.chdir(prev_dir)
def test_tokenization_class_priority(self):
    """Check the exact tokenizer class selected for a few hub repos and after a local save/reload."""
    from transformers import AutoProcessor

    # `assertEqual` is used for the class comparisons so that a failure reports both classes.
    tok = AutoTokenizer.from_pretrained("mlx-community/MiniMax-M2.1-4bit")
    self.assertEqual(tok.__class__, TokenizersBackend)

    tok = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
    self.assertEqual(tok.__class__, HerbertTokenizer)

    # The class must stay the same after saving to and reloading from a local folder.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tok.save_pretrained(tmp_dir)
        tok2 = AutoTokenizer.from_pretrained(tmp_dir)
        self.assertEqual(tok2.__class__, HerbertTokenizer)

    # Same check for a tokenizer obtained through `AutoProcessor`.
    tok = AutoProcessor.from_pretrained("mistralai/Ministral-3-8B-Instruct-2512-BF16").tokenizer
    self.assertEqual(tok.__class__, TokenizersBackend)
def test_custom_tokenizer_with_mismatched_tokenizer_class(self):
    """Load a local custom-code tokenizer whose config sets both `tokenizer_class` and `auto_map`.

    The fake repo declares `tokenizer_class="NopTokenizer"` together with an `auto_map` entry that
    points at the custom code. With `trust_remote_code=True`, the loaded object must be the custom
    `NopTokenizer`, recognised by its `special_attribute_present` flag.
    """
    # The module sources are assembled line by line so the indentation of the generated Python
    # files is explicit and cannot be lost when this file is reformatted.
    nop_tokenizer_code = (
        "\n"
        "import transformers\n"
        "\n"
        "class NopTokenizer(transformers.PreTrainedTokenizer):\n"
        "    special_attribute_present = True\n"
        "\n"
        "    def get_vocab(self):\n"
        "        return {}\n"
    )
    nop_config_code = (
        "\n"
        "from transformers import PreTrainedConfig\n"
        "\n"
        "class NopConfig(PreTrainedConfig):\n"
        '    model_type = "test_unregistered_dynamic"\n'
        "\n"
        "    def __init__(self, **kwargs):\n"
        "        super().__init__(**kwargs)\n"
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        fake_model_id = "hf-internal-testing/test_unregistered_dynamic"
        fake_repo = os.path.join(tmp_dir, fake_model_id)
        os.makedirs(fake_repo)

        tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py")
        with open(tokenizer_src_file, "w", encoding="utf-8") as wfp:
            wfp.write(nop_tokenizer_code)

        model_config_src_file = os.path.join(fake_repo, "config.py")
        with open(model_config_src_file, "w", encoding="utf-8") as wfp:
            wfp.write(nop_config_code)

        config = {
            "model_type": "test_unregistered_dynamic",
            "auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"},
        }
        config_file = os.path.join(fake_repo, "config.json")
        with open(config_file, "w", encoding="utf-8") as wfp:
            json.dump(config, wfp, indent=2)

        tokenizer_config = {
            "tokenizer_class": "NopTokenizer",
            "auto_map": {
                "AutoTokenizer": [
                    f"{fake_model_id}--tokenizer.NopTokenizer",
                    None,
                ]
            },
        }
        tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json")
        with open(tokenizer_config_file, "w", encoding="utf-8") as wfp:
            json.dump(tokenizer_config, wfp, indent=2)

        prev_dir = os.getcwd()
        try:
            # The model id is resolved as a path relative to the current working directory.
            os.chdir(tmp_dir)
            tokenizer = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True)
            self.assertEqual(tokenizer.__class__.__name__, "NopTokenizer")
            self.assertTrue(tokenizer.special_attribute_present)
        finally:
            # Always restore the working directory, even when an assertion above fails.
            os.chdir(prev_dir)
@require_tokenizers
@require_sentencepiece
def test_mismatched_model_type_uses_config_tokenizer_class_with_sentencepiece(self):
    """The pinned NLLB checkpoint must load as `NllbTokenizer` when sentencepiece is installed."""
    load_kwargs = {"revision": "f8d333a098d19b4fd9a8b18f94170487ad3f821d"}
    nllb_tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", **load_kwargs)
    self.assertEqual(nllb_tok.__class__.__name__, "NllbTokenizer")
@require_tokenizers
def test_mismatched_model_type_uses_config_tokenizer_class_without_sentencepiece(self):
    """The pinned NLLB checkpoint must load as `NllbTokenizer` when sentencepiece is reported missing."""
    # Make the auto-tokenizer module believe sentencepiece is not installed.
    sentencepiece_check = "transformers.models.auto.tokenization_auto.is_sentencepiece_available"
    with mock.patch(sentencepiece_check, return_value=False):
        nllb_tok = AutoTokenizer.from_pretrained(
            "facebook/nllb-200-distilled-600M",
            revision="f8d333a098d19b4fd9a8b18f94170487ad3f821d",
        )
        self.assertEqual(nllb_tok.__class__.__name__, "NllbTokenizer")
@slow
@require_tokenizers
def test_deepseek_r1_tokenizer_preserves_spaces(self):
    """Regression: deepseek_v3 Hub config has wrong tokenizer_class='LlamaTokenizerFast'; must use TokenizersBackend."""
    r1_tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    self.assertIsInstance(r1_tok, TokenizersBackend)

    # An encode/decode round trip must give back the input, including the space.
    sample = "hello world"
    round_tripped = r1_tok.decode(r1_tok.encode(sample))
    self.assertEqual(round_tripped, sample)
@slow
@require_tokenizers
def test_deepseek_r1_distill_qwen_uses_qwen2_tokenizer(self):
    """Regression: qwen2 model with wrong Hub tokenizer_class='LlamaTokenizerFast' must use Qwen2Tokenizer."""
    distill_repo = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    distill_tok = AutoTokenizer.from_pretrained(distill_repo)
    self.assertIsInstance(distill_tok, Qwen2Tokenizer)
@require_tokenizers
@require_sentencepiece
def test_specialized_hub_tokenizer_class_overrides_mismatched_auto_mapping(self):
    """Hub's tokenizer_class wins when the auto-mapping has a different real class (e.g. m2m_100 → NllbTokenizer)."""
    from transformers import NllbTokenizer

    auto_module = "transformers.models.auto.tokenization_auto"

    # Model config reporting `m2m_100`, while the tokenizer config names `NllbTokenizer`.
    stub_config = mock.MagicMock()
    stub_config.model_type = "m2m_100"
    stub_tokenizer = mock.MagicMock(spec=NllbTokenizer)

    with (
        mock.patch(f"{auto_module}.AutoConfig.from_pretrained", return_value=stub_config),
        mock.patch(f"{auto_module}.get_tokenizer_config", return_value={"tokenizer_class": "NllbTokenizer"}),
        mock.patch.object(NllbTokenizer, "from_pretrained", return_value=stub_tokenizer) as mock_nllb,
        mock.patch.object(TokenizersBackend, "from_pretrained") as mock_tb,
    ):
        result = AutoTokenizer.from_pretrained("fake/nllb-model")

    # Only the NLLB loader may have been used, and its return value is passed through unchanged.
    mock_nllb.assert_called_once()
    mock_tb.assert_not_called()
    self.assertIs(result, stub_tokenizer)

View File

@@ -0,0 +1,248 @@
# Copyright 2025 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import sys
import tempfile
import unittest
from pathlib import Path
import transformers
from transformers import (
CONFIG_MAPPING,
VIDEO_PROCESSOR_MAPPING,
AutoConfig,
AutoVideoProcessor,
LlavaOnevisionConfig,
LlavaOnevisionVideoProcessor,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torch
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
from test_module.custom_configuration import CustomConfig # noqa E402
from test_module.custom_video_processing import CustomVideoProcessor # noqa E402
@require_torch
class AutoVideoProcessorTest(unittest.TestCase):
def setUp(self):
    """Set the remote-code prompt timeout to zero before every test."""
    # Presumably this keeps tests from waiting on the interactive trust prompt; confirm in
    # `dynamic_module_utils`.
    dynamic_utils = transformers.dynamic_module_utils
    dynamic_utils.TIME_OUT_REMOTE_CODE = 0
def test_video_processor_from_model_shortcut(self):
    """Loading by hub model id must return a `LlavaOnevisionVideoProcessor`."""
    repo_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
    video_processor = AutoVideoProcessor.from_pretrained(repo_id)
    self.assertIsInstance(video_processor, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_directory_from_key(self):
    """Load a video processor from a local folder containing `video_preprocessor_config.json`."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
        config_tmpfile = Path(tmpdirname) / "config.json"

        # Files are written inside `with` blocks so they are flushed and closed before being read
        # back, instead of leaving the handles open until garbage collection.
        with open(processor_tmpfile, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "video_processor_type": "LlavaOnevisionVideoProcessor",
                    "processor_class": "LlavaOnevisionProcessor",
                },
                f,
            )
        with open(config_tmpfile, "w", encoding="utf-8") as f:
            json.dump({"model_type": "llava_onevision"}, f)

        video_processor = AutoVideoProcessor.from_pretrained(tmpdirname)
        self.assertIsInstance(video_processor, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_directory_from_preprocessor_key(self):
    """Load a video processor from a local folder that only has a `preprocessor_config.json`."""
    # Ensure we can load the video processor from the generic preprocessor config file
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
        config_tmpfile = Path(tmpdirname) / "config.json"

        # Files are written inside `with` blocks so they are flushed and closed before being read
        # back, instead of leaving the handles open until garbage collection.
        with open(processor_tmpfile, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "video_processor_type": "LlavaOnevisionVideoProcessor",
                    "processor_class": "LlavaOnevisionProcessor",
                },
                f,
            )
        with open(config_tmpfile, "w", encoding="utf-8") as f:
            json.dump({"model_type": "llava_onevision"}, f)

        video_processor = AutoVideoProcessor.from_pretrained(tmpdirname)
        self.assertIsInstance(video_processor, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_directory_from_config(self):
    """Reload a video processor that was saved without its `video_processor_type` entry.

    A processor is first loaded from hand-written JSON files, rebuilt without
    `video_processor_type`, saved next to a real model config, and loaded again through
    `AutoVideoProcessor`.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        model_config = LlavaOnevisionConfig()

        # Create a dummy config file with video_processor_type
        processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
        config_tmpfile = Path(tmpdirname) / "config.json"

        # Files are written inside `with` blocks so they are flushed and closed before being read
        # back, instead of leaving the handles open until garbage collection.
        with open(processor_tmpfile, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "video_processor_type": "LlavaOnevisionVideoProcessor",
                    "processor_class": "LlavaOnevisionProcessor",
                },
                f,
            )
        with open(config_tmpfile, "w", encoding="utf-8") as f:
            json.dump({"model_type": "llava_onevision"}, f)

        # remove video_processor_type to make sure config.json alone is enough to load the video
        # processor locally
        config_dict = AutoVideoProcessor.from_pretrained(tmpdirname).to_dict()
        config_dict.pop("video_processor_type")
        video_processor = LlavaOnevisionVideoProcessor(**config_dict)

        # save in the same folder, overwriting the hand-written files
        model_config.save_pretrained(tmpdirname)
        video_processor.save_pretrained(tmpdirname)

        video_processor = AutoVideoProcessor.from_pretrained(tmpdirname)

        # make sure private variable is not incorrectly saved
        dict_as_saved = json.loads(video_processor.to_json_string())
        self.assertNotIn("_processor_class", dict_as_saved)

    self.assertIsInstance(video_processor, LlavaOnevisionVideoProcessor)
def test_video_processor_from_local_file(self):
    """Load a video processor by passing the path of the JSON config file itself."""
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"

        # The file is written inside a `with` block so it is flushed and closed before being read
        # back, instead of leaving the handle open until garbage collection.
        with open(processor_tmpfile, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "video_processor_type": "LlavaOnevisionVideoProcessor",
                    "processor_class": "LlavaOnevisionProcessor",
                },
                f,
            )

        video_processor = AutoVideoProcessor.from_pretrained(processor_tmpfile)
        self.assertIsInstance(video_processor, LlavaOnevisionVideoProcessor)
def test_repo_not_found(self):
    """An unknown repo id must raise an `EnvironmentError` with the expected message."""
    missing_repo = "llava-hf/llava-doesnt-exist"
    expected_message = "llava-hf/llava-doesnt-exist is not a local folder and is not a valid model identifier"
    with self.assertRaisesRegex(EnvironmentError, expected_message):
        AutoVideoProcessor.from_pretrained(missing_repo)
def test_revision_not_found(self):
    """An unknown revision must raise an `EnvironmentError` with the expected message."""
    # Parentheses are escaped because the message is matched as a regular expression.
    expected_pattern = r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
    with self.assertRaisesRegex(EnvironmentError, expected_pattern):
        AutoVideoProcessor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_video_processor_not_found(self):
    """Loading from the `config-no-model` test repo must raise the expected `EnvironmentError`."""
    expected_message = "Can't load video processor for 'hf-internal-testing/config-no-model'."
    with self.assertRaisesRegex(EnvironmentError, expected_message):
        AutoVideoProcessor.from_pretrained("hf-internal-testing/config-no-model")
def test_from_pretrained_dynamic_video_processor(self):
    """Load a video processor whose class lives in the hub repo and check trust, caching and reload."""
    repo_id = "hf-internal-testing/test_dynamic_video_processor"

    # If remote code is not set, we will time out when asking whether to load the model.
    with self.assertRaises(ValueError):
        AutoVideoProcessor.from_pretrained(repo_id)
    # If remote code is disabled, we can't load this config.
    with self.assertRaises(ValueError):
        AutoVideoProcessor.from_pretrained(repo_id, trust_remote_code=False)

    video_processor = AutoVideoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    self.assertEqual(video_processor.__class__.__name__, "NewVideoProcessor")

    # Test the dynamic module is loaded only once: a second load must reuse the same class object.
    second_load = AutoVideoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    self.assertIs(video_processor.__class__, second_load.__class__)

    # Test video processor can be reloaded.
    with tempfile.TemporaryDirectory() as save_dir:
        video_processor.save_pretrained(save_dir)
        from_disk = AutoVideoProcessor.from_pretrained(save_dir, trust_remote_code=True)
    self.assertEqual(from_disk.__class__.__name__, "NewVideoProcessor")
def test_new_video_processor_registration(self):
    """Register a custom config/video-processor pair and load it back through `AutoVideoProcessor`."""
    try:
        AutoConfig.register("custom", CustomConfig)
        AutoVideoProcessor.register(CustomConfig, CustomVideoProcessor)
        # Trying to register something existing in the Transformers library will raise an error
        with self.assertRaises(ValueError):
            AutoVideoProcessor.register(LlavaOnevisionConfig, LlavaOnevisionVideoProcessor)

        with tempfile.TemporaryDirectory() as tmpdirname:
            processor_tmpfile = Path(tmpdirname) / "video_preprocessor_config.json"
            config_tmpfile = Path(tmpdirname) / "config.json"

            # Files are written inside `with` blocks so they are flushed and closed before being
            # read back, instead of leaving the handles open until garbage collection.
            with open(processor_tmpfile, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "video_processor_type": "LlavaOnevisionVideoProcessor",
                        "processor_class": "LlavaOnevisionProcessor",
                    },
                    f,
                )
            with open(config_tmpfile, "w", encoding="utf-8") as f:
                json.dump({"model_type": "llava_onevision"}, f)

            video_processor = CustomVideoProcessor.from_pretrained(tmpdirname)

        # Now that the config is registered, it can be used as any other config with the auto-API
        with tempfile.TemporaryDirectory() as tmp_dir:
            video_processor.save_pretrained(tmp_dir)
            new_video_processor = AutoVideoProcessor.from_pretrained(tmp_dir)
            self.assertIsInstance(new_video_processor, CustomVideoProcessor)
    finally:
        # Undo the registrations so other tests see the library's default mappings.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
            del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]
def test_from_pretrained_dynamic_video_processor_conflict(self):
    """Check which class wins when a locally registered processor and a hub-provided one both apply."""

    class NewVideoProcessor(LlavaOnevisionVideoProcessor):
        # Only the local class carries this marker; the assertions use it to tell local from remote.
        is_local = True

    repo_id = "hf-internal-testing/test_dynamic_video_processor"

    def _assert_local(loaded):
        # The locally registered class was used.
        self.assertEqual(loaded.__class__.__name__, "NewVideoProcessor")
        self.assertTrue(loaded.is_local)

    try:
        AutoConfig.register("custom", CustomConfig)
        AutoVideoProcessor.register(CustomConfig, NewVideoProcessor)

        # If remote code is not set, the default is to use local
        _assert_local(AutoVideoProcessor.from_pretrained(repo_id))

        # If remote code is disabled, we load the local one.
        _assert_local(AutoVideoProcessor.from_pretrained(repo_id, trust_remote_code=False))

        # If remote code is enabled but the user explicitly registered the local one, we load the local one.
        _assert_local(AutoVideoProcessor.from_pretrained(repo_id, trust_remote_code=True))

        # If remote code is enabled but local code originated from transformers, we load the remote one.
        NewVideoProcessor.__module__ = "transformers.models.custom.configuration_custom"
        remote_processor = AutoVideoProcessor.from_pretrained(repo_id, trust_remote_code=True)
        self.assertEqual(remote_processor.__class__.__name__, "NewVideoProcessor")
        # Both classes share a name, so the missing marker is what identifies the remote one.
        self.assertFalse(hasattr(remote_processor, "is_local"))
    finally:
        # Remove the temporary registrations again.
        if "custom" in CONFIG_MAPPING._extra_content:
            del CONFIG_MAPPING._extra_content["custom"]
        if CustomConfig in VIDEO_PROCESSOR_MAPPING._extra_content:
            del VIDEO_PROCESSOR_MAPPING._extra_content[CustomConfig]