Files
transformers/utils/check_auto.py
陈赣 06f1fd69a6
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
first commit
2026-06-05 16:53:03 +08:00

310 lines
13 KiB
Python

# Copyright 2026 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import difflib
import glob
import os
import subprocess
import tempfile
from collections import Counter, OrderedDict
from typing import Any
from sort_auto_mappings import sort_auto_mapping
from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES as COMPLETE_CONFIG_MAPPING_NAMES
from transformers.models.auto.image_processing_auto import MISSING_IMAGE_PROCESSOR_MAPPING_NAMES
from transformers.models.auto.video_processing_auto import MISSING_VIDEO_PROCESSOR_MAPPING_NAMES
# Static metadata describing this checker. `fix_args` carries the same `--fix_and_overwrite` flag that this
# script's own CLI defines.
# NOTE(review): presumably consumed by the repo's checker runner, possibly via static parsing -- keep this a
# plain literal until that is confirmed.
CHECKER_CONFIG = {
    "name": "auto_mappings",
    "label": "Generate auto mappings",
    "cache_globs": [],
    "check_args": [],
    "fix_args": ["--fix_and_overwrite"],
}
# Banner and license text that `main()` places at the very top of the generated `auto_mappings.py`.
# The text is part of the generated file's content: any change here changes the expected file on disk.
# NOTE: the identifier is spelled `HADER` (sic) and is referenced under that exact name in `main()`.
AUTO_GENERATED_HADER = """# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from existing config files and their `model_type`s. Do NOT edit this file
# manually as any edits will be overwritten by auto-generation of the file. If any change should be done,
# please add the correct `cls.model_type` in your config class and run `python utils/check_auto.py --fix_and_overwrite`.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# Config class names that are allowed to appear as the value of more than one model type in
# `CONFIG_MAPPING_NAMES`. They are duplicated because of incorrect naming when the models were shipped and
# are kept for backward compatibility; `main()` exempts them from its duplicate-values check.
IGNORE_DUPLICATE_CONFIG = ["GPT2Config", "EvollaConfig", "MLCDVisionConfig"]
def build_config_mapping_names() -> tuple[dict, dict]:
model_type_map = OrderedDict()
special_mappings = OrderedDict()
# Track which model_types were resolved by a "natural" match (model_type == module_name)
# so a later non-natural match (e.g. MaskFormerDetrConfig with model_type="detr" inside
# models/maskformer/) does not silently overwrite the canonical class.
natural_types: set[str] = set()
# `glob.glob` is filesystem-order dependent — sort to make the output deterministic.
all_files = sorted(glob.glob("src/transformers/models/**/configuration_*.py", recursive=True))
for config_path in all_files:
module_name = config_path.split("/")[-2]
with open(config_path, "r") as f:
content = f.read()
tree = ast.parse(content)
for node in tree.body:
if isinstance(node, ast.ClassDef) and any(
base.id == "PreTrainedConfig" for base in node.bases if isinstance(base, ast.Name)
):
config_cls_name = node.name
model_type = None
for stmt in node.body:
if isinstance(stmt, ast.Assign):
if model_types := [
stmt.value.value
for target in stmt.targets
if isinstance(target, ast.Name) and target.id == "model_type"
]:
model_type = model_types[0]
break
elif isinstance(stmt, ast.AnnAssign):
if stmt.target.id == "model_type":
model_type = stmt.value.value
break
if not model_type:
continue
is_natural = model_type == module_name
# If we already recorded a natural match for this model_type, don't let a
# non-natural one overwrite it — the natural class is the canonical owner.
if model_type in natural_types and not is_natural:
continue
model_type_map[model_type] = config_cls_name
if is_natural:
natural_types.add(model_type)
special_mappings.pop(model_type, None)
else:
special_mappings[model_type] = module_name
return model_type_map, special_mappings
def build_image_processor_mapping(
config_mapping: dict[str, str],
) -> OrderedDict[str, dict[str, str | None]]:
processor_mapping = OrderedDict()
for model_type in config_mapping:
module = model_type.replace("-", "_")
fast_processor_name = slow_processor_name = None
if os.path.exists(f"src/transformers/models/{module}/image_processing_pil_{module}.py"):
with open(f"src/transformers/models/{module}/image_processing_pil_{module}.py", "r") as f:
content = f.read()
tree = ast.parse(content)
for node in tree.body:
if isinstance(node, ast.ClassDef) and any(
base.id == "PilBackend" for base in node.bases if isinstance(base, ast.Name)
):
slow_processor_name = node.name
if os.path.exists(f"src/transformers/models/{module}/image_processing_{module}.py"):
with open(f"src/transformers/models/{module}/image_processing_{module}.py", "r") as f:
content = f.read()
tree = ast.parse(content)
for node in tree.body:
if isinstance(node, ast.ClassDef) and any(
base.id == "TorchvisionBackend" for base in node.bases if isinstance(base, ast.Name)
):
fast_processor_name = node.name
if slow_processor_name is not None or fast_processor_name is not None:
processor_mapping[model_type] = {
**({"pil": slow_processor_name} if slow_processor_name else {}),
**({"torchvision": fast_processor_name} if fast_processor_name else {}),
}
return processor_mapping
def build_video_processor_mapping(
config_mapping: dict[str, str],
) -> OrderedDict[str, dict[str, str | None]]:
processor_mapping = OrderedDict()
for model_type in config_mapping:
module = model_type.replace("-", "_")
video_processor_name = None
if os.path.exists(f"src/transformers/models/{module}/video_processing_{module}.py"):
with open(f"src/transformers/models/{module}/video_processing_{module}.py", "r") as f:
content = f.read()
tree = ast.parse(content)
for node in tree.body:
if isinstance(node, ast.ClassDef) and any(
base.id == "BaseVideoProcessor" for base in node.bases if isinstance(base, ast.Name)
):
video_processor_name = node.name
if video_processor_name is not None:
processor_mapping[model_type] = video_processor_name
return processor_mapping
def run_ruff_and_sort(file: str):
    """Sort the mappings in `file` in place, then lint-fix and format it with `ruff` (as `make style` does).

    `ruff` output on stdout is discarded and its exit codes are not checked.
    """
    sort_auto_mapping(file, overwrite=True)
    ruff_commands = (
        ["ruff", "check", file, "--fix"],
        ["ruff", "format", file],
    )
    for command in ruff_commands:
        subprocess.run(command, stdout=subprocess.DEVNULL)
def _quote_str(text: str) -> str:
    """Return `text` as a double-quoted literal with quotes, backslashes and control characters escaped."""
    # Function-scope import keeps this helper self-contained. A JSON string literal is also a valid Python
    # string literal with the same value, and plain identifiers come out unchanged as `"name"`.
    import json

    return json.dumps(text, ensure_ascii=False)


def format_dict_value(v):
    """Render `v` as Python source text for the generated mappings file.

    Strings become double-quoted literals, dicts and lists are rendered recursively (dict keys are always
    emitted as quoted strings), and anything else falls back to `repr`.

    Args:
        v: A string, dict, list, or any other value with a meaningful `repr`.

    Returns:
        The source-code representation of `v` as a string.
    """
    if isinstance(v, str):
        return _quote_str(v)
    if isinstance(v, dict):
        items = ", ".join(f"{_quote_str(str(k))}: {format_dict_value(val)}" for k, val in v.items())
        return "{" + items + "}"
    if isinstance(v, list):
        items = ", ".join(format_dict_value(x) for x in v)
        return "[" + items + "]"
    return repr(v)
def format_ordered_dict(name: str, data: OrderedDict):
    """Render `data` as the source of a module-level `name = OrderedDict([...])` assignment.

    Each item becomes one `("key", value),` line, with the value rendered by `format_dict_value`. The
    returned text ends with blank lines so that consecutive assignments can be concatenated directly.
    """
    entry_indent = " " * 8
    bracket_indent = " " * 4
    entry_lines = [f'{entry_indent}("{key}", {format_dict_value(value)}),' for key, value in data.items()]
    pieces = [
        f"{name} = OrderedDict(",
        f"{bracket_indent}[",
        *entry_lines,
        f"{bracket_indent}]",
        ")\n\n",
    ]
    return "\n".join(pieces)
def check_duplicates(mapping_for_special_models: dict[str, Any], auto_mapping: dict[str, Any]):
    """Ensure no model type is listed both in a hand-written mapping and in the auto-generated one.

    Args:
        mapping_for_special_models: Manually maintained mapping keyed by model type.
        auto_mapping: Auto-generated mapping keyed by model type.

    Raises:
        ValueError: If the two mappings share at least one key.
    """
    manual_keys = set(mapping_for_special_models.keys())
    generated_keys = set(auto_mapping.keys())
    intersections = manual_keys & generated_keys
    if not intersections:
        return

    raise ValueError(
        "You have manually duplicated a model-type that is present in `auto_mappings.py`. "
        f"Please, delete the entries for {intersections} if they are identical to auto-generated dict, "
        "or use consistent naming across model files so that the names match."
    )
def main(overwrite: bool):
    """Regenerate the content of `auto_mappings.py` and either verify or overwrite the file on disk.

    The config, image-processor and video-processor mappings are rebuilt from the model source files,
    rendered to Python source, then sorted and ruff-formatted in a temporary file so that the result can be
    compared with the file on disk.

    Args:
        overwrite: When True, write the regenerated content to disk if it differs from the existing file.
            When False, raise instead, with a unified diff in the message.

    Raises:
        ValueError: If a manually maintained processor mapping duplicates an auto-generated key, or if
            `CONFIG_MAPPING_NAMES` is not one-to-one (apart from the `IGNORE_DUPLICATE_CONFIG` exemptions).
        Exception: If the regenerated content differs from the file on disk and `overwrite` is False.
    """
    filename = "src/transformers/models/auto/auto_mappings.py"

    # 1. Read existing file content if available. The encoding is explicit because the generated header
    # contains non-ASCII characters, which would fail under a non-UTF-8 default locale.
    old_content = ""
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            old_content = f.read()

    # 2. Generate new config mapping dicts by parsing all model-config classes
    config_mapping, special_mapping = build_config_mapping_names()
    image_processor_mapping = build_image_processor_mapping(config_mapping=config_mapping)
    video_processor_mapping = build_video_processor_mapping(config_mapping=config_mapping)

    # Make sure users aren't duplicating the same keys manually
    check_duplicates(MISSING_IMAGE_PROCESSOR_MAPPING_NAMES, image_processor_mapping)
    check_duplicates(MISSING_VIDEO_PROCESSOR_MAPPING_NAMES, video_processor_mapping)

    # The config mapping has to be one-to-one for correct `AutoConfig.from_pretrained()` because `LazyMapping`
    # reverts keys/values and creates a dict from it. Duplicate values will be overwritten by whatever comes at last
    # NOTE(review): if `COMPLETE_CONFIG_MAPPING_NAMES` is a plain dict, its keys are unique by construction and
    # this first check can never fire -- confirm whether it is still needed.
    duplicate_keys = [n for n, c in Counter(COMPLETE_CONFIG_MAPPING_NAMES.keys()).items() if c > 1]
    if duplicate_keys:
        raise ValueError(
            f"Keys in `CONFIG_MAPPING_NAMES` contain duplicates = {duplicate_keys}. "
            "The mapping has to be one-to-one to ensure correct `AutoConfig` functionality!"
        )

    duplicate_values = [
        n
        for n, c in Counter(COMPLETE_CONFIG_MAPPING_NAMES.values()).items()
        if c > 1 and n not in IGNORE_DUPLICATE_CONFIG
    ]
    if duplicate_values:
        raise ValueError(
            f"Values in `CONFIG_MAPPING_NAMES` contain duplicates = {duplicate_values}. "
            "The mapping has to be one-to-one to ensure correct `AutoConfig` functionality!"
        )

    new_mappings = {
        "CONFIG_MAPPING_NAMES": config_mapping,
        "SPECIAL_MODEL_TYPE_TO_MODULE_NAME": special_mapping,
        "IMAGE_PROCESSOR_MAPPING_NAMES": image_processor_mapping,
        "VIDEO_PROCESSOR_MAPPING_NAMES": video_processor_mapping,
    }
    sections = [AUTO_GENERATED_HADER + "\nfrom collections import OrderedDict\n\n"]
    sections.extend(format_ordered_dict(name=k, data=v) for k, v in new_mappings.items())
    new_content = "".join(sections)

    # 3. If the new auto-generate content is different, overwrite it
    # Dirty hack to sort and apply ruff to the file content, for easier matching
    with tempfile.TemporaryDirectory() as temp_folder:
        temp_filename = os.path.join(temp_folder, "temp.py")
        with open(temp_filename, "w", encoding="utf-8") as temp_file:
            temp_file.write(new_content)
        run_ruff_and_sort(temp_filename)
        # Read the formatted result back before the temporary directory is removed.
        with open(temp_filename, "r", encoding="utf-8") as temp_file:
            new_content = temp_file.read()

    if old_content == new_content:
        return

    if overwrite:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(new_content)
        return

    diff = "".join(
        difflib.unified_diff(
            old_content.splitlines(keepends=True),
            new_content.splitlines(keepends=True),
            fromfile=f"{filename} (on disk)",
            tofile=f"{filename} (regenerated)",
            n=3,
        )
    )
    raise Exception(
        "Generated auto-mapping is not consistent with the contents of `models/auto/auto_mappings.py`.\n"
        "Run `make fix-repo` or `python utils/check_auto.py --fix_and_overwrite` to fix them.\n\n"
        f"Diff (on disk → regenerated):\n{diff}"
    )
if __name__ == "__main__":
    # Command-line entry point: without the flag the script only checks, with it the file is rewritten.
    cli = argparse.ArgumentParser()
    cli.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
    cli_args = cli.parse_args()
    main(overwrite=cli_args.fix_and_overwrite)