transformers/utils/check_auto.py

# Copyright 2026 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import ast
import difflib
import glob
import os
import subprocess
import tempfile
from collections import Counter, OrderedDict
from typing import Any

from sort_auto_mappings import sort_auto_mapping

from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES as COMPLETE_CONFIG_MAPPING_NAMES
from transformers.models.auto.image_processing_auto import MISSING_IMAGE_PROCESSOR_MAPPING_NAMES
from transformers.models.auto.video_processing_auto import MISSING_VIDEO_PROCESSOR_MAPPING_NAMES


# Declarative description of this checker. Nothing in this file reads it, so it is
# presumably consumed by an external runner that discovers checkers — TODO confirm.
CHECKER_CONFIG = {
    "name": "auto_mappings",
    "label": "Generate auto mappings",
    "cache_globs": [],
    # No extra CLI arguments in check mode: `main` then raises on any mismatch.
    "check_args": [],
    # Same flag as the one registered on the argument parser at the bottom of this file.
    "fix_args": ["--fix_and_overwrite"],
}

AUTO_GENERATED_HADER = """#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#             This file was automatically generated from existing config files and their `model_type`s. Do NOT edit this file
#               manually as any edits will be overwritten by auto-generation of the file. If any change should be done,
#          please add the correct `cls.model_type` in your config class and run `python utils/check_auto.py --fix_and_overwrite`.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""

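# Some config classes are registered under more than one model type (incorrect naming when the model
# was shipped, kept for backward compatibility). They are exempt from the duplicate-value check in `main`.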
IGNORE_DUPLICATE_CONFIG = ["GPT2Config", "EvollaConfig", "MLCDVisionConfig"]


def build_config_mapping_names() -> tuple[dict, dict]:
    model_type_map = OrderedDict()
    special_mappings = OrderedDict()
    # Track which model_types were resolved by a "natural" match (model_type == module_name)
    # so a later non-natural match (e.g. MaskFormerDetrConfig with model_type="detr" inside
    # models/maskformer/) does not silently overwrite the canonical class.
    natural_types: set[str] = set()

    # `glob.glob` is filesystem-order dependent — sort to make the output deterministic.
    all_files = sorted(glob.glob("src/transformers/models/**/configuration_*.py", recursive=True))
    for config_path in all_files:
        module_name = config_path.split("/")[-2]
        with open(config_path, "r") as f:
            content = f.read()

        tree = ast.parse(content)
        for node in tree.body:
            if isinstance(node, ast.ClassDef) and any(
                base.id == "PreTrainedConfig" for base in node.bases if isinstance(base, ast.Name)
            ):
                config_cls_name = node.name
                model_type = None
                for stmt in node.body:
                    if isinstance(stmt, ast.Assign):
                        if model_types := [
                            stmt.value.value
                            for target in stmt.targets
                            if isinstance(target, ast.Name) and target.id == "model_type"
                        ]:
                            model_type = model_types[0]
                            break
                    elif isinstance(stmt, ast.AnnAssign):
                        if stmt.target.id == "model_type":
                            model_type = stmt.value.value
                            break

                if not model_type:
                    continue

                is_natural = model_type == module_name
                # If we already recorded a natural match for this model_type, don't let a
                # non-natural one overwrite it — the natural class is the canonical owner.
                if model_type in natural_types and not is_natural:
                    continue

                model_type_map[model_type] = config_cls_name
                if is_natural:
                    natural_types.add(model_type)
                    special_mappings.pop(model_type, None)
                else:
                    special_mappings[model_type] = module_name

    return model_type_map, special_mappings


def build_image_processor_mapping(
    config_mapping: dict[str, str],
) -> OrderedDict[str, dict[str, str | None]]:
    processor_mapping = OrderedDict()
    for model_type in config_mapping:
        module = model_type.replace("-", "_")
        fast_processor_name = slow_processor_name = None
        if os.path.exists(f"src/transformers/models/{module}/image_processing_pil_{module}.py"):
            with open(f"src/transformers/models/{module}/image_processing_pil_{module}.py", "r") as f:
                content = f.read()

            tree = ast.parse(content)
            for node in tree.body:
                if isinstance(node, ast.ClassDef) and any(
                    base.id == "PilBackend" for base in node.bases if isinstance(base, ast.Name)
                ):
                    slow_processor_name = node.name

        if os.path.exists(f"src/transformers/models/{module}/image_processing_{module}.py"):
            with open(f"src/transformers/models/{module}/image_processing_{module}.py", "r") as f:
                content = f.read()

            tree = ast.parse(content)
            for node in tree.body:
                if isinstance(node, ast.ClassDef) and any(
                    base.id == "TorchvisionBackend" for base in node.bases if isinstance(base, ast.Name)
                ):
                    fast_processor_name = node.name

        if slow_processor_name is not None or fast_processor_name is not None:
            processor_mapping[model_type] = {
                **({"pil": slow_processor_name} if slow_processor_name else {}),
                **({"torchvision": fast_processor_name} if fast_processor_name else {}),
            }

    return processor_mapping


def build_video_processor_mapping(
    config_mapping: dict[str, str],
) -> OrderedDict[str, str]:
    """
    Collect the video processor class available for each model type.

    For every model type, `src/transformers/models/<module>/video_processing_<module>.py` (with `-`
    replaced by `_` in the model type, relative to the current working directory) is parsed and scanned
    for a top-level class that directly derives from `BaseVideoProcessor`.

    Args:
        config_mapping: Mapping whose keys are model types; the values are not used.

    Returns:
        An ordered dict mapping each model type that has a video processor to the processor class name.
        Model types without a matching file or class are omitted.
    """
    processor_mapping = OrderedDict()
    for model_type in config_mapping:
        module = model_type.replace("-", "_")
        video_processing_path = f"src/transformers/models/{module}/video_processing_{module}.py"
        if not os.path.exists(video_processing_path):
            continue

        # Model sources are UTF-8; do not depend on the locale's default encoding.
        with open(video_processing_path, "r", encoding="utf-8") as f:
            tree = ast.parse(f.read())

        video_processor_name = None
        for node in tree.body:
            if isinstance(node, ast.ClassDef) and any(
                isinstance(base, ast.Name) and base.id == "BaseVideoProcessor" for base in node.bases
            ):
                # No early exit on purpose: when several classes match, the last one wins.
                video_processor_name = node.name

        if video_processor_name is not None:
            processor_mapping[model_type] = video_processor_name

    return processor_mapping


def run_ruff_and_sort(file: str):
    """Sort the mappings in `file` in place, then apply `ruff check --fix` and `ruff format` to it (as in `make style`)."""
    sort_auto_mapping(file, overwrite=True)
    ruff_commands = (
        ["ruff", "check", file, "--fix"],
        ["ruff", "format", file],
    )
    for command in ruff_commands:
        # Exit codes are not inspected; only stdout is silenced.
        subprocess.run(command, stdout=subprocess.DEVNULL)


# Characters that would terminate or alter a double-quoted Python string literal if written verbatim.
_STRING_LITERAL_ESCAPES = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": "\\n", "\r": "\\r", "\t": "\\t"})


def _quote_string(text: str) -> str:
    """Return `text` as a double-quoted Python string literal with special characters escaped."""
    return '"' + text.translate(_STRING_LITERAL_ESCAPES) + '"'


def format_dict_value(v):
    """
    Render `v` as Python source text.

    Strings become double-quoted literals, dicts and lists are rendered recursively on a single line
    (dict keys are always emitted as double-quoted strings), and any other value falls back to `repr`.

    Args:
        v: The value to render: a `str`, `dict`, `list`, or any object with a meaningful `repr`.

    Returns:
        The source-code representation of `v` as a string.
    """
    if isinstance(v, str):
        return _quote_string(v)
    if isinstance(v, dict):
        # Keys are stringified first so that non-string keys are still emitted as quoted strings.
        items = ", ".join(f"{_quote_string(str(k))}: {format_dict_value(val)}" for k, val in v.items())
        return "{" + items + "}"
    if isinstance(v, list):
        items = ", ".join(format_dict_value(x) for x in v)
        return "[" + items + "]"
    return repr(v)


def format_ordered_dict(name: str, data: OrderedDict):
    """
    Render `data` as the source of an `OrderedDict` assignment named `name`.

    The output is `name = OrderedDict([...])` with one `("key", value),` tuple per line, indented by
    8 spaces, and ends with two blank lines so that several blocks can be concatenated directly.
    """
    indent = " " * 4
    entry_lines = [f'{indent * 2}("{key}", {format_dict_value(value)}),' for key, value in data.items()]
    return "\n".join(
        [
            f"{name} = OrderedDict(",
            f"{indent}[",
            *entry_lines,
            f"{indent}]",
            ")\n\n",
        ]
    )


def check_duplicates(mapping_for_special_models: dict[str, Any], auto_mapping: dict[str, Any]):
    """
    Ensure no key of the manually maintained mapping is also present in the auto-generated one.

    Args:
        mapping_for_special_models: The manually written mapping.
        auto_mapping: The mapping produced by auto-generation.

    Raises:
        ValueError: If the two mappings share at least one key.
    """
    overlap = set(mapping_for_special_models.keys()) & set(auto_mapping.keys())
    if not overlap:
        return

    raise ValueError(
        "You have manually duplicated a model-type that is present in `auto_mappings.py`. "
        f"Please, delete the entries for {overlap} if they are identical to auto-generated dict, "
        "or use consistent naming across model files so that the names match."
    )


def main(overwrite: bool):
    """
    Regenerate `src/transformers/models/auto/auto_mappings.py` and compare it with the file on disk.

    Args:
        overwrite: If `True`, write the regenerated content to disk when it differs. If `False`, raise
            an error showing the diff instead.

    Raises:
        ValueError: If a manually maintained processor mapping duplicates an auto-generated key, or if
            `CONFIG_MAPPING_NAMES` is not one-to-one.
        Exception: If the regenerated content differs from the file on disk and `overwrite` is `False`.
    """
    filename = "src/transformers/models/auto/auto_mappings.py"

    # 1. Read existing file content if available
    # The generated header contains emoji, so every read/write is pinned to UTF-8 rather than the
    # locale's default encoding.
    old_content = ""
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            old_content = f.read()

    # 2. Generate new config mapping dicts by parsing all model-config classes
    config_mapping, special_mapping = build_config_mapping_names()
    image_processor_mapping = build_image_processor_mapping(config_mapping=config_mapping)
    video_processor_mapping = build_video_processor_mapping(config_mapping=config_mapping)

    # Make sure users aren't duplicating the same keys manually
    check_duplicates(MISSING_IMAGE_PROCESSOR_MAPPING_NAMES, image_processor_mapping)
    check_duplicates(MISSING_VIDEO_PROCESSOR_MAPPING_NAMES, video_processor_mapping)

    # The config mapping has to be one-to-one for correct `AutoConfig.from_pretrained()` because `LazyMapping`
    # reverts keys/values and creates a dict from it. Duplicate values will be overwritten by whatever comes at last
    # NOTE(review): if `CONFIG_MAPPING_NAMES` is a plain dict its keys are unique by construction, so this
    # first check can never fire; it is kept in case the mapping is a custom type — confirm and simplify.
    duplicate_keys = [n for n, c in Counter(COMPLETE_CONFIG_MAPPING_NAMES.keys()).items() if c > 1]
    if duplicate_keys:
        raise ValueError(
            f"Keys in `CONFIG_MAPPING_NAMES` contain duplicates = {duplicate_keys}. "
            "The mapping has to be one-to-one to ensure correct `AutoConfig` functionality!"
        )

    duplicate_values = [
        n
        for n, c in Counter(COMPLETE_CONFIG_MAPPING_NAMES.values()).items()
        if c > 1 and n not in IGNORE_DUPLICATE_CONFIG
    ]
    if duplicate_values:
        raise ValueError(
            f"Values in `CONFIG_MAPPING_NAMES` contain duplicates = {duplicate_values}. "
            "The mapping has to be one-to-one to ensure correct `AutoConfig` functionality!"
        )

    new_mappings = {
        "CONFIG_MAPPING_NAMES": config_mapping,
        "SPECIAL_MODEL_TYPE_TO_MODULE_NAME": special_mapping,
        "IMAGE_PROCESSOR_MAPPING_NAMES": image_processor_mapping,
        "VIDEO_PROCESSOR_MAPPING_NAMES": video_processor_mapping,
    }
    new_content = (
        AUTO_GENERATED_HADER
        + "\nfrom collections import OrderedDict\n\n"
        + "".join(format_ordered_dict(name=k, data=v) for k, v in new_mappings.items())
    )

    # 3. If the new auto-generate content is different, overwrite it
    # Dirty hack to sort and apply ruff to the file content, for easier matching
    with tempfile.TemporaryDirectory() as temp_folder:
        temp_filename = os.path.join(temp_folder, "temp.py")
        with open(temp_filename, "w", encoding="utf-8") as temp_file:
            temp_file.write(new_content)

        run_ruff_and_sort(temp_filename)
        with open(temp_filename, "r", encoding="utf-8") as temp_file:
            new_content = temp_file.read()

    if old_content == new_content:
        return

    if overwrite:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(new_content)
        return

    diff = "".join(
        difflib.unified_diff(
            old_content.splitlines(keepends=True),
            new_content.splitlines(keepends=True),
            fromfile=f"{filename} (on disk)",
            tofile=f"{filename} (regenerated)",
            n=3,
        )
    )
    raise Exception(
        "Generated auto-mapping is not consistent with the contents of `models/auto/auto_mappings.py`.\n"
        "Run `make fix-repo` or `python utils/check_auto.py --fix_and_overwrite` to fix them.\n\n"
        f"Diff (on disk → regenerated):\n{diff}"
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
    args = parser.parse_args()
    main(overwrite=args.fix_and_overwrite)