first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/utils/check_modular_conversion.py
+++ b/utils/check_modular_conversion.py
@@ -0,0 +1,284 @@
+import argparse
+import difflib
+import glob
+import logging
+import multiprocessing
+import os
+import shutil
+import subprocess
+from functools import partial
+
+from create_dependency_mapping import find_priority_list
+
+# Conversion helpers, followed by rich (`Console`, `Syntax`) which is used for console printing
+from modular_model_converter import convert_modular_file, run_ruff
+from rich.console import Console
+from rich.syntax import Syntax
+
+
+# Descriptor for this checker.
+# NOTE(review): nothing in this file reads it; presumably it is consumed by an external checker
+# runner — confirm.
+CHECKER_CONFIG = {
+    "name": "modular_conversion",
+    "label": "Modular file conversions",
+    # Globs the modular sources; also reads generated modeling_*.py at runtime for diffing.
+    "cache_globs": ["src/transformers/models/**/modular_*.py", "src/transformers/models/**/modeling_*.py"],
+    "check_args": [],
+    # Same spelling as the `--fix_and_overwrite` CLI flag declared in the `__main__` section.
+    "fix_args": ["--fix_and_overwrite"],
+}
+
+# Configure the root logger and raise its level to ERROR.
+logging.basicConfig()
+logging.getLogger().setLevel(logging.ERROR)
+# Shared rich console used for the user-facing output of this script.
+console = Console()
+
+# Suffix appended to a file's path for the backup copy that `process_file` makes before overwriting it;
+# the `__main__` section restores from (when not fixing) and deletes these backups when it finishes.
+BACKUP_EXT = ".modular_backup"
+
+
+def process_file(
+    modular_file_path,
+    generated_modeling_content,
+    file_type="modeling_",
+    show_diff=True,
+):
+    """Compare one generated file with its on-disk counterpart and overwrite the latter on mismatch.
+
+    The target path is derived from the modular file name: `modular_` is replaced by the part of
+    `file_type` before `.*` followed by `_`, and the part after `.*` (if any) is inserted before `.py`.
+    Only the file name is rewritten, so directory components that happen to contain `modular_` or `.py`
+    are left untouched.
+
+    When the contents differ, the existing file is first copied to `<path>` + `BACKUP_EXT` and is then
+    overwritten with the generated content, regardless of `show_diff`.
+
+    NOTE(review): with the default `file_type="modeling_"` the derived name contains a double underscore
+    (`modeling__<name>.py`); `compare_files` always passes `file_type` explicitly — confirm the default.
+
+    Args:
+        modular_file_path: Path to the `modular_*.py` source file.
+        generated_modeling_content: Mapping from file type to the generated file content (str).
+        file_type: Key of `generated_modeling_content` to check.
+        show_diff: If True, print the unified diff; otherwise print a short "overwritten" notice.
+
+    Returns:
+        1 if the on-disk file differed from the generated content, 0 otherwise.
+
+    Raises:
+        OSError: If the target file cannot be read, backed up or written (e.g. it does not exist).
+    """
+    file_name_prefix = file_type.split(".*")[0]
+    file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
+    # Rewrite the file name only; slicing (instead of os.path.join) keeps the original separators as-is.
+    modular_file_name = os.path.basename(modular_file_path)
+    directory_part = modular_file_path[: len(modular_file_path) - len(modular_file_name)]
+    target_file_name = modular_file_name.replace("modular_", f"{file_name_prefix}_").replace(
+        ".py", f"{file_name_suffix}.py"
+    )
+    file_path = directory_part + target_file_name
+    generated_content = generated_modeling_content[file_type]
+
+    # Read the actual modeling file
+    with open(file_path, "r", encoding="utf-8") as modeling_file:
+        content = modeling_file.read()
+    diff_list = list(
+        difflib.unified_diff(
+            generated_content.splitlines(),
+            content.splitlines(),
+            fromfile=f"{file_path}_generated",
+            tofile=f"{file_path}",
+            lineterm="",
+        )
+    )
+    if not diff_list:
+        return 0
+
+    # first save the copy of the original file, to be able to restore it later
+    shutil.copy(file_path, file_path + BACKUP_EXT)
+    # we always save the generated content, to be able to update dependent files
+    with open(file_path, "w", encoding="utf-8", newline="\n") as modeling_file:
+        modeling_file.write(generated_content)
+    if show_diff:
+        console.print(f"\n[bold red]Differences found between the generated code and {file_path}:[/bold red]\n")
+        diff_text = "\n".join(diff_list)
+        syntax = Syntax(diff_text, "diff", theme="ansi_dark", line_numbers=True)
+        console.print(syntax)
+    else:
+        console.print(f"[bold blue]Overwritten {file_path} with the generated content.[/bold blue]")
+    return 1
+
+
+def convert_and_run_ruff(modular_file_path: str) -> dict[str, str]:
+    """From a modular file, convert it and return all the contents of the file as string.
+
+    We need this function, because `ruff` needs the final filename to apply all rules correctly, so to get the
+    output as a string, we need to save a temporary file with similar name, run ruff, and re-read the temporary file.
+
+    The temporary file is written and read back as UTF-8 (the encoding `process_file` uses for the real
+    files) and is removed even when `run_ruff` or the I/O fails, so a failed run leaves no stray
+    `*_temp_pattern__*.py` file behind.
+
+    Args:
+        modular_file_path: Path to the `modular_*.py` file to convert.
+
+    Returns:
+        The mapping returned by `convert_modular_file`, with every value replaced by the content of the
+        corresponding temporary file after `run_ruff` has processed it.
+    """
+    # Generate the expected modeling content
+    generated_modeling_content = convert_modular_file(modular_file_path)
+    # Only the file name is rewritten; directory components containing `modular_` or `.py` are left untouched.
+    modular_file_name = os.path.basename(modular_file_path)
+    directory_part = modular_file_path[: len(modular_file_path) - len(modular_file_name)]
+    # Temporary save the files with similar names to run `ruff` correctly, then re-read the result after linting/formatting
+    # (values of existing keys are replaced while iterating; no key is added or removed)
+    for file_type in generated_modeling_content:
+        file_name_prefix = file_type.split(".*")[0]
+        file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
+        temp_file_name = directory_part + modular_file_name.replace("modular_", f"{file_name_prefix}_").replace(
+            ".py", f"_temp_pattern__{file_name_suffix}.py"
+        )
+        try:
+            # Write the file only temporarily
+            with open(temp_file_name, "w", encoding="utf-8") as f:
+                f.write(generated_modeling_content[file_type])
+            # Run ruff on the new file (with similar name pattern as the original one)
+            run_ruff(temp_file_name)
+            with open(temp_file_name, "r", encoding="utf-8") as f:
+                generated_modeling_content[file_type] = f.read()
+        finally:
+            # delete file (it may not exist if opening it for writing failed)
+            if os.path.exists(temp_file_name):
+                os.remove(temp_file_name)
+
+    return generated_modeling_content
+
+
+def compare_files(modular_file_path, show_diff=True):
+    """Convert a modular file and count how many of its generated files differ from disk.
+
+    Args:
+        modular_file_path: Path to the `modular_*.py` file to convert and check.
+        show_diff: Forwarded to `process_file` for every generated file type.
+
+    Returns:
+        The sum of the `process_file` results, i.e. the number of generated files that did not match.
+    """
+    # Expected content for every file type produced from this modular file (already passed through ruff)
+    expected_by_type = convert_and_run_ruff(modular_file_path)
+    return sum(
+        process_file(modular_file_path, expected_by_type, kind, show_diff) for kind in expected_by_type
+    )
+
+
+# Changes to any of these files can alter the generated output for every modular model,
+# so touching them must force a full re-check (see `converter_changed_in_diff`).
+# Entries are compared verbatim against the paths returned by `_get_modified_files`
+# (presumably repo-root-relative, as printed by `git diff --name-only` — confirm).
+CONVERTER_FILES = {
+    "utils/modular_model_converter.py",
+    "utils/create_dependency_mapping.py",
+}
+
+
+def _get_modified_files():
+    """List the files that differ from the merge base of `main` and `HEAD`.
+
+    Runs `git merge-base main HEAD`, then `git diff --diff-filter=d --name-only -z <sha>`. The
+    lowercase `d` filter excludes deleted files. `-z` makes git print NUL-terminated, unquoted
+    path names, so paths containing whitespace or non-ASCII characters come back intact instead of
+    being split on whitespace.
+
+    Returns:
+        A list of path strings exactly as printed by git.
+
+    Raises:
+        subprocess.CalledProcessError: If either git command exits with a non-zero status.
+    """
+    fork_point_sha = subprocess.check_output(["git", "merge-base", "main", "HEAD"]).decode("utf-8").strip()
+    raw_output = subprocess.check_output(
+        ["git", "diff", "--diff-filter=d", "--name-only", "-z", fork_point_sha]
+    ).decode("utf-8")
+    # Every path is followed by a NUL, so drop the empty trailing field.
+    return [path for path in raw_output.split("\0") if path]
+
+
+def get_models_in_diff():
+    """
+    Collects the models touched by the current diff.
+
+    A modified file counts when its path contains `/models/` and ends with `.py` (this covers both
+    modelling files and tests); the model name is the name of the directory holding the file.
+
+    Returns:
+        A set with the names of the modified models (e.g. {'llama', 'whisper'}).
+    """
+    return {
+        changed_path.split("/")[-2]
+        for changed_path in _get_modified_files()
+        if "/models/" in changed_path and changed_path.endswith(".py")
+    }
+
+
+def converter_changed_in_diff():
+    """Tell whether a modified file is listed in `CONVERTER_FILES`, i.e. can change every model's output."""
+    touched_files = _get_modified_files()
+    return not CONVERTER_FILES.isdisjoint(touched_files)
+
+
+def guaranteed_no_diff(modular_file_path, dependencies, models_in_diff):
+    """
+    Decides whether a modular file can be skipped because its generated output cannot have changed.
+
+    The answer is False as soon as the model itself, or the model of any of its dependencies, appears
+    in `models_in_diff`; it is True otherwise.
+
+    Args:
+        modular_file_path: The path to the modular file; the model name is whatever follows the last
+            `modular_` in it, with `.py` removed.
+        dependencies: A dictionary mapping each modular file path to its dotted dependency names.
+        models_in_diff: A set containing the names of the models that have been modified.
+
+    Returns:
+        True when neither the model nor any of its dependencies is in the diff, False otherwise.
+    """
+    own_model = modular_file_path.rsplit("modular_", 1)[1].replace(".py", "")
+    if own_model in models_in_diff:
+        return False
+    # A dependency looks like `transformers.models.model_name.(...)` or `model_name.(...)`;
+    # in both forms the model name is the second-to-last dotted component.
+    return not any(
+        dotted_name.split(".")[-2] in models_in_diff for dotted_name in dependencies[modular_file_path]
+    )
+
+
+if __name__ == "__main__":
+    # Script entry point. Steps:
+    #   1. parse the CLI arguments and resolve the list of modular files and the worker count;
+    #   2. work out which models are "in the diff" (everything when on `main` or when the converter changed);
+    #   3. convert/compare the files level by level of the dependency order, in parallel within a level;
+    #   4. always restore (check mode) and delete the backups written by `process_file`;
+    #   5. in check mode, raise if any generated file did not match.
+    parser = argparse.ArgumentParser(description="Compare modular_xxx.py files with modeling_xxx.py files.")
+    parser.add_argument(
+        "--files", default=["all"], type=str, nargs="+", help="List of modular_xxx.py files to compare."
+    )
+    parser.add_argument(
+        "--fix_and_overwrite", action="store_true", help="Overwrite the modeling_xxx.py file if differences are found."
+    )
+    parser.add_argument("--check_all", action="store_true", help="Check all files, not just the ones in the diff.")
+    parser.add_argument(
+        "--num_workers",
+        default=-1,
+        type=int,
+        help="The number of workers to run. Default is -1, which means the number of CPU cores.",
+    )
+    args = parser.parse_args()
+    if args.files == ["all"]:
+        args.files = glob.glob("src/transformers/models/**/modular_*.py", recursive=True)
+
+    if args.num_workers == -1:
+        args.num_workers = multiprocessing.cpu_count()
+
+    # Assuming there is a topological sort on the dependency mapping: if the file being checked and its dependencies
+    # are not in the diff, then it is guaranteed to have no differences. If no models are in the diff, then this
+    # script will do nothing.
+    current_branch = subprocess.check_output(["git", "branch", "--show-current"], text=True).strip()
+    if current_branch == "main":
+        console.print(
+            "[bold red]You are developing on the main branch. We cannot identify the list of changed files and will have to check all files. This may take a while.[/bold red]"
+        )
+        models_in_diff = {file_path.split("/")[-2] for file_path in args.files}
+    elif converter_changed_in_diff():
+        # The converter (or its dependency-mapping helper) is in the diff: its output can shift
+        # for any model, so restrict-by-diff would miss regressions. Force a full check.
+        console.print("[bold yellow]Converter change detected in diff; checking all modular files.[/bold yellow]")
+        args.check_all = True
+        models_in_diff = {file_path.split("/")[-2] for file_path in args.files}
+    else:
+        models_in_diff = get_models_in_diff()
+        if not models_in_diff and not args.check_all:
+            # Nothing to check. `SystemExit` is raised directly because the `exit()` helper is injected
+            # by the `site` module for interactive use and is not guaranteed to exist in programs.
+            raise SystemExit(0)
+
+    non_matching_files = []
+    ordered_files, dependencies = find_priority_list(args.files)
+
+    # ordered_files is a *sorted* list of lists of filepaths
+    #  - files from the first list do NOT depend on other files
+    #  - files in the second list depend on files from the first list
+    #  - files in the third list depend on files from the second and (optionally) the first list
+    #  - ... and so on
+    # files (models) within the same list are *independent* of each other;
+    # we start applying modular conversion to each list in parallel, starting from the first list
+    try:
+        for dependency_level_files in ordered_files:
+            # Filter files guaranteed no diff
+            files_to_check = [
+                file_path
+                for file_path in dependency_level_files
+                if args.check_all or not guaranteed_no_diff(file_path, dependencies, models_in_diff)
+            ]
+
+            if not files_to_check:
+                continue
+
+            # Process files with diff; never ask for fewer than one worker (e.g. `--num_workers 0`),
+            # which `multiprocessing.Pool` rejects
+            num_workers = max(1, min(args.num_workers, len(files_to_check)))
+            with multiprocessing.Pool(num_workers) as p:
+                try:
+                    is_changed_flags = p.map(
+                        partial(compare_files, show_diff=not args.fix_and_overwrite),
+                        files_to_check,
+                    )
+                except Exception as e:
+                    console.print(
+                        f"[bold red]Failed to convert one or more files in batch: {files_to_check}[/bold red]"
+                    )
+                    console.print(f"[bold red]Error: {e}[/bold red]")
+                    # Try to process files individually to identify which one failed
+                    is_changed_flags = []
+                    for file_path in files_to_check:
+                        try:
+                            result = compare_files(file_path, show_diff=not args.fix_and_overwrite)
+                            is_changed_flags.append(result)
+                        except Exception as individual_error:
+                            console.print(f"[bold red]Failed to convert {file_path}: {individual_error}[/bold red]")
+                            # NOTE(review): a file that fails to convert is recorded as "no change", so it
+                            # does not make the check fail — confirm this best-effort behaviour is intended.
+                            is_changed_flags.append(0)  # Mark as no change to continue processing
+
+            # Collect changed files and their original paths
+            for is_changed, file_path in zip(is_changed_flags, files_to_check):
+                if is_changed:
+                    non_matching_files.append(file_path)
+
+                    # Update changed models, after each round of conversions
+                    # (save model folder name)
+                    models_in_diff.add(file_path.split("/")[-2])
+
+    finally:
+        # Restore overwritten files by modular (if needed); backups are searched below the current directory
+        backup_files = glob.glob("**/*" + BACKUP_EXT, recursive=True)
+        for backup_file_path in backup_files:
+            overwritten_path = backup_file_path.replace(BACKUP_EXT, "")
+            # In fix mode the generated content is kept, so the backup is only deleted
+            if not args.fix_and_overwrite and os.path.exists(overwritten_path):
+                shutil.copy(backup_file_path, overwritten_path)
+            os.remove(backup_file_path)
+
+    if non_matching_files and not args.fix_and_overwrite:
+        diff_models = {file_path.split("/")[-2] for file_path in non_matching_files}
+        models_str = "\n - " + "\n - ".join(sorted(diff_models))
+        raise ValueError(f"Some diff and their modeling code did not match. Models in diff:{models_str}")