first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/trainer/distributed/test_trainer_distributed_ddp.py
+++ b/tests/trainer/distributed/test_trainer_distributed_ddp.py
@@ -0,0 +1,297 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+DDP-specific distributed trainer tests.
+"""
+
+import json
+import os
+import re
+
+from parameterized import parameterized
+
+from transformers.testing_utils import (
+    CaptureStderr,
+    TestCasePlus,
+    backend_device_count,
+    execute_subprocess_async,
+    get_torch_dist_unique_port,
+    require_torch_multi_accelerator,
+    slow,
+    torch_device,
+)
+from transformers.utils import is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
+
+from .test_trainer_distributed import CONFIGS_DIR, SCRIPTS_DIR, TrainerDistributedCommon
+
+
+DDP_CONFIG_FILE = os.path.join(CONFIGS_DIR, "ddp.yaml")
+
+dtypes = []
+if is_torch_bf16_available_on_device(torch_device):
+    dtypes += ["bf16"]
+if is_torch_fp16_available_on_device(torch_device):
+    dtypes += ["fp16"]
+
+pure_dtype_params = ["fp32"] + dtypes
+mixed_precision_params = list(dtypes)
+
+
+def _parameterized_custom_name_func(func, param_num, param):
+    param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
+    return f"{func.__name__}_{param_based_name}"
+
+
+class DDPCommandsMixin:
+    """Provides ``get_torchrun_cmd`` and ``get_accelerate_cmd`` for DDP."""
+
+    def get_torchrun_cmd(self, script, script_args=None, num_processes=None):
+        if num_processes is None:
+            num_processes = backend_device_count(torch_device)
+        port = get_torch_dist_unique_port()
+        cmd = [
+            "torchrun",
+            f"--nproc_per_node={num_processes}",
+            "--nnodes=1",
+            f"--master_port={port}",
+            script,
+        ]
+        if script_args:
+            cmd.extend(script_args)
+        return cmd
+
+    def get_accelerate_cmd(
+        self, script, config_file, launch_args=None, script_args=None, num_processes=None, **kwargs
+    ):
+        if num_processes is None:
+            num_processes = backend_device_count(torch_device)
+        port = get_torch_dist_unique_port()
+        cmd = [
+            "accelerate",
+            "launch",
+            "--config_file",
+            config_file,
+            "--num_processes",
+            str(num_processes),
+            "--main_process_port",
+            str(port),
+        ]
+        if launch_args:
+            cmd.extend(launch_args)
+        cmd.append(script)
+        if script_args:
+            cmd.extend(script_args)
+        return cmd
+
+
+@slow
+@require_torch_multi_accelerator
+class TestTrainerDistributedDDP(DDPCommandsMixin, TestCasePlus):
+    # -----------------------------------------------------------------------
+    # accelerate launch tests
+    # -----------------------------------------------------------------------
+    def test_eval_order(self):
+        output_dir = self.get_auto_remove_tmp_dir()
+        script = os.path.join(SCRIPTS_DIR, "eval_ddp.py")
+        cmd = self.get_accelerate_cmd(
+            script,
+            DDP_CONFIG_FILE,
+            script_args=["--output_dir", output_dir],
+        )
+        execute_subprocess_async(cmd, env=self.get_env())
+
+    def test_loss_averaging(self):
+        device_count = backend_device_count(torch_device)
+        min_bs = 2
+        output_dir = self.get_auto_remove_tmp_dir()
+        script = os.path.join(SCRIPTS_DIR, "loss_averaging.py")
+
+        # Launch 1: single-GPU baseline
+        cmd = self.get_accelerate_cmd(
+            script,
+            DDP_CONFIG_FILE,
+            script_args=[
+                "--output_dir",
+                f"{output_dir}/base",
+                "--per_device_train_batch_size",
+                str(min_bs * device_count),
+                "--average_tokens_across_devices",
+                "True",
+            ],
+            num_processes=1,
+        )
+        execute_subprocess_async(cmd, env=self.get_env())
+
+        # Launch 2: multi-GPU with both averaging modes in one process
+        cmd = self.get_accelerate_cmd(
+            script,
+            DDP_CONFIG_FILE,
+            script_args=[
+                "--output_dir",
+                f"{output_dir}/multi",
+                "--per_device_train_batch_size",
+                str(min_bs),
+                "--run_both_averaging_modes",
+            ],
+            num_processes=device_count,
+        )
+        execute_subprocess_async(cmd, env=self.get_env())
+
+        with open(f"{output_dir}/base_losses.json") as f:
+            base_loss = json.load(f)
+        with open(f"{output_dir}/multi/broken_losses.json") as f:
+            broken_loss = json.load(f)
+        with open(f"{output_dir}/multi/fixed_losses.json") as f:
+            fixed_loss = json.load(f)
+
+        broken_diff = [abs(base_loss[i] - broken_loss[i]) for i in range(len(base_loss))]
+        fixed_diff = [abs(base_loss[i] - fixed_loss[i]) for i in range(len(base_loss))]
+        sum_base = sum(base_loss)
+        sum_broken = sum(broken_loss)
+        relative_broken = abs(sum_base - sum_broken) / max(sum_base, sum_broken)
+
+        self.assertGreater(max(broken_diff), 0.5)
+        self.assertLess(max(fixed_diff), 0.005)
+        self.assertLess(relative_broken, 0.1)
+
+    def test_worker_seed(self):
+        output_dir = self.get_auto_remove_tmp_dir()
+        script = os.path.join(SCRIPTS_DIR, "worker_seed.py")
+        cmd = self.get_accelerate_cmd(
+            script,
+            DDP_CONFIG_FILE,
+            script_args=["--output_dir", output_dir],
+        )
+        execute_subprocess_async(cmd, env=self.get_env())
+
+    # -----------------------------------------------------------------------
+    # torchrun vs accelerate env parity
+    # -----------------------------------------------------------------------
+    def test_torchrun_accelerate_env_parity(self):
+        """Verify torchrun and accelerate launch produce the same distributed environment for DDP."""
+        script = os.path.join(SCRIPTS_DIR, "torchrun_env_check.py")
+        num_processes = backend_device_count(torch_device)
+
+        torchrun_dir = self.get_auto_remove_tmp_dir()
+        cmd = self.get_torchrun_cmd(script, script_args=["--output_dir", torchrun_dir], num_processes=num_processes)
+        execute_subprocess_async(cmd, env=self.get_env())
+
+        accelerate_dir = self.get_auto_remove_tmp_dir()
+        cmd = self.get_accelerate_cmd(
+            script, DDP_CONFIG_FILE, script_args=["--output_dir", accelerate_dir], num_processes=num_processes
+        )
+        execute_subprocess_async(cmd, env=self.get_env())
+
+        for rank in range(num_processes):
+            with open(os.path.join(torchrun_dir, f"env_rank{rank}.json")) as f:
+                tr = json.load(f)
+            with open(os.path.join(accelerate_dir, f"env_rank{rank}.json")) as f:
+                ac = json.load(f)
+
+            for info in (tr, ac):
+                # Rank consistency: env vars, TrainingArguments, and accelerator all agree
+                self.assertEqual(info["env_world_size"], str(num_processes))
+                self.assertEqual(info["env_rank"], str(rank))
+                self.assertEqual(info["env_local_rank"], str(rank))
+                self.assertEqual(info["args_process_index"], rank)
+                self.assertEqual(info["args_local_process_index"], rank)
+                self.assertIn(info["args_local_rank"], (rank, -1))  # may be -1 before framework consumes it
+                self.assertEqual(info["accelerator_process_index"], rank)
+                self.assertEqual(info["accelerator_local_process_index"], rank)
+                self.assertIsNotNone(info["env_master_addr"])
+                self.assertIsNotNone(info["env_master_port"])
+
+                # World size and parallel mode
+                self.assertEqual(info["args_world_size"], num_processes)
+                self.assertEqual(info["args_n_gpu"], 1)
+                self.assertEqual(info["args_parallel_mode"], "ParallelMode.DISTRIBUTED")
+                self.assertEqual(info["accelerator_num_processes"], num_processes)
+                self.assertTrue(info["accelerator_use_distributed"])
+                self.assertEqual(info["accelerator_is_main_process"], rank == 0)
+                self.assertEqual(info["accelerator_is_local_main_process"], rank == 0)
+
+                # DDP: distributed type is MULTI_GPU
+                self.assertEqual(info["accelerator_distributed_type"], "DistributedType.MULTI_GPU")
+
+                # Each rank on its own device
+                self.assertIn(f"{torch_device}:{rank}", info["accelerator_device"])
+
+                # DDP should not activate FSDP or DeepSpeed
+                self.assertFalse(info["trainer_is_fsdp_enabled"])
+                self.assertFalse(info["trainer_is_deepspeed_enabled"])
+                self.assertNotIn("fsdp_version", info)
+                self.assertNotIn("deepspeed_zero_stage", info)
+
+    @parameterized.expand(
+        [
+            ("base", "--log_level info", 1),
+            ("low", "--log_level debug --log_level_replica debug", 2),
+            ("high", "--log_level error --log_level_replica debug", 1),
+            ("mixed", "--log_level error --log_level_replica error", 0),
+        ]
+    )
+    def test_log_level_replica(self, _name, extra_args_str, expected_matches):
+        """Test that log_level_replica controls logging on non-main processes."""
+        output_dir = self.get_auto_remove_tmp_dir()
+        script = os.path.join(SCRIPTS_DIR, "train.py")
+        script_args = [
+            "--output_dir",
+            output_dir,
+            "--num_train_epochs",
+            "1",
+            "--per_device_train_batch_size",
+            "4",
+            "--logging_strategy",
+            "no",
+        ]
+        if extra_args_str:
+            script_args.extend(extra_args_str.split())
+        cmd = self.get_accelerate_cmd(script, DDP_CONFIG_FILE, script_args=script_args, num_processes=2)
+        log_info_string = "Running training"
+        with CaptureStderr() as cl:
+            execute_subprocess_async(cmd, env=self.get_env())
+        n_matches = len(re.findall(log_info_string, cl.err))
+        self.assertEqual(n_matches, expected_matches)
+
+
+# ---------------------------------------------------------------------------
+# DDP training integration tests (using train.py)
+# ---------------------------------------------------------------------------
+
+
+@slow
+@require_torch_multi_accelerator
+class TestTrainerDistributedDDPCommon(DDPCommandsMixin, TrainerDistributedCommon, TestCasePlus):
+    """
+    Distributed DDP training tests using ``accelerate launch`` with the shared
+    train.py script. Mirrors the test structure used in FSDP and DeepSpeed.
+    """
+
+    @parameterized.expand(pure_dtype_params, name_func=_parameterized_custom_name_func)
+    def test_training(self, dtype):
+        self.check_training(dtype, config_file=DDP_CONFIG_FILE)
+
+    @parameterized.expand(mixed_precision_params, name_func=_parameterized_custom_name_func)
+    def test_training_mixed_precision(self, dtype):
+        self.check_mixed_precision(dtype, config_file=DDP_CONFIG_FILE)
+
+    def test_training_with_gradient_accumulation(self):
+        self.check_gradient_accumulation(config_file=DDP_CONFIG_FILE)
+
+    def test_training_and_can_resume_normally(self):
+        self.check_resume(config_file=DDP_CONFIG_FILE)
+
+    def test_eval(self):
+        self.check_eval(config_file=DDP_CONFIG_FILE)