first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/trainer/distributed/scripts/dispatch_batches.py
+++ b/tests/trainer/distributed/scripts/dispatch_batches.py
@@ -0,0 +1,88 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Worker script for dispatch_batches=False with a finite iterable dataset.
+
+Verifies that training completes successfully when ``dispatch_batches``
+is disabled.
+
+Run via torchrun or accelerate launch.
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import IterableDataset
+
+from transformers import HfArgumentParser, Trainer, TrainingArguments
+
+
+class RegressionModel(nn.Module):
+    def __init__(self, a=0, b=0):
+        super().__init__()
+        self.a = nn.Parameter(torch.tensor(a).float())
+        self.b = nn.Parameter(torch.tensor(b).float())
+        self.config = None
+
+    def forward(self, input_x, labels=None, **kwargs):
+        y = input_x * self.a + self.b
+        if labels is None:
+            return (y,)
+        loss = nn.functional.mse_loss(y, labels)
+        return (loss, y)
+
+
+class RegressionDataset:
+    def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
+        np.random.seed(seed)
+        self.label_names = ["labels"] if label_names is None else label_names
+        self.length = length
+        self.x = np.random.normal(size=(length,)).astype(np.float32)
+        self.ys = [a * self.x + b + np.random.normal(scale=0.1, size=(length,)) for _ in self.label_names]
+        self.ys = [y.astype(np.float32) for y in self.ys]
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, i):
+        result = {name: y[i] for name, y in zip(self.label_names, self.ys)}
+        result["input_x"] = self.x[i]
+        return result
+
+
+class FiniteIterableDataset(IterableDataset):
+    def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
+        self.dataset = RegressionDataset(a=a, b=b, length=length, seed=seed, label_names=label_names)
+        self.current_sample = 0  # read cursor; kept on the instance and never reset
+
+    def __iter__(self):
+        while self.current_sample < len(self.dataset):  # resumes from the stored cursor, not from index 0
+            yield self.dataset[self.current_sample]
+            self.current_sample += 1
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser((TrainingArguments,))
+    training_args = parser.parse_args_into_dataclasses()[0]
+
+    training_args.per_device_train_batch_size = 1
+    training_args.max_steps = 1
+    training_args.accelerator_config.dispatch_batches = False
+
+    train_dataset = FiniteIterableDataset(label_names=["labels", "extra"], length=1)
+    model = RegressionModel()
+
+    trainer = Trainer(model, training_args, train_dataset=train_dataset)
+    trainer.train()
--- a/tests/trainer/distributed/scripts/ds_config_zero2.json
+++ b/tests/trainer/distributed/scripts/ds_config_zero2.json
@@ -0,0 +1,32 @@
+{
+    "fp16": {
+        "enabled": "auto"
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+    "zero_optimization": {
+        "stage": 2
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto"
+}
--- a/tests/trainer/distributed/scripts/ds_config_zero3.json
+++ b/tests/trainer/distributed/scripts/ds_config_zero3.json
@@ -0,0 +1,35 @@
+{
+    "fp16": {
+        "enabled": "auto"
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto"
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto"
+}
--- a/tests/trainer/distributed/scripts/eval_ddp.py
+++ b/tests/trainer/distributed/scripts/eval_ddp.py
@@ -0,0 +1,113 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Worker script for eval/predict ordering tests.
+
+Verifies that distributed eval/predict returns all samples in the correct order.
+
+Run via torchrun or accelerate launch.
+"""
+
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset
+
+from transformers import EvalPrediction, HfArgumentParser, Trainer, TrainingArguments
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DummyDataset(Dataset):
+    def __init__(self, length: int = 101):
+        self.length = length
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, i) -> int:
+        return i
+
+
+class DummyDataCollator:
+    def __call__(self, features):
+        return {"input_ids": torch.tensor(features), "labels": torch.tensor(features)}
+
+
+class DummyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Add some (unused) params otherwise DDP will complain.
+        self.fc = nn.Linear(120, 80)
+
+    def forward(self, input_ids, labels=None):
+        if labels is not None:
+            return torch.tensor(0.0, device=input_ids.device), input_ids
+        else:
+            return input_ids
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser((TrainingArguments,))
+    training_args = parser.parse_args_into_dataclasses()[0]
+
+    for dataset_length in [49, 7]:
+        dataset = DummyDataset(dataset_length)
+
+        def compute_metrics(p: EvalPrediction) -> dict:
+            sequential = list(range(len(dataset)))
+            success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential
+            if not success and training_args.local_process_index == 0:
+                logger.warning(
+                    "Predictions and/or labels do not match expected results:\n  - predictions: "
+                    f"{p.predictions.tolist()}\n  - labels: {p.label_ids.tolist()}\n  - expected: {sequential}"
+                )
+            return {"success": success}
+
+        trainer = Trainer(
+            model=DummyModel(),
+            args=training_args,
+            data_collator=DummyDataCollator(),
+            eval_dataset=dataset,
+            compute_metrics=compute_metrics,
+        )
+        metrics = trainer.evaluate()
+        logger.info(metrics)
+        if metrics["eval_success"] is not True:
+            logger.error(metrics)
+            raise SystemExit(1)  # the bare `exit` helper is injected by `site` and is not meant for programs
+
+        p = trainer.predict(dataset)
+        logger.info(p.metrics)
+        if p.metrics["test_success"] is not True:
+            logger.error(p.metrics)
+            raise SystemExit(1)
+
+        trainer.args.eval_accumulation_steps = 2  # repeat both checks with eval accumulation set
+
+        metrics = trainer.evaluate()
+        logger.info(metrics)
+        if metrics["eval_success"] is not True:
+            logger.error(metrics)
+            raise SystemExit(1)
+
+        p = trainer.predict(dataset)
+        logger.info(p.metrics)
+        if p.metrics["test_success"] is not True:
+            logger.error(p.metrics)
+            raise SystemExit(1)
+
+        trainer.args.eval_accumulation_steps = None  # clear the override before the next dataset length
--- a/tests/trainer/distributed/scripts/fsdp_generate.py
+++ b/tests/trainer/distributed/scripts/fsdp_generate.py
@@ -0,0 +1,125 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Worker script for FSDP generation tests.
+
+Launched via ``torchrun`` from ``test_trainer_distributed_fsdp.py``.
+"""
+
+import argparse
+import functools
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.distributed
+from torch.distributed._composable.fsdp import fully_shard, register_fsdp_forward_method
+from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp import FullyShardedDataParallel
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block
+from transformers.testing_utils import backend_device_count, backend_torch_accelerator_module, torch_device
+
+
+data = 4 * [
+    "Hello world!",
+    "The quick brown fox jumps over the lazy dog.",
+]
+
+
+def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]:
+    """Wrap ``func`` so the process group is created before the call and always destroyed after it."""
+
+    @functools.wraps(func)  # keep the wrapped function's __name__/__doc__ on the returned callable
+    def wrapped(*args: Any, **kwargs: Any) -> Any:
+        torch.distributed.init_process_group(world_size=backend_device_count(torch_device))
+        try:
+            return func(*args, **kwargs)
+        finally:
+            torch.distributed.destroy_process_group()
+
+    return wrapped
+
+
+@manage_process_group
+def fsdp_generate():
+    torch_accelerator_module = backend_torch_accelerator_module(torch_device)
+    torch_accelerator_module.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+
+    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)
+
+    fsdp_model = FullyShardedDataParallel(
+        model,
+        auto_wrap_policy=functools.partial(transformer_auto_wrap_policy, transformer_layer_cls={GPT2Block}),
+        limit_all_gathers=True,
+        use_orig_params=True,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+    batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device)
+
+    with FullyShardedDataParallel.summon_full_params(fsdp_model):
+        _ = fsdp_model.module.generate(
+            input_ids=batch["input_ids"],
+            attention_mask=batch["attention_mask"],
+            max_length=30,
+        )
+
+
+@manage_process_group
+def fsdp2_generate():
+    torch_accelerator_module = backend_torch_accelerator_module(torch_device)
+    torch_accelerator_module.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+
+    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)
+
+    mesh = init_device_mesh(device.type, (torch.distributed.get_world_size(),))
+    for submodule in model.modules():
+        if isinstance(submodule, GPT2Block):
+            fully_shard(submodule, mesh=mesh)
+    fully_shard(model, mesh=mesh)
+
+    register_fsdp_forward_method(model, "generate")
+
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+    batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device)
+
+    _ = model.generate(
+        input_ids=batch["input_ids"],
+        attention_mask=batch["attention_mask"],
+        max_length=30,
+    )
+
+
+if __name__ == "__main__":
+
+    class CLIArgs(argparse.Namespace):
+        fsdp: bool
+        fsdp2: bool
+
+    parser = argparse.ArgumentParser()
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument("--fsdp", action="store_true")
+    group.add_argument("--fsdp2", action="store_true")
+    args = parser.parse_args(namespace=CLIArgs())
+
+    if args.fsdp:
+        fsdp_generate()
+    elif args.fsdp2:
+        fsdp2_generate()
+    else:
+        raise ValueError("Missing test selection")
--- a/tests/trainer/distributed/scripts/loss_averaging.py
+++ b/tests/trainer/distributed/scripts/loss_averaging.py
@@ -0,0 +1,114 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Worker script for loss averaging tests.
+
+Verifies that ``average_tokens_across_devices`` produces correct loss
+compared to a single-GPU baseline.
+
+When ``--run_both_averaging_modes`` is passed, the script runs training
+twice (with and without averaging) in a single process launch, saving
+``<output_dir>/broken_losses.json`` and ``<output_dir>/fixed_losses.json``.
+
+Run via torchrun or accelerate launch.
+"""
+
+import argparse
+import json
+
+import datasets
+import torch
+
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    HfArgumentParser,
+    Trainer,
+    TrainerCallback,
+    TrainingArguments,
+    set_seed,
+)
+
+
+class StoreLossCallback(TrainerCallback):
+    """Callback that appends every ``"loss"`` value seen in ``on_log`` to ``self.losses``."""
+
+    def __init__(self):
+        self.losses = []  # logged loss values, in the order ``on_log`` received them
+
+    def on_log(self, args, state, control, logs=None, **kwargs):
+        if logs and "loss" in logs:  # ``logs`` defaults to None, and ``"loss" in None`` raises TypeError
+            self.losses.append(logs["loss"])
+
+
+def run_distributed_training(training_args, loss_file):
+    set_seed(42)
+    model_name = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
+    dataset_name = "wikitext"
+    dataset_config = "wikitext-2-raw-v1"
+    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:50]")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    def tokenize_function(examples):
+        return tokenizer(examples["text"], max_length=128, padding="max_length", truncation=True)
+
+    tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
+
+    loss_callback = StoreLossCallback()
+
+    training_args.logging_steps = 1
+    training_args.max_steps = 10
+    training_args.learning_rate = 3e-4
+    training_args.disable_tqdm = True
+    training_args.dataloader_drop_last = True
+
+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=tokenized_dataset,
+        callbacks=[loss_callback],
+        data_collator=data_collator,
+    )
+    trainer.train()
+    with open(loss_file, "w") as f:
+        json.dump(loss_callback.losses, f)
+
+
+if __name__ == "__main__":
+    # Parse our custom flag first, pass the rest to HfArgumentParser.
+    pre_parser = argparse.ArgumentParser(add_help=False)
+    pre_parser.add_argument("--run_both_averaging_modes", action="store_true")
+    custom_args, remaining = pre_parser.parse_known_args()
+
+    hf_parser = HfArgumentParser((TrainingArguments,))
+    (training_args,) = hf_parser.parse_args_into_dataclasses(remaining)
+
+    if custom_args.run_both_averaging_modes:
+        base_dir = training_args.output_dir
+        # Run without averaging ("broken")
+        training_args.average_tokens_across_devices = False
+        training_args.output_dir = base_dir + "/broken"
+        run_distributed_training(training_args, loss_file=base_dir + "/broken_losses.json")
+        # Run with averaging ("fixed")
+        training_args.average_tokens_across_devices = True
+        training_args.output_dir = base_dir + "/fixed"
+        run_distributed_training(training_args, loss_file=base_dir + "/fixed_losses.json")
+    else:
+        run_distributed_training(training_args, loss_file=training_args.output_dir + "_losses.json")
--- a/tests/trainer/distributed/scripts/torchrun_env_check.py
+++ b/tests/trainer/distributed/scripts/torchrun_env_check.py
@@ -0,0 +1,93 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Dumps distributed environment info to a JSON file for verification.
+
+This script creates a Trainer (which initializes the accelerator) and writes
+each worker's env vars, TrainingArguments fields, and accelerator state to
+``<output_dir>/env_rank<N>.json``.
+
+Accepts all TrainingArguments flags (e.g. ``--deepspeed``, ``--fsdp``) so the
+Trainer sets up the correct framework regardless of launcher.
+
+Works with any launcher (torchrun, accelerate launch with DDP/FSDP/DeepSpeed).
+"""
+
+import json
+import os
+
+from transformers import AutoModelForCausalLM, HfArgumentParser, Trainer, TrainingArguments
+
+
+def main():
+    parser = HfArgumentParser((TrainingArguments,))
+    (args,) = parser.parse_args_into_dataclasses()
+    args.disable_tqdm = True
+
+    model_name = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+    trainer = Trainer(model=model, args=args)
+    accelerator = trainer.accelerator
+
+    env_info = {
+        # Raw env vars set by torchrun / accelerate
+        "env_world_size": os.environ.get("WORLD_SIZE"),
+        "env_rank": os.environ.get("RANK"),
+        "env_local_rank": os.environ.get("LOCAL_RANK"),
+        "env_master_addr": os.environ.get("MASTER_ADDR"),
+        "env_master_port": os.environ.get("MASTER_PORT"),
+        # TrainingArguments-derived values
+        "args_local_rank": args.local_rank,
+        "args_world_size": args.world_size,
+        "args_process_index": args.process_index,
+        "args_local_process_index": args.local_process_index,
+        "args_parallel_mode": str(args.parallel_mode),
+        "args_n_gpu": args.n_gpu,
+        # Accelerator state
+        "accelerator_num_processes": accelerator.num_processes,
+        "accelerator_process_index": accelerator.process_index,
+        "accelerator_local_process_index": accelerator.local_process_index,
+        "accelerator_is_main_process": accelerator.is_main_process,
+        "accelerator_is_local_main_process": accelerator.is_local_main_process,
+        "accelerator_use_distributed": accelerator.use_distributed,
+        "accelerator_distributed_type": str(accelerator.distributed_type),
+        "accelerator_device": str(accelerator.device),
+        # Trainer-level flags (these gate framework-specific code paths)
+        "trainer_is_fsdp_enabled": trainer.is_fsdp_enabled,
+        "trainer_is_deepspeed_enabled": trainer.is_deepspeed_enabled,
+    }
+
+    # FSDP plugin info
+    fsdp_plugin = getattr(accelerator.state, "fsdp_plugin", None)
+    if fsdp_plugin is not None:
+        env_info["fsdp_version"] = getattr(fsdp_plugin, "fsdp_version", None)
+        env_info["fsdp_sharding_strategy"] = str(getattr(fsdp_plugin, "sharding_strategy", None))
+        env_info["fsdp_cpu_offload"] = str(getattr(fsdp_plugin, "cpu_offload", None))
+        env_info["fsdp_auto_wrap_policy"] = str(getattr(fsdp_plugin, "auto_wrap_policy", None))
+
+    # DeepSpeed plugin info
+    deepspeed_plugin = getattr(accelerator.state, "deepspeed_plugin", None)
+    if deepspeed_plugin is not None:
+        env_info["deepspeed_zero_stage"] = deepspeed_plugin.zero_stage
+        env_info["deepspeed_offload_optimizer_device"] = str(deepspeed_plugin.offload_optimizer_device)
+        env_info["deepspeed_offload_param_device"] = str(deepspeed_plugin.offload_param_device)
+
+    output_file = os.path.join(args.output_dir, f"env_rank{args.process_index}.json")
+    with open(output_file, "w") as f:
+        json.dump(env_info, f)
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/trainer/distributed/scripts/train.py
+++ b/tests/trainer/distributed/scripts/train.py
@@ -0,0 +1,136 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Simple causal LM script for distributed tests (FSDP, DeepSpeed).
+
+Uses a tiny Qwen2 model with synthetic data so tests run fast
+and don't require downloading real datasets.
+
+Supports --do_train (default) and --do_eval via TrainingArguments.
+
+32 training samples are created; with per_device_train_batch_size=4
+and 2 GPUs this gives 4 steps per epoch.
+"""
+
+import json
+import sys
+
+import torch
+
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+)
+
+
+DTYPE_MAP = {"fp32": torch.float32, "bf16": torch.bfloat16, "fp16": torch.float16}
+
+
+def _pop_custom_arg(name):
+    """Remove ``name`` and the value after it from sys.argv and return the value (None if ``name`` is absent)."""
+    if name not in sys.argv:
+        return None
+    idx = sys.argv.index(name)
+    if idx + 1 >= len(sys.argv):  # flag given as the last token, so there is no value to read
+        raise ValueError(f"{name} expects a value, but none was given")
+    sys.argv.pop(idx)  # drop the flag itself; its value shifts down to position idx
+    return sys.argv.pop(idx)
+
+
+def main():
+    # Parse custom args (not TrainingArguments fields)
+    model_name = _pop_custom_arg("--model_name") or "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
+    loss_output_file = _pop_custom_arg("--loss_output_file")
+    eval_output_file = _pop_custom_arg("--eval_output_file")
+    model_dtype = _pop_custom_arg("--model_dtype")
+    attn_impl = _pop_custom_arg("--attn_implementation")
+    pad_to_multiple_of = _pop_custom_arg("--pad_to_multiple_of")
+
+    parser = HfArgumentParser((TrainingArguments,))
+    (training_args,) = parser.parse_args_into_dataclasses()
+
+    # Default to training if neither --do_train nor --do_eval is set
+    if not training_args.do_train and not training_args.do_eval:
+        training_args.do_train = True
+
+    # Auto-enable eval when an eval output file is requested
+    if eval_output_file:
+        training_args.do_eval = True
+
+    torch_dtype = DTYPE_MAP[model_dtype] if model_dtype else None
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model_kwargs = {}
+    if torch_dtype:
+        model_kwargs["torch_dtype"] = torch_dtype
+    if attn_impl:
+        model_kwargs["attn_implementation"] = attn_impl
+    model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
+    model.generation_config.pad_token_id = tokenizer.pad_token_id
+
+    # Synthetic dataset — 32 samples of tokenized text
+    # With per_device_train_batch_size=4 and 2 GPUs this gives 4 steps per epoch.
+    texts = [
+        "The quick brown fox jumps over the lazy dog. " * 5,
+        "A journey of a thousand miles begins with a single step. " * 5,
+        "To be or not to be, that is the question. " * 5,
+        "All that glitters is not gold, all that wanders is not lost. " * 5,
+    ] * 8
+
+    train_dataset = None
+    eval_dataset = None
+    if training_args.do_train:
+        train_dataset = [tokenizer(text, max_length=128, truncation=True, padding="max_length") for text in texts]
+    if training_args.do_eval:
+        eval_dataset = [tokenizer(text, max_length=128, truncation=True, padding="max_length") for text in texts[:8]]
+
+    collator_kwargs = {}
+    if pad_to_multiple_of:
+        collator_kwargs["pad_to_multiple_of"] = int(pad_to_multiple_of)
+
+    training_args.disable_tqdm = True
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, **collator_kwargs),
+    )
+
+    if training_args.do_train:
+        trainer.train()
+
+    if training_args.do_eval:
+        eval_metrics = trainer.evaluate()
+        if eval_output_file and training_args.process_index == 0:
+            with open(eval_output_file, "w") as f:
+                json.dump(eval_metrics, f)
+
+    # Save per-step losses for equivalence testing
+    if training_args.do_train and loss_output_file and training_args.process_index == 0:
+        losses = [log["loss"] for log in trainer.state.log_history if "loss" in log]
+        with open(loss_output_file, "w") as f:
+            json.dump(losses, f)
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/trainer/distributed/scripts/vit_feature_extractor.json
+++ b/tests/trainer/distributed/scripts/vit_feature_extractor.json
@@ -0,0 +1,4 @@
+{
+    "image_processor_type": "ViTImageProcessor",
+    "size": 30
+}
--- a/tests/trainer/distributed/scripts/worker_seed.py
+++ b/tests/trainer/distributed/scripts/worker_seed.py
@@ -0,0 +1,87 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Worker script for dataloader worker seed divergence tests.
+
+Verifies that dataloader workers get different random seeds across GPUs,
+so that each rank sees different random augmentations.
+
+Run via torchrun or accelerate launch.
+"""
+
+import random
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.utils.data import Dataset
+
+from transformers import HfArgumentParser, Trainer, TrainingArguments, set_seed
+from transformers.testing_utils import torch_device
+
+
+def gather_from_all_gpus(tensor, world_size):
+    buffers = [torch.zeros_like(tensor) for _ in range(world_size)]  # one receive buffer per rank
+    dist.all_gather(buffers, tensor)
+    return buffers
+
+
+class DummyDataset(Dataset):
+    def __init__(self):
+        self.length = 64
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, i) -> dict:
+        x = random.random()  # one draw each from Python's, NumPy's and torch's RNG; the index ``i`` is unused
+        y = np.random.random()
+        z = torch.rand([]).item()
+        return {"x": torch.tensor([x, y, z])}
+
+
+class DummyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc = nn.Linear(3, 1)
+
+    def forward(self, x):
+        local_tensor = x.detach().clone().to(torch_device)  # tensor copy without the torch.tensor(tensor) warning
+        gathered = gather_from_all_gpus(local_tensor, dist.get_world_size())
+        assert not all(torch.allclose(t, gathered[0]) for t in gathered[1:])  # ranks must not all match
+        y = self.fc(x)
+        return (y.mean(), y)
+
+
+def run_distributed_training(training_args):
+    set_seed(42)
+    model = DummyModel()
+    dataset = DummyDataset()
+    training_args.max_steps = 3
+    # dataloader_num_workers must be > 0 to enable worker_init_fn
+    training_args.dataloader_num_workers = 2
+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=dataset,
+    )
+    trainer.train()
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser((TrainingArguments,))
+    training_args = parser.parse_args_into_dataclasses()[0]
+    run_distributed_training(training_args)