first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/trainer/distributed/scripts/worker_seed.py
+++ b/tests/trainer/distributed/scripts/worker_seed.py
@@ -0,0 +1,87 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Worker script for dataloader worker seed divergence tests.
+
+Verifies that dataloader workers get different random seeds across GPUs,
+so that each rank sees different random augmentations.
+
+Run via torchrun or accelerate launch.
+"""
+
+import random
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.utils.data import Dataset
+
+from transformers import HfArgumentParser, Trainer, TrainingArguments, set_seed
+from transformers.testing_utils import torch_device
+
+
+def gather_from_all_gpus(tensor, world_size):
+    gather_list = [torch.zeros_like(tensor) for _ in range(world_size)]
+    dist.all_gather(gather_list, tensor)
+    return gather_list
+
+
+class DummyDataset(Dataset):
+    def __init__(self):
+        self.length = 64
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, i) -> int:
+        x = random.random()
+        y = np.random.random()
+        z = torch.rand([]).item()
+        return {"x": torch.tensor([x, y, z])}
+
+
+class DummyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc = nn.Linear(3, 1)
+
+    def forward(self, x):
+        local_tensor = torch.tensor(x, device=torch_device)
+        gathered = gather_from_all_gpus(local_tensor, dist.get_world_size())
+        assert not all(torch.allclose(t, gathered[0]) for t in gathered[1:])
+        y = self.fc(x)
+        return (y.mean(), y)
+
+
+def run_distributed_training(training_args):
+    set_seed(42)
+    model = DummyModel()
+    dataset = DummyDataset()
+    training_args.max_steps = 3
+    # dataloader_num_workers must be > 0 to enable worker_init_fn
+    training_args.dataloader_num_workers = 2
+    trainer = Trainer(
+        model,
+        training_args,
+        train_dataset=dataset,
+    )
+    trainer.train()
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser((TrainingArguments,))
+    training_args = parser.parse_args_into_dataclasses()[0]
+    run_distributed_training(training_args)