first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

@@ -0,0 +1,88 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Worker script for dispatch_batches=False with a finite iterable dataset.
Verifies that training completes successfully when ``dispatch_batches``
is disabled.
Run via torchrun or accelerate launch.
"""
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import IterableDataset
from transformers import HfArgumentParser, Trainer, TrainingArguments
class RegressionModel(nn.Module):
    """Scalar linear model ``y = input_x * a + b`` with two learnable parameters."""

    def __init__(self, a=0, b=0):
        super().__init__()
        slope = torch.tensor(a).float()
        intercept = torch.tensor(b).float()
        self.a = nn.Parameter(slope)
        self.b = nn.Parameter(intercept)
        # No configuration object is attached to this model.
        self.config = None

    def forward(self, input_x, labels=None, **kwargs):
        """Return ``(prediction,)``, or ``(mse_loss, prediction)`` when ``labels`` is given."""
        prediction = input_x * self.a + self.b
        if labels is not None:
            return (nn.functional.mse_loss(prediction, labels), prediction)
        return (prediction,)
class RegressionDataset:
    """Synthetic ``y = a * x + b`` regression data with Gaussian noise added to every label column."""

    def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
        # Seeds NumPy's global RNG so the generated samples are reproducible.
        np.random.seed(seed)
        if label_names is None:
            label_names = ["labels"]
        self.label_names = label_names
        self.length = length
        self.x = np.random.normal(size=(length,)).astype(np.float32)
        # One independently-noised target column per label name.
        columns = []
        for _ in self.label_names:
            columns.append(a * self.x + b + np.random.normal(scale=0.1, size=(length,)))
        self.ys = [column.astype(np.float32) for column in columns]

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return a dict with one entry per label name plus the ``"input_x"`` feature."""
        sample = {}
        for name, column in zip(self.label_names, self.ys):
            sample[name] = column[i]
        sample["input_x"] = self.x[i]
        return sample
class FiniteIterableDataset(IterableDataset):
    """Iterable wrapper around a ``RegressionDataset`` that can be consumed only once.

    ``current_sample`` is never reset, so after every sample has been yielded
    any later iteration produces nothing.
    """

    def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
        self.current_sample = 0
        self.dataset = RegressionDataset(a=a, b=b, length=length, seed=seed, label_names=label_names)

    def __iter__(self):
        while True:
            if self.current_sample >= len(self.dataset):
                return
            yield self.dataset[self.current_sample]
            # The cursor advances only once the consumer requests the next item.
            self.current_sample += 1
if __name__ == "__main__":
    # Only TrainingArguments fields are read from the command line.
    training_args = HfArgumentParser((TrainingArguments,)).parse_args_into_dataclasses()[0]

    # Force the scenario under test: one sample, one step, batch dispatching disabled.
    training_args.accelerator_config.dispatch_batches = False
    training_args.max_steps = 1
    training_args.per_device_train_batch_size = 1

    train_dataset = FiniteIterableDataset(label_names=["labels", "extra"], length=1)
    model = RegressionModel()
    Trainer(model, training_args, train_dataset=train_dataset).train()

View File

@@ -0,0 +1,32 @@
{
"fp16": {
"enabled": "auto"
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}

View File

@@ -0,0 +1,35 @@
{
"fp16": {
"enabled": "auto"
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto"
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}

View File

@@ -0,0 +1,113 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Worker script for eval/predict ordering tests.
Verifies that distributed eval/predict returns all samples in the correct order.
Run via torchrun or accelerate launch.
"""
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import EvalPrediction, HfArgumentParser, Trainer, TrainingArguments
from transformers.utils import logging
# Module-level logger used to report evaluation and prediction results.
logger = logging.get_logger(__name__)
class DummyDataset(Dataset):
    """Map-style dataset whose item at index ``i`` is ``i`` itself."""

    def __init__(self, length: int = 101):
        # Number of items reported by ``__len__``.
        self.length = length

    def __len__(self):
        """Return the configured number of items."""
        return self.length

    def __getitem__(self, i) -> int:
        """Return the requested index unchanged."""
        return i
class DummyDataCollator:
    """Collator that turns a list of features into identical ``input_ids`` and ``labels`` tensors."""

    def __call__(self, features):
        # Two separate tensors are built so the outputs do not share storage.
        input_ids = torch.tensor(features)
        labels = torch.tensor(features)
        return {"input_ids": input_ids, "labels": labels}
class DummyModel(nn.Module):
    """Identity model: predictions are the input ids and the loss is a constant zero."""

    def __init__(self):
        super().__init__()
        # Add some (unused) params otherwise DDP will complain.
        self.fc = nn.Linear(120, 80)

    def forward(self, input_ids, labels=None):
        """Return ``input_ids`` alone, or ``(zero_loss, input_ids)`` when ``labels`` is given."""
        if labels is None:
            return input_ids
        zero_loss = torch.tensor(0.0, device=input_ids.device)
        return zero_loss, input_ids
def _exit_unless_success(metrics, key):
    """Log ``metrics`` and stop the worker with exit status 1 unless ``metrics[key]`` is ``True``."""
    logger.info(metrics)
    if metrics[key] is not True:
        logger.error(metrics)
        # ``exit`` is a helper the ``site`` module adds for interactive sessions; raising
        # SystemExit yields the same exit status without depending on it being present.
        raise SystemExit(1)


if __name__ == "__main__":
    parser = HfArgumentParser((TrainingArguments,))
    training_args = parser.parse_args_into_dataclasses()[0]

    for dataset_length in [49, 7]:
        dataset = DummyDataset(dataset_length)

        def compute_metrics(p: EvalPrediction) -> dict:
            """Report whether predictions and labels both equal ``0 .. len(dataset) - 1`` in order."""
            sequential = list(range(len(dataset)))
            success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential
            # Only the local main process prints the mismatch details.
            if not success and training_args.local_process_index == 0:
                logger.warning(
                    "Predictions and/or labels do not match expected results:\n - predictions: "
                    f"{p.predictions.tolist()}\n - labels: {p.label_ids.tolist()}\n - expected: {sequential}"
                )
            return {"success": success}

        trainer = Trainer(
            model=DummyModel(),
            args=training_args,
            data_collator=DummyDataCollator(),
            eval_dataset=dataset,
            compute_metrics=compute_metrics,
        )

        # First pass: whatever eval_accumulation_steps the arguments currently hold.
        _exit_unless_success(trainer.evaluate(), "eval_success")
        _exit_unless_success(trainer.predict(dataset).metrics, "test_success")

        # Second pass: same checks with accumulation every two steps.
        trainer.args.eval_accumulation_steps = 2
        _exit_unless_success(trainer.evaluate(), "eval_success")
        _exit_unless_success(trainer.predict(dataset).metrics, "test_success")

        # Reset the setting before the next dataset length is tested.
        trainer.args.eval_accumulation_steps = None

View File

@@ -0,0 +1,125 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Worker script for FSDP generation tests.
Launched via ``torchrun`` from ``test_trainer_distributed_fsdp.py``.
"""
import argparse
import functools
from collections.abc import Callable
from typing import Any
import torch
import torch.distributed
from torch.distributed._composable.fsdp import fully_shard, register_fsdp_forward_method
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import FullyShardedDataParallel
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.gpt2.modeling_gpt2 import GPT2Block
from transformers.testing_utils import backend_device_count, backend_torch_accelerator_module, torch_device
# Prompts selected by process rank; the two sentences are repeated to give eight entries.
data = [
    "Hello world!",
    "The quick brown fox jumps over the lazy dog.",
] * 4
def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]:
    """Manage the creation and destruction of the distributed process group for the wrapped function.

    Args:
        func: Callable to run while the process group is alive.

    Returns:
        A wrapper that initializes the process group with ``world_size`` set to
        ``backend_device_count(torch_device)``, calls ``func`` with the given
        arguments, and destroys the process group afterwards even if ``func`` raises.
    """

    # Keep the wrapped function's name and docstring visible to tracebacks and introspection.
    @functools.wraps(func)
    def wrapped(*args: Any, **kwargs: Any) -> Any:
        device_count = backend_device_count(torch_device)
        # Initialization stays outside ``try`` so a failed init never reaches ``destroy_process_group``.
        torch.distributed.init_process_group(world_size=device_count)
        try:
            return func(*args, **kwargs)
        finally:
            torch.distributed.destroy_process_group()

    return wrapped
@manage_process_group
def fsdp_generate():
    """Wrap a tiny GPT-2 in ``FullyShardedDataParallel`` and run ``generate`` on this rank's prompt."""
    accelerator_module = backend_torch_accelerator_module(torch_device)
    rank = torch.distributed.get_rank()
    device = torch.device(rank)
    accelerator_module.set_device(device)

    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)

    # Each GPT2Block becomes its own wrapped unit.
    wrap_policy = functools.partial(transformer_auto_wrap_policy, transformer_layer_cls={GPT2Block})
    fsdp_model = FullyShardedDataParallel(
        model,
        auto_wrap_policy=wrap_policy,
        limit_all_gathers=True,
        use_orig_params=True,
    )

    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
    # The prompt is picked by rank, so each process tokenizes its own entry of ``data``.
    batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device)

    # Generation is called on the inner module while the full parameters are summoned.
    with FullyShardedDataParallel.summon_full_params(fsdp_model):
        _ = fsdp_model.module.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=30,
        )
@manage_process_group
def fsdp2_generate():
    """Shard a tiny GPT-2 with ``fully_shard`` and run ``generate`` on this rank's prompt."""
    accelerator_module = backend_torch_accelerator_module(torch_device)
    rank = torch.distributed.get_rank()
    device = torch.device(rank)
    accelerator_module.set_device(device)

    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)

    # One-dimensional device mesh spanning every process in the group.
    world_size = torch.distributed.get_world_size()
    mesh = init_device_mesh(device.type, (world_size,))

    # Shard every GPT2Block first, then the model itself.
    gpt2_blocks = (module for module in model.modules() if isinstance(module, GPT2Block))
    for block in gpt2_blocks:
        fully_shard(block, mesh=mesh)
    fully_shard(model, mesh=mesh)

    # ``generate`` is registered as an FSDP forward method before it is called.
    register_fsdp_forward_method(model, "generate")

    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
    batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device)
    _ = model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_length=30,
    )
if __name__ == "__main__":

    class CLIArgs(argparse.Namespace):
        """Typed namespace for the two mutually exclusive test selectors."""

        fsdp: bool
        fsdp2: bool

    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    for flag in ("--fsdp", "--fsdp2"):
        group.add_argument(flag, action="store_true")
    args = parser.parse_args(namespace=CLIArgs())

    # Exactly one selector must be given; the parser itself rejects passing both.
    if not (args.fsdp or args.fsdp2):
        raise ValueError("Missing test selection")
    if args.fsdp:
        fsdp_generate()
    else:
        fsdp2_generate()

View File

@@ -0,0 +1,114 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Worker script for loss averaging tests.
Verifies that ``average_tokens_across_devices`` produces correct loss
compared to a single-GPU baseline.
When ``--run_both_averaging_modes`` is passed, the script runs training
twice (first without, then with averaging) in a single process launch, saving
``<output_dir>/broken_losses.json`` and ``<output_dir>/fixed_losses.json``.
Otherwise a single run saves ``<output_dir>_losses.json``.
Run via torchrun or accelerate launch.
"""
import argparse
import json
import datasets
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
HfArgumentParser,
Trainer,
TrainerCallback,
TrainingArguments,
set_seed,
)
class StoreLossCallback(TrainerCallback):
    """Simple callback to store the loss.

    Every ``"loss"`` value passed to ``on_log`` is appended to ``self.losses``
    in the order it is received.
    """

    def __init__(self):
        # Loss values collected so far, oldest first.
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record ``logs["loss"]`` when present; do nothing for other log events."""
        # ``logs`` defaults to None, so guard before the membership test to avoid a TypeError.
        if logs is not None and "loss" in logs:
            self.losses.append(logs["loss"])
def run_distributed_training(training_args, loss_file):
    """Train a tiny causal LM for ten steps and save the logged losses as JSON.

    Args:
        training_args: ``TrainingArguments`` for the run. Modified in place:
            ``logging_steps``, ``max_steps``, ``learning_rate``, ``disable_tqdm``
            and ``dataloader_drop_last`` are overwritten.
        loss_file: Path of the JSON file that receives the list of logged losses.
    """
    set_seed(42)
    model_name = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
    dataset_name = "wikitext"
    dataset_config = "wikitext-2-raw-v1"
    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:50]")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The EOS token doubles as the padding token for fixed-length batches.
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        """Tokenize a batch of texts, padded and truncated to 128 tokens."""
        return tokenizer(examples["text"], max_length=128, padding="max_length", truncation=True)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
    loss_callback = StoreLossCallback()

    # Log every step so each of the ten optimization steps contributes one loss entry.
    training_args.logging_steps = 1
    training_args.max_steps = 10
    training_args.learning_rate = 3e-4
    training_args.disable_tqdm = True
    training_args.dataloader_drop_last = True

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_dataset,
        callbacks=[loss_callback],
        data_collator=data_collator,
    )
    trainer.train()

    # Every rank runs this function; only the main process writes, so workers never
    # truncate or overwrite the same file concurrently.
    if training_args.process_index == 0:
        with open(loss_file, "w") as f:
            json.dump(loss_callback.losses, f)
if __name__ == "__main__":
    # The custom flag is parsed first; everything else is handed to HfArgumentParser.
    pre_parser = argparse.ArgumentParser(add_help=False)
    pre_parser.add_argument("--run_both_averaging_modes", action="store_true")
    custom_args, remaining = pre_parser.parse_known_args()

    hf_parser = HfArgumentParser((TrainingArguments,))
    (training_args,) = hf_parser.parse_args_into_dataclasses(remaining)

    if not custom_args.run_both_averaging_modes:
        run_distributed_training(training_args, loss_file=training_args.output_dir + "_losses.json")
    else:
        base_dir = training_args.output_dir
        # (average_tokens_across_devices, output sub-directory, loss file) per run:
        # first without averaging ("broken"), then with averaging ("fixed").
        runs = (
            (False, "/broken", "/broken_losses.json"),
            (True, "/fixed", "/fixed_losses.json"),
        )
        for average_tokens, subdir, loss_name in runs:
            training_args.average_tokens_across_devices = average_tokens
            training_args.output_dir = base_dir + subdir
            run_distributed_training(training_args, loss_file=base_dir + loss_name)

View File

@@ -0,0 +1,93 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dumps distributed environment info to a JSON file for verification.
This script creates a Trainer (which initializes the accelerator) and writes
each worker's env vars, TrainingArguments fields, and accelerator state to
``<output_dir>/env_rank<N>.json``.
Accepts all TrainingArguments flags (e.g. ``--deepspeed``, ``--fsdp``) so the
Trainer sets up the correct framework regardless of launcher.
Works with any launcher (torchrun, accelerate launch with DDP/FSDP/DeepSpeed).
"""
import json
import os
from transformers import AutoModelForCausalLM, HfArgumentParser, Trainer, TrainingArguments
def main():
    """Create a Trainer and write this worker's distributed environment to ``env_rank<N>.json``."""
    (args,) = HfArgumentParser((TrainingArguments,)).parse_args_into_dataclasses()
    args.disable_tqdm = True

    model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
    trainer = Trainer(model=model, args=args)
    accelerator = trainer.accelerator

    # Raw env vars set by torchrun / accelerate
    env_info = {
        "env_world_size": os.environ.get("WORLD_SIZE"),
        "env_rank": os.environ.get("RANK"),
        "env_local_rank": os.environ.get("LOCAL_RANK"),
        "env_master_addr": os.environ.get("MASTER_ADDR"),
        "env_master_port": os.environ.get("MASTER_PORT"),
    }

    # TrainingArguments-derived values
    env_info.update(
        {
            "args_local_rank": args.local_rank,
            "args_world_size": args.world_size,
            "args_process_index": args.process_index,
            "args_local_process_index": args.local_process_index,
            "args_parallel_mode": str(args.parallel_mode),
            "args_n_gpu": args.n_gpu,
        }
    )

    # Accelerator state
    env_info.update(
        {
            "accelerator_num_processes": accelerator.num_processes,
            "accelerator_process_index": accelerator.process_index,
            "accelerator_local_process_index": accelerator.local_process_index,
            "accelerator_is_main_process": accelerator.is_main_process,
            "accelerator_is_local_main_process": accelerator.is_local_main_process,
            "accelerator_use_distributed": accelerator.use_distributed,
            "accelerator_distributed_type": str(accelerator.distributed_type),
            "accelerator_device": str(accelerator.device),
        }
    )

    # Trainer-level flags (these gate framework-specific code paths)
    env_info.update(
        {
            "trainer_is_fsdp_enabled": trainer.is_fsdp_enabled,
            "trainer_is_deepspeed_enabled": trainer.is_deepspeed_enabled,
        }
    )

    # FSDP plugin info, recorded only when the accelerator state carries a plugin.
    fsdp_plugin = getattr(accelerator.state, "fsdp_plugin", None)
    if fsdp_plugin is not None:
        env_info["fsdp_version"] = getattr(fsdp_plugin, "fsdp_version", None)
        stringified_fields = (
            ("fsdp_sharding_strategy", "sharding_strategy"),
            ("fsdp_cpu_offload", "cpu_offload"),
            ("fsdp_auto_wrap_policy", "auto_wrap_policy"),
        )
        for key, attribute in stringified_fields:
            env_info[key] = str(getattr(fsdp_plugin, attribute, None))

    # DeepSpeed plugin info, recorded only when the accelerator state carries a plugin.
    deepspeed_plugin = getattr(accelerator.state, "deepspeed_plugin", None)
    if deepspeed_plugin is not None:
        env_info["deepspeed_zero_stage"] = deepspeed_plugin.zero_stage
        env_info["deepspeed_offload_optimizer_device"] = str(deepspeed_plugin.offload_optimizer_device)
        env_info["deepspeed_offload_param_device"] = str(deepspeed_plugin.offload_param_device)

    # One file per process, named after the process index.
    output_file = os.path.join(args.output_dir, f"env_rank{args.process_index}.json")
    with open(output_file, "w") as f:
        json.dump(env_info, f)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,136 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Simple causal LM script for distributed tests (FSDP, DeepSpeed).
Uses a tiny Qwen2 model with synthetic data so tests run fast
and don't require downloading real datasets.
Supports --do_train (default) and --do_eval via TrainingArguments.
32 training samples are created; with per_device_train_batch_size=4
and 2 GPUs this gives 4 steps per epoch.
"""
import json
import sys
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
HfArgumentParser,
Trainer,
TrainingArguments,
)
# Maps the short dtype names accepted by ``--model_dtype`` to the matching torch dtypes.
DTYPE_MAP = {"fp32": torch.float32, "bf16": torch.bfloat16, "fp16": torch.float16}
def _pop_custom_arg(name):
    """Pop a custom --name value arg from sys.argv before HfArgumentParser sees it.

    Both the ``--name value`` and ``--name=value`` spellings are recognised; the
    first occurrence is removed from ``sys.argv`` in place.

    Args:
        name: Flag to look for, including its leading dashes (e.g. ``"--model_name"``).

    Returns:
        The flag's value as a string, or ``None`` when the flag is absent.

    Raises:
        ValueError: If ``name`` is the last command-line token and therefore has no value.
    """
    inline_prefix = name + "="
    for idx, token in enumerate(sys.argv):
        if token == name:
            if idx + 1 >= len(sys.argv):
                # Fail with a clear message instead of an opaque IndexError.
                raise ValueError(f"Argument {name} expects a value but none was given.")
            value = sys.argv[idx + 1]
            # Drop the flag and its value together.
            del sys.argv[idx : idx + 2]
            return value
        if token.startswith(inline_prefix):
            del sys.argv[idx]
            return token[len(inline_prefix) :]
    return None
def main():
    """Train and/or evaluate a tiny causal LM on synthetic text, optionally dumping results as JSON."""
    # Custom flags are stripped from sys.argv first because they are not TrainingArguments fields.
    model_name = _pop_custom_arg("--model_name") or "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
    loss_output_file = _pop_custom_arg("--loss_output_file")
    eval_output_file = _pop_custom_arg("--eval_output_file")
    model_dtype = _pop_custom_arg("--model_dtype")
    attn_impl = _pop_custom_arg("--attn_implementation")
    pad_to_multiple_of = _pop_custom_arg("--pad_to_multiple_of")

    (training_args,) = HfArgumentParser((TrainingArguments,)).parse_args_into_dataclasses()
    training_args.disable_tqdm = True

    # Default to training if neither --do_train nor --do_eval is set
    if not (training_args.do_train or training_args.do_eval):
        training_args.do_train = True
    # Auto-enable eval when an eval output file is requested
    if eval_output_file:
        training_args.do_eval = True

    # Optional model-loading overrides, passed only when requested on the command line.
    model_kwargs = {}
    if model_dtype:
        model_kwargs["torch_dtype"] = DTYPE_MAP[model_dtype]
    if attn_impl:
        model_kwargs["attn_implementation"] = attn_impl

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
    model.generation_config.pad_token_id = tokenizer.pad_token_id

    # Synthetic dataset — 32 samples of tokenized text
    # With per_device_train_batch_size=4 and 2 GPUs this gives 4 steps per epoch.
    sentences = [
        "The quick brown fox jumps over the lazy dog. " * 5,
        "A journey of a thousand miles begins with a single step. " * 5,
        "To be or not to be, that is the question. " * 5,
        "All that glitters is not gold, all that wanders is not lost. " * 5,
    ]
    texts = sentences * 8

    def encode(batch):
        """Tokenize each text, padded and truncated to 128 tokens."""
        return [tokenizer(text, max_length=128, truncation=True, padding="max_length") for text in batch]

    train_dataset = encode(texts) if training_args.do_train else None
    # Evaluation uses the first eight samples only.
    eval_dataset = encode(texts[:8]) if training_args.do_eval else None

    collator_kwargs = {"pad_to_multiple_of": int(pad_to_multiple_of)} if pad_to_multiple_of else {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, **collator_kwargs),
    )

    if training_args.do_train:
        trainer.train()

    if training_args.do_eval:
        eval_metrics = trainer.evaluate()
        # Only process 0 writes the metrics file.
        if eval_output_file and training_args.process_index == 0:
            with open(eval_output_file, "w") as f:
                json.dump(eval_metrics, f)

    # Save per-step losses for equivalence testing
    if training_args.do_train and loss_output_file and training_args.process_index == 0:
        losses = [entry["loss"] for entry in trainer.state.log_history if "loss" in entry]
        with open(loss_output_file, "w") as f:
            json.dump(losses, f)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,4 @@
{
"image_processor_type": "ViTImageProcessor",
"size": 30
}

View File

@@ -0,0 +1,87 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Worker script for dataloader worker seed divergence tests.
Verifies that dataloader workers get different random seeds across GPUs,
so that each rank sees different random augmentations.
Run via torchrun or accelerate launch.
"""
import random
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import HfArgumentParser, Trainer, TrainingArguments, set_seed
from transformers.testing_utils import torch_device
def gather_from_all_gpus(tensor, world_size):
    """Collect ``tensor`` from every process with ``dist.all_gather``.

    Args:
        tensor: Local tensor to contribute.
        world_size: Number of receive buffers to allocate, one per process.

    Returns:
        A list of ``world_size`` tensors filled in by ``dist.all_gather``.
    """
    buffers = []
    for _ in range(world_size):
        # Each receive buffer matches the local tensor's shape, dtype and device.
        buffers.append(torch.zeros_like(tensor))
    dist.all_gather(buffers, tensor)
    return buffers
class DummyDataset(Dataset):
    """Dataset of 64 items whose content is drawn from three different RNGs on every access."""

    def __init__(self):
        self.length = 64

    def __len__(self):
        return self.length

    def __getitem__(self, i) -> dict[str, torch.Tensor]:
        """Return ``{"x": tensor}`` holding one fresh sample from each RNG; ``i`` is ignored.

        The three values come, in order, from Python's ``random``, NumPy's global
        RNG and torch's default generator.
        """
        x = random.random()
        y = np.random.random()
        z = torch.rand([]).item()
        return {"x": torch.tensor([x, y, z])}
class DummyModel(nn.Module):
    """Linear model whose forward pass also checks that ranks received different inputs."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(3, 1)

    def forward(self, x):
        """Verify cross-rank input divergence, then return ``(mean_output, output)``.

        Raises:
            AssertionError: If every other rank's gathered input is close to rank 0's.
        """
        # Copy from the existing tensor with detach/clone rather than ``torch.tensor(x)``,
        # which warns when handed a tensor.
        local_tensor = x.detach().clone().to(torch_device)
        gathered = gather_from_all_gpus(local_tensor, dist.get_world_size())
        # Raised explicitly so the check still runs when Python is started with -O.
        if all(torch.allclose(t, gathered[0]) for t in gathered[1:]):
            raise AssertionError("All ranks received identical inputs; dataloader worker seeds did not diverge.")
        y = self.fc(x)
        return (y.mean(), y)
def run_distributed_training(training_args):
    """Run three training steps of ``DummyModel`` on ``DummyDataset`` with two dataloader workers.

    ``training_args`` is modified in place: ``max_steps`` and
    ``dataloader_num_workers`` are overwritten.
    """
    set_seed(42)
    # dataloader_num_workers must be > 0 to enable worker_init_fn
    training_args.dataloader_num_workers = 2
    training_args.max_steps = 3

    trainer = Trainer(DummyModel(), training_args, train_dataset=DummyDataset())
    trainer.train()
if __name__ == "__main__":
    # Only TrainingArguments fields are read from the command line.
    hf_parser = HfArgumentParser((TrainingArguments,))
    parsed = hf_parser.parse_args_into_dataclasses()
    training_args = parsed[0]
    run_distributed_training(training_args)