Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
1314 lines
55 KiB
Python
1314 lines
55 KiB
Python
# Copyright 2018 the HuggingFace Inc. team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Core Trainer integration tests: reproducibility, gradient accumulation, gradient checkpointing,
|
|
mixed precision, logging, NEFTune, memory metrics, and end-to-end training.
|
|
"""
|
|
|
|
import math
|
|
import os
|
|
import tempfile
|
|
from functools import partial
|
|
|
|
import datasets
|
|
import numpy as np
|
|
import pytest
|
|
import torch
|
|
from torch import nn
|
|
|
|
from transformers import (
|
|
AutoModelForCausalLM,
|
|
AutoModelForSequenceClassification,
|
|
AutoTokenizer,
|
|
BitsAndBytesConfig,
|
|
DataCollatorForLanguageModeling,
|
|
EarlyStoppingCallback,
|
|
GPT2Config,
|
|
GPT2LMHeadModel,
|
|
IntervalStrategy,
|
|
LlamaConfig,
|
|
LlamaForCausalLM,
|
|
Trainer,
|
|
TrainingArguments,
|
|
default_data_collator,
|
|
logging,
|
|
)
|
|
from transformers.integrations import activate_neftune
|
|
from transformers.loss.loss_utils import ForCausalLMLoss
|
|
from transformers.testing_utils import (
|
|
CaptureLogger,
|
|
LoggingLevel,
|
|
TestCasePlus,
|
|
backend_device_count,
|
|
execute_subprocess_async,
|
|
require_bitsandbytes,
|
|
require_liger_kernel,
|
|
require_non_hpu,
|
|
require_peft,
|
|
require_torch,
|
|
require_torch_accelerator,
|
|
require_torch_bf16,
|
|
require_torch_fp16,
|
|
require_torch_gpu,
|
|
require_torch_multi_accelerator,
|
|
require_torch_non_multi_accelerator,
|
|
require_torch_tf32,
|
|
run_first,
|
|
slow,
|
|
torch_device,
|
|
)
|
|
|
|
from .trainer_test_utils import (
|
|
ATOL,
|
|
PATH_SAMPLE_TEXT,
|
|
RTOL,
|
|
AlmostAccuracy,
|
|
BasicTextGenerationModel,
|
|
RegressionDataset,
|
|
RegressionModel,
|
|
RepeatDataset,
|
|
StoreLossCallback,
|
|
TrainerIntegrationCommon,
|
|
get_dataset,
|
|
get_regression_trainer,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Mixed precision tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerMixedPrecisionTest(TestCasePlus, TrainerIntegrationCommon):
    """Tests for FP16, BF16, and TF32 mixed precision training.

    Every test trains the regression model with one precision flag enabled and compares the
    resulting `a`/`b` parameters with those of a reference run performed in `setUp`.
    """

    def setUp(self):
        """Train a regression model with default precision settings and store its `a`/`b` parameters."""
        super().setUp()

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            trainer.train()
            self.default_trained_model = (trainer.model.a, trainer.model.b)

    def check_trained_model(self, model, **kwargs):
        """Assert that `model.a` and `model.b` are close to the reference parameters from `setUp`.

        Any keyword arguments (the tests pass `atol` / `rtol`) are forwarded to
        `torch.testing.assert_close`.
        """
        (a, b) = self.default_trained_model
        torch.testing.assert_close(model.a, a, **kwargs)
        torch.testing.assert_close(model.b, b, **kwargs)

    @require_torch_fp16
    @require_torch_accelerator
    def test_mixed_fp16(self):
        """FP16 training matches the reference run and logs a bounded gradient norm."""
        # very basic test
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, fp16=True, logging_steps=1, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)

            # Ignore the last history entry and look at the first remaining log.
            log_0 = trainer.state.log_history[:-1][0]
            # check that the grads were properly clipped due to the grad scaler. Otherwise, we get huge values
            # `assertLess` reports the offending value on failure, unlike comparing a boolean with True.
            self.assertLess(log_0["grad_norm"], 100)

    @require_torch_bf16
    @require_torch_accelerator
    def test_mixed_bf16(self):
        """BF16 training matches the reference run within `ATOL` / `RTOL`."""
        # very basic test
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, bf16=True, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)

    @require_torch_gpu
    @require_torch_tf32
    def test_tf32(self):
        """TF32 training matches the reference run with the default `assert_close` tolerances."""
        # very basic test
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, tf32=True, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# DDP kwargs forwarding tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerDDPKwargsTest(TestCasePlus):
    """The `ddp_*` TrainingArguments fields must reach DistributedDataParallelKwargs."""

    def _get_ddp_kwargs(self, **training_args_overrides):
        """Build a Trainer, run `_build_accelerator_args`, and return the single kwargs handler.

        Args:
            **training_args_overrides: extra keyword arguments forwarded to `TrainingArguments`.

        Returns:
            The only element of `accelerator_args["kwargs_handlers"]` (a handler object, not a
            dict); the tests read its `static_graph` attribute.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(output_dir=tmp_dir, max_steps=1, **training_args_overrides)
            trainer = Trainer(model=RegressionModel(), args=args, train_dataset=RegressionDataset())
            accelerator_args = trainer._build_accelerator_args()

        handlers = list(accelerator_args["kwargs_handlers"])
        # Exactly one handler is expected; fail with a readable message instead of an unpacking error.
        self.assertEqual(len(handlers), 1, f"Expected exactly one kwargs handler, got {handlers}")
        return handlers[0]

    def test_ddp_static_graph_true_reaches_accelerator(self):
        """ddp_static_graph=True is forwarded as static_graph=True to DistributedDataParallelKwargs."""
        handler = self._get_ddp_kwargs(ddp_static_graph=True)
        self.assertTrue(handler.static_graph)

    def test_ddp_static_graph_false_reaches_accelerator(self):
        """ddp_static_graph=False is forwarded as static_graph=False."""
        handler = self._get_ddp_kwargs(ddp_static_graph=False)
        self.assertFalse(handler.static_graph)

    def test_ddp_static_graph_none_preserves_default(self):
        """ddp_static_graph=None (default) must NOT override DistributedDataParallelKwargs' own default (False).

        Regression guard: the conditional in _build_accelerator_args must keep static_graph out of ddp_kwargs
        when the flag is unset, otherwise clusters not configured for it would silently switch behavior.
        """
        handler = self._get_ddp_kwargs()  # ddp_static_graph unset
        # DistributedDataParallelKwargs default is False. If the conditional were broken and the attribute
        # were always injected, this would still be False only by coincidence. Together with
        # `test_ddp_static_graph_true_reaches_accelerator`, which shows the kwarg IS plumbed when set,
        # these tests pin both directions.
        self.assertFalse(handler.static_graph)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gradient accumulation tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerGradientAccumulationTest(TestCasePlus, TrainerIntegrationCommon):
    """Tests for gradient accumulation loss alignment and batch counting."""

    @classmethod
    def setUpClass(cls):
        """Prepare the shared model name, tokenized dataset and data collator once for the class."""
        super().setUpClass()
        cls._ga_model_name = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"

        tokenizer = AutoTokenizer.from_pretrained(cls._ga_model_name)
        tokenizer.pad_token = tokenizer.eos_token

        def has_text(example):
            return len(example["text"].strip()) > 0

        def tokenize(examples):
            return tokenizer(examples["text"], max_length=16, padding="max_length", truncation=True)

        raw_dataset = datasets.load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1", split="train[:200]")
        # Empty samples are dropped to avoid nan losses with small batch sizes.
        non_empty_dataset = raw_dataset.filter(has_text)
        cls._ga_dataset = non_empty_dataset.map(tokenize, batched=True)
        cls._ga_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    def test_gradient_accumulation_steps_not_leaked_to_accelerator(self):
        """The Trainer must not pass its gradient_accumulation_steps to the Accelerator. See #45305."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = TrainingArguments(
                output_dir=tmp_dir,
                per_device_train_batch_size=4,
                gradient_accumulation_steps=4,
                max_steps=1,
            )
            trainer = Trainer(model=RegressionModel(), args=training_args, train_dataset=RegressionDataset())
            self.assertEqual(trainer.accelerator.gradient_accumulation_steps, 1)

    def _check_gradient_accumulation(
        self,
        base_batch_size,
        gas_batch_size,
        gas_steps,
        loss_tolerance,
        model_accepts_loss_kwargs=True,
        compute_loss_func=None,
    ):
        """
        Run two trainings with the same effective batch size (`base_batch_size` in one step versus
        `gas_batch_size * gas_steps` accumulated) and assert that the logged grad norms and the
        recorded losses agree.

        Args:
            base_batch_size: per-device batch size of the baseline run (no accumulation).
            gas_batch_size: per-device batch size of the accumulated run.
            gas_steps: gradient accumulation steps of the accumulated run.
            loss_tolerance: strict upper bound on the largest absolute per-step loss difference.
            model_accepts_loss_kwargs: when False, `trainer.model_accepts_loss_kwargs` is forced to False.
            compute_loss_func: optional custom loss function handed to both Trainers.
        """
        shared_args = {"logging_steps": 1, "max_steps": 3, "learning_rate": 1e-4, "max_grad_norm": 0.0}
        shared_trainer_kwargs = {"train_dataset": self._ga_dataset, "data_collator": self._ga_data_collator}
        if compute_loss_func is not None:
            shared_trainer_kwargs["compute_loss_func"] = compute_loss_func

        def run_training(output_dir, batch_size, accumulation_steps):
            # Fresh model for every run so both start from the same pretrained weights.
            model = AutoModelForCausalLM.from_pretrained(self._ga_model_name, dtype=torch.float32)
            training_args = TrainingArguments(
                output_dir,
                per_device_train_batch_size=batch_size,
                gradient_accumulation_steps=accumulation_steps,
                **shared_args,
            )
            loss_recorder = StoreLossCallback()
            trainer = Trainer(model, training_args, callbacks=[loss_recorder], **shared_trainer_kwargs)
            if not model_accepts_loss_kwargs:
                trainer.model_accepts_loss_kwargs = False
            trainer.train()
            grad_norms = [entry["grad_norm"] for entry in trainer.state.log_history if "grad_norm" in entry]
            return grad_norms, loss_recorder

        with tempfile.TemporaryDirectory() as tmp_dir:
            base_grad_norms, base_callback = run_training(tmp_dir, base_batch_size, 1)
            gas_grad_norms, gas_callback = run_training(tmp_dir, gas_batch_size, gas_steps)

        for step, (base_gn, gas_gn) in enumerate(zip(base_grad_norms, gas_grad_norms)):
            if base_gn > 0:
                ratio = gas_gn / base_gn
            else:
                ratio = float("inf")
            self.assertAlmostEqual(
                ratio, 1.0, delta=0.1, msg=f"Step {step}: grad_norm ratio {ratio:.2f} — GAS leak suspected"
            )

        loss_diff = [abs(base - gas) for base, gas in zip(base_callback.losses, gas_callback.losses)]
        largest_diff = max(loss_diff)
        self.assertLess(largest_diff, loss_tolerance, f"Loss difference {largest_diff} exceeds {loss_tolerance}")

    def test_gradient_accumulation_grad_norm_with_num_items_in_batch(self):
        """
        With model_accepts_loss_kwargs=True the model handles loss averaging via
        num_items_in_batch. Grad norms and losses must match between a large-batch
        baseline and an equivalent GAS run.
        """
        # Tight tolerance: num_items_in_batch properly averages loss across micro-batches
        for micro_batch_size, accumulation_steps in ((1, 8), (4, 2)):
            self._check_gradient_accumulation(
                base_batch_size=8,
                gas_batch_size=micro_batch_size,
                gas_steps=accumulation_steps,
                loss_tolerance=0.001,
            )

    def test_gradient_accumulation_grad_norm_without_num_items_in_batch(self):
        """
        With model_accepts_loss_kwargs=False the Trainer scales loss by GAS
        itself. Grad norms and losses must still match between a large-batch
        baseline and an equivalent GAS run.
        """
        # Looser tolerance: without num_items_in_batch each micro-batch is independently
        # mean-reduced, so losses won't match as tightly.
        self._check_gradient_accumulation(
            base_batch_size=8,
            gas_batch_size=4,
            gas_steps=2,
            loss_tolerance=0.1,
            model_accepts_loss_kwargs=False,
        )

    def test_gradient_accumulation_grad_norm_with_compute_loss_func(self):
        """
        With a custom compute_loss_func that uses num_items_in_batch, grad norms
        and losses must match between a large-batch baseline and an equivalent GAS run.
        """
        reference_model = AutoModelForCausalLM.from_pretrained(self._ga_model_name, dtype=torch.float32)
        vocab_size = reference_model.config.vocab_size

        def compute_loss(logits, labels, vocab_size, num_items_in_batch):
            # Despite its name, `logits` is indexed by key here: the actual logits live under "logits".
            return ForCausalLMLoss(logits["logits"], labels, vocab_size, num_items_in_batch)

        loss_func = partial(compute_loss, vocab_size=vocab_size)

        # Tight tolerance: compute_loss_func uses num_items_in_batch to properly average loss
        self._check_gradient_accumulation(
            base_batch_size=8,
            gas_batch_size=1,
            gas_steps=8,
            loss_tolerance=0.001,
            compute_loss_func=loss_func,
        )

    def test_num_items_in_batch_causal_lm(self):
        """
        For a causal LM, `_get_num_items_in_batch` must count over `labels[..., 1:]` because
        ForCausalLMLoss shifts labels (position 0 is never a prediction target). When the
        batch already exposes `shift_labels`, that tensor must be used as-is.
        """
        cpu = torch.device("cpu")

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = Trainer(
                model=AutoModelForCausalLM.from_pretrained(self._ga_model_name, dtype=torch.float32),
                args=TrainingArguments(output_dir=tmp_dir, per_device_train_batch_size=2),
                train_dataset=self._ga_dataset,
                data_collator=self._ga_data_collator,
            )
            self.assertTrue(trainer._loss_shifts_labels)

            # First sample: 5 valid label positions, 3 padding (-100) → 5 - 1 = 4 after the shift.
            # Second sample: 8 valid label positions, 0 padding → 8 - 1 = 7 after the shift.
            # Position 0 of each row must not be counted → expected total = 4 + 7 = 11.
            padded_row = torch.tensor([[1, 2, 3, 4, 5, -100, -100, -100]])
            full_row = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8]])
            unshifted_batches = [{"labels": padded_row}, {"labels": full_row}]
            num_items = trainer._get_num_items_in_batch(unshifted_batches, cpu)
            self.assertEqual(int(num_items), 11)

            # When `shift_labels` is already present it is used as-is and must NOT be sliced
            # again. Each row here has 4 valid positions → expected total = 8.
            preshifted_batches = [
                {
                    "labels": torch.tensor([[1, 2, 3, 4, 5]]),
                    "shift_labels": torch.tensor([[2, 3, 4, 5, -100]]),
                }
                for _ in range(2)
            ]
            num_items = trainer._get_num_items_in_batch(preshifted_batches, cpu)
            self.assertEqual(int(num_items), 8)

    def test_num_items_in_batch_non_causal_lm(self):
        """For non-causal-LM losses, `_get_num_items_in_batch` must count the full label tensor."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tiny_config = LlamaConfig(
                vocab_size=64, hidden_size=16, intermediate_size=32, num_hidden_layers=2, num_attention_heads=2
            )
            # ForTokenClassification → LOSS_MAPPING entry is not ForCausalLMLoss → no shift.
            from transformers import LlamaForTokenClassification

            trainer = Trainer(
                model=LlamaForTokenClassification(tiny_config),
                args=TrainingArguments(output_dir=tmp_dir, per_device_train_batch_size=2),
            )
            self.assertFalse(trainer._loss_shifts_labels)

            # 5 valid + 8 valid = 13 (no shift).
            padded_row = torch.tensor([[1, 2, 3, 4, 5, -100, -100, -100]])
            full_row = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8]])
            batch_samples = [{"labels": padded_row}, {"labels": full_row}]
            num_items = trainer._get_num_items_in_batch(batch_samples, torch.device("cpu"))
            self.assertEqual(int(num_items), 13)

    @require_torch_multi_accelerator
    def test_num_batches_in_training_with_gradient_accumulation(self):
        """The batches fetched through `get_batch_samples` must add up to dataloader length × epochs."""

        def record_batch_counts(original_fn, batch_counts):
            # Wrap `get_batch_samples` so each call is validated and its batch count recorded.
            def recording_fn(epoch_iterator, num_batches, device):
                self.assertGreater(num_batches, 0)
                batch_samples, num_items_in_batch = original_fn(epoch_iterator, num_batches, device)
                self.assertEqual(len(batch_samples), num_batches)
                batch_counts.append(num_batches)
                return batch_samples, num_items_in_batch

            return recording_fn

        with tempfile.TemporaryDirectory() as tmp_dir:
            for num_train_epochs in [1, 2]:
                for train_len in [123, 120]:
                    trainer = get_regression_trainer(
                        train_len=train_len,
                        per_device_train_batch_size=4,
                        gradient_accumulation_steps=5,
                        num_train_epochs=num_train_epochs,
                        output_dir=tmp_dir,
                    )

                    total_batch_samples = []
                    trainer.get_batch_samples = record_batch_counts(trainer.get_batch_samples, total_batch_samples)

                    trainer.train()

                    expected_batches = len(trainer.get_train_dataloader()) * num_train_epochs
                    self.assertEqual(expected_batches, sum(total_batch_samples))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gradient checkpointing tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerGradientCheckpointingTest(TestCasePlus):
    """Tests for gradient checkpointing during training."""

    def test_gradient_checkpointing(self):
        """Training with `gradient_checkpointing=True` must still change every model parameter."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                gradient_checkpointing=True,
                learning_rate=0.1,
                per_device_train_batch_size=1,
            )
            # Snapshot the parameters before training so they can be compared afterwards.
            initial_weights = {
                name: weight.detach().clone() for name, weight in trainer.model.named_parameters()
            }

            trainer.train()

            # Every parameter must have moved away from its initial value.
            for name, weight in trainer.model.named_parameters():
                unchanged = torch.allclose(initial_weights[name], weight, rtol=1e-4, atol=1e-4)
                self.assertFalse(unchanged, f"Model weights for {name} have not been updated")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# NEFTune tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerNEFTuneTest(TestCasePlus):
    """Tests for NEFTune noise injection during training."""

    def _build_neftune_trainer(self, model, train_dataset):
        """Return a Trainer for `model` with `neftune_noise_alpha=0.4` and its own temporary output dir."""
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            neftune_noise_alpha=0.4,
        )
        return Trainer(model, args, train_dataset=train_dataset)

    def test_neftune(self):
        """Embeddings differ between calls once NEFTune is activated, and are identical again after training."""
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        trainer = self._build_neftune_trainer(tiny_gpt2, train_dataset)

        # Activate NEFTune by hand (no training yet) and check two lookups of the same ids differ.
        activate_neftune(trainer.model, trainer.args.neftune_noise_alpha)

        dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)

        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)

        self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!")

        # redefine the model
        tiny_gpt2 = GPT2LMHeadModel(config)
        trainer = self._build_neftune_trainer(tiny_gpt2, train_dataset)

        # Check that it trains without errors
        trainer.train()

        # Make sure forward pass works fine
        _ = trainer.model(dummy_input)
        # No forward hook may remain on the input embeddings once training is over.
        self.assertEqual(len(trainer.model.get_input_embeddings()._forward_hooks), 0)

        trainer.model.eval()

        # Check that we get identical embeddings just in case
        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)
        torch.testing.assert_close(emb1, emb2)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Logging tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerLoggingTest(TestCasePlus):
    """Tests for logging behavior: inf/nan filter and log levels."""

    def test_logging_inf_nan_filter(self):
        """With a huge learning rate, nan/inf losses appear in the logs only when the filter is off."""
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        train_dataset = RepeatDataset(torch.randint(0, 100, (128,)))

        def train_and_get_log_history(nan_inf_filter):
            # Both runs deliberately share the same model instance and dataset.
            training_args = TrainingArguments(
                self.get_auto_remove_tmp_dir(),
                learning_rate=1e9,
                logging_steps=5,
                logging_nan_inf_filter=nan_inf_filter,
            )
            trainer = Trainer(tiny_gpt2, training_args, train_dataset=train_dataset)
            trainer.train()
            return trainer.state.log_history

        def is_any_loss_nan_or_inf(log_history):
            # The last history entry is skipped; every other entry must carry a "loss" key.
            losses = [entry["loss"] for entry in log_history[:-1]]
            return any(math.isnan(value) or math.isinf(value) for value in losses)

        # Trainer without inf/nan filter
        log_history_no_filter = train_and_get_log_history(False)
        # Trainer with inf/nan filter
        log_history_filter = train_and_get_log_history(True)

        self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter))
        self.assertFalse(is_any_loss_nan_or_inf(log_history_filter))

    def test_log_level(self):
        """The "Running training" info message is emitted or suppressed according to `log_level`."""
        # testing only --log_level (--log_level_replica requires multiple gpus and DDP and is tested elsewhere)
        logger = logging.get_logger()
        log_info_string = "Running training"

        # test with the default log_level - should be the same as before and thus we test depending on is_info
        is_info = logging.get_verbosity() <= 20
        check_default = self.assertIn if is_info else self.assertNotIn

        with tempfile.TemporaryDirectory() as tmp_dir:
            with CaptureLogger(logger) as cl:
                get_regression_trainer(output_dir=tmp_dir).train()
            check_default(log_info_string, cl.out)

            # test with low log_level - lower than info
            with LoggingLevel(logging.INFO), CaptureLogger(logger) as cl:
                get_regression_trainer(log_level="debug", output_dir=tmp_dir).train()
            self.assertIn(log_info_string, cl.out)

            # test with high log_level - should be quiet
            with LoggingLevel(logging.INFO), CaptureLogger(logger) as cl:
                get_regression_trainer(log_level="error", output_dir=tmp_dir).train()
            self.assertNotIn(log_info_string, cl.out)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Metrics tests (FLOS, memory, input tokens)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerMetricsTest(TestCasePlus):
    """Tests for FLOS extraction, memory metrics, and input token counting."""

    def test_flos_extraction(self):
        """The unwrapped model is the trainer's model, and `total_flos` is a float after training."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)

            def assert_flos_extraction(trainer, wrapped_model_to_check):
                unwrapped_model = trainer.accelerator.unwrap_model(wrapped_model_to_check)
                self.assertEqual(trainer.model, unwrapped_model)
                self.assertGreaterEqual(getattr(unwrapped_model.config, "total_flos", 0), 0)

            # with plain model
            assert_flos_extraction(trainer, trainer.model)

            # with enforced DataParallel
            assert_flos_extraction(trainer, nn.DataParallel(trainer.model))

            trainer.train()
            # `assertIsInstance` reports the actual type on failure.
            self.assertIsInstance(trainer.state.total_flos, float)

    def check_mem_metrics(self, trainer, check_func):
        """Run train / evaluate / predict and apply `check_func(key, metrics)` to each memory metric key.

        Args:
            trainer: the Trainer to exercise.
            check_func: a two-argument assertion such as `self.assertIn` or `self.assertNotIn`.

        The `*_gpu_alloc_delta` keys are only checked when `backend_device_count(torch_device) > 0`.
        """
        metrics = trainer.train().metrics
        # Queried after training, once, and reused for the evaluate/predict checks below.
        has_accelerator = backend_device_count(torch_device) > 0

        check_func("init_mem_cpu_alloc_delta", metrics)
        check_func("train_mem_cpu_alloc_delta", metrics)
        if has_accelerator:
            check_func("init_mem_gpu_alloc_delta", metrics)
            check_func("train_mem_gpu_alloc_delta", metrics)

        metrics = trainer.evaluate()
        check_func("eval_mem_cpu_alloc_delta", metrics)
        if has_accelerator:
            check_func("eval_mem_gpu_alloc_delta", metrics)

        metrics = trainer.predict(RegressionDataset()).metrics
        check_func("test_mem_cpu_alloc_delta", metrics)
        if has_accelerator:
            check_func("test_mem_gpu_alloc_delta", metrics)

    def test_mem_metrics(self):
        """Memory metric keys are present with `skip_memory_metrics=False` and absent with `True`."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            # with mem metrics enabled
            trainer = get_regression_trainer(skip_memory_metrics=False, output_dir=tmp_dir)
            self.check_mem_metrics(trainer, self.assertIn)

            # with mem metrics disabled
            trainer = get_regression_trainer(skip_memory_metrics=True, output_dir=tmp_dir)
            self.check_mem_metrics(trainer, self.assertNotIn)

    def test_include_num_input_tokens_seen(self):
        """`num_input_tokens_seen` matches the expected count for each `include_num_input_tokens_seen` mode."""
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        tokenizer.pad_token = "[PAD]"
        model.config.pad_token_id = tokenizer.pad_token_id

        sentences = ["This is a short sentence.", "This is a much longer sentence that will require padding."]
        labels = torch.tensor([0, 1])

        # 1. Dataset with attention_mask
        tokenized_dataset_with_mask = tokenizer(sentences, truncation=True, padding="longest", return_tensors="pt")
        tokenized_dataset_with_mask["labels"] = labels
        dataset_with_mask = datasets.Dataset.from_dict(tokenized_dataset_with_mask)

        # 2. Dataset without attention_mask
        tokenized_dataset_no_mask = {k: v for k, v in tokenized_dataset_with_mask.items() if k != "attention_mask"}
        dataset_no_mask = datasets.Dataset.from_dict(tokenized_dataset_no_mask)

        # 3. Tokenizer with no padding information
        tokenizer_no_pad = AutoTokenizer.from_pretrained("bert-base-cased")
        tokenizer_no_pad.pad_token = None

        data_collator = default_data_collator

        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                output_dir=tmp_dir,
                include_num_input_tokens_seen="non_padding",
                per_device_train_batch_size=2,
                max_steps=1,
            )

            def train_with(train_dataset, processing_class):
                # `args` is read at call time, so later changes to it apply to later runs.
                trainer = Trainer(
                    model=model,
                    args=args,
                    train_dataset=train_dataset,
                    data_collator=data_collator,
                    processing_class=processing_class,
                )
                trainer.train()
                return trainer

            # Test case 1: "non_padding" with attention_mask
            trainer = train_with(dataset_with_mask, tokenizer)
            attention_mask = tokenized_dataset_with_mask["attention_mask"]
            non_padded_tokens_with_mask = attention_mask.sum().item()
            self.assertEqual(trainer.state.num_input_tokens_seen, non_padded_tokens_with_mask)

            # Test case 2: "non_padding" without attention_mask (fallback to pad_token_id)
            trainer = train_with(dataset_no_mask, tokenizer)
            input_ids = tokenized_dataset_with_mask["input_ids"]  # use original to compute expected
            non_padded_tokens_no_mask = (input_ids != tokenizer.pad_token_id).sum().item()
            self.assertEqual(trainer.state.num_input_tokens_seen, non_padded_tokens_no_mask)

            # Test case 3: "non_padding" with no padding info (fallback to numel)
            with self.assertLogs("transformers.trainer", level="WARNING") as cm:
                # dataset still has input_ids; the tokenizer has no pad token
                trainer = train_with(dataset_no_mask, tokenizer_no_pad)
                self.assertTrue(
                    any("Could not determine method to count non-padding tokens" in log for log in cm.output)
                )
            total_tokens = input_ids.numel()
            self.assertEqual(trainer.state.num_input_tokens_seen, total_tokens)

            # Test case 4: "all"
            args.include_num_input_tokens_seen = "all"
            trainer = train_with(dataset_with_mask, tokenizer)
            self.assertEqual(trainer.state.num_input_tokens_seen, total_tokens)

            # Test case 5: True (backward compatibility)
            args.include_num_input_tokens_seen = True
            trainer = train_with(dataset_with_mask, tokenizer)
            self.assertEqual(trainer.state.num_input_tokens_seen, total_tokens)

    def test_get_num_trainable_parameters(self):
        """`get_num_trainable_parameters` counts only parameters with `requires_grad=True`."""
        model = nn.Sequential(nn.Linear(128, 64), nn.Linear(64, 32))
        # in_features * out_features + bias
        layer_1 = 128 * 64 + 64
        layer_2 = 64 * 32 + 32
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir))
            self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2)
            # Freeze the last layer
            for param in model[-1].parameters():
                param.requires_grad = False
            self.assertEqual(trainer.get_num_trainable_parameters(), layer_1)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Step counting and training loss tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerStepCountingTest(TestCasePlus):
    """Checks on the reported training loss, the number of update steps, and epoch handling."""

    def setUp(self):
        """Record the default epoch count and train batch size of `TrainingArguments`."""
        super().setUp()
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def test_training_loss(self):
        """The final `train_loss` matches the mean logged loss and does not depend on `logging_steps`."""
        device_count = max(1, backend_device_count(torch_device))

        # Even logging cadence.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(logging_steps=64 / (8 * device_count), output_dir=tmp_dir)
            trainer.train()
            history = trainer.state.log_history

            logged_losses = []
            for entry in history:
                if "loss" in entry:
                    logged_losses.append(entry["loss"])
            train_loss = history[-1]["train_loss"]
            mean_logged_loss = sum(logged_losses) / len(logged_losses)
            self.assertAlmostEqual(mean_logged_loss, train_loss, places=4)

        # Uneven logging cadence.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(logging_steps=5, output_dir=tmp_dir)
            trainer.train()
            history = trainer.state.log_history

            # The aggregated training loss must be the same as in the first run.
            uneven_train_loss = history[-1]["train_loss"]
            self.assertAlmostEqual(train_loss, uneven_train_loss, places=4)

    def test_number_of_steps_in_training(self):
        """`global_step` follows `num_train_epochs` unless `max_steps` is given."""
        output_dir = self.get_auto_remove_tmp_dir()

        # Regular training has n_epochs * len(train_dl) steps.
        trainer = get_regression_trainer(learning_rate=0.1, output_dir=output_dir)
        result = trainer.train()
        expected_steps = self.n_epochs * 64 / self.batch_size
        self.assertEqual(result.global_step, expected_steps)

        # An explicit (and fractional) num_train_epochs is honoured.
        trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5, output_dir=output_dir)
        result = trainer.train()
        expected_steps = int(1.5 * 64 / self.batch_size)
        self.assertEqual(result.global_step, expected_steps)

        # When max_steps is passed, num_train_epochs is ignored.
        trainer = get_regression_trainer(learning_rate=0.1, max_steps=10, output_dir=output_dir)
        result = trainer.train()
        self.assertEqual(result.global_step, 10)

    def test_num_train_epochs_in_training(self):
        """Training still steps once per epoch when the dataloader is shorter than the accumulation window."""
        # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when
        # ``max_steps`` is given. It should give 1 update step for each epoch.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                max_steps=3,
                train_len=64,
                per_device_train_batch_size=16,
                gradient_accumulation_steps=5,
                output_dir=tmp_dir,
            )
            result = trainer.train()
            self.assertEqual(result.global_step, 3)

            # Even if ``max_steps`` is not specified, we still expect 1 update step for each epoch
            # when len(train_dl) < gradient_accumulation_steps.
            trainer = get_regression_trainer(
                train_len=64,
                per_device_train_batch_size=16,
                gradient_accumulation_steps=5,
                output_dir=tmp_dir,
            )
            result = trainer.train()
            expected_steps = int(self.n_epochs)
            self.assertEqual(result.global_step, expected_steps)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Reproducibility tests (pre-run training to check determinism across configs)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
    """
    Tests that rely on the two reference trainings performed in `setUp`:
        - self.default_trained_model
        - self.alternate_trained_model
    used directly, or through `check_trained_model`.
    """

    def setUp(self):
        """Train two reference regression models (default seed and seed=314) and keep their `a`/`b`."""
        super().setUp()
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

        # Reference run with the default seed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            reference = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            reference.train()
            self.default_trained_model = (reference.model.a, reference.model.b)

        # Reference run with an alternate seed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            reference = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir)
            reference.train()
            self.alternate_trained_model = (reference.model.a, reference.model.b)

    def check_trained_model(self, model, alternate_seed=False, **kwargs):
        """
        Assert that `model.a` and `model.b` are close to one of the reference trainings.

        Args:
            model: Object exposing `a` and `b` attributes to compare.
            alternate_seed: When true, compare against the seed=314 reference instead of the default one.
            **kwargs: Forwarded unchanged to `torch.testing.assert_close`.
        """
        if alternate_seed:
            expected_a, expected_b = self.alternate_trained_model
        else:
            expected_a, expected_b = self.default_trained_model
        torch.testing.assert_close(model.a, expected_a, **kwargs)
        torch.testing.assert_close(model.b, expected_b, **kwargs)

    def test_reproducible_training(self):
        """Re-running the reference configurations reproduces the reference parameters."""
        # Same configuration as the default reference run: must match it.
        with tempfile.TemporaryDirectory() as tmp_dir:
            rerun = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            rerun.train()
            self.check_trained_model(rerun.model)

        # Same configuration as the alternate-seed reference run: must match that one.
        with tempfile.TemporaryDirectory() as tmp_dir:
            rerun = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir)
            rerun.train()
            self.check_trained_model(rerun.model, alternate_seed=True)

    def test_trainer_with_datasets(self):
        """Training from a `datasets.Dataset` is checked against the default reference training."""
        np.random.seed(42)
        x = np.random.normal(size=(64,)).astype(np.float32)
        noise = np.random.normal(scale=0.1, size=(64,)).astype(np.float32)
        y = 2.0 * x + 3.0 + noise
        train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})

        # Base training. Should have the same results as test_reproducible_training
        model = RegressionModel()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, learning_rate=0.1)
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

            # The dataset can also return torch tensors.
            train_dataset.set_format(type="torch", dtype=torch.float32)
            model = RegressionModel()
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

            # Adding one column not used by the model should have no impact
            z = np.random.normal(size=(64,)).astype(np.float32)
            train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
            model = RegressionModel()
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

    def test_model_init(self):
        """With `model_init`, every `train()` call is checked against a reference training."""

        def build_model():
            # Zero-argument factory handed to the Trainer as `model_init`.
            return RegressionModel()

        train_dataset = RegressionDataset()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, learning_rate=0.1)
            trainer = Trainer(args=args, train_dataset=train_dataset, model_init=build_model)
            trainer.train()
            self.check_trained_model(trainer.model)

            # Re-training should restart from scratch, and thus lead to the same results.
            trainer.train()
            self.check_trained_model(trainer.model)

            # Re-training should restart from scratch and pick up the new seed.
            trainer.args.seed = 314
            trainer.train()
            self.check_trained_model(trainer.model, alternate_seed=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Torch compile tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerTorchCompileTest(TestCasePlus):
    """Tests exercising the Trainer together with `torch.compile`."""

    @pytest.mark.torch_compile_test
    def test_torch_compile_loss_func_compatibility(self):
        """Run a single training step on a tiny Llama model with `torch_compile=True`."""
        llama_config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(llama_config)

        token_ids = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(token_ids)

        output_dir = self.get_auto_remove_tmp_dir()
        args = TrainingArguments(
            output_dir,
            per_device_train_batch_size=2,
            torch_compile=True,
            max_steps=1,  # compile happens on the first step
        )
        trainer = Trainer(model=tiny_llama, args=args, train_dataset=train_dataset)  # noqa
        trainer.train()

    @require_peft
    @require_bitsandbytes
    @pytest.mark.torch_compile_test
    def test_bnb_compile(self):
        """Building a Trainer from a 4-bit PEFT model wrapped in `torch.compile` raises `ValueError`."""
        from peft import LoraConfig, get_peft_model

        # QLoRA + torch compile is not really supported yet. This test only goes as far as
        # loading the quantized model, wrapping it, and checking that constructing the Trainer
        # raises a ValueError.
        quantization = BitsAndBytesConfig(load_in_4bit=True)
        tiny_model = AutoModelForCausalLM.from_pretrained(
            "hf-internal-testing/tiny-random-LlamaForCausalLM",
            quantization_config=quantization,
        )

        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        tiny_model = get_peft_model(tiny_model, lora_config)

        tiny_model = torch.compile(tiny_model)

        token_ids = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(token_ids)

        output_dir = self.get_auto_remove_tmp_dir()
        args = TrainingArguments(
            output_dir,
            learning_rate=1e-9,
            logging_steps=5,
        )
        with self.assertRaises(ValueError):
            Trainer(tiny_model, args, train_dataset=train_dataset)  # noqa

    @require_torch_accelerator
    @pytest.mark.torch_compile_test
    def test_torch_compile_train(self):
        """The training loss with `torch_compile=True` matches the loss of an uncompiled run."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            eager_trainer = get_regression_trainer(output_dir=tmp_dir)
            eager_output = eager_trainer.train()
            original_train_loss = eager_output.training_loss

            compiled_trainer = get_regression_trainer(torch_compile=True, output_dir=tmp_dir)
            compiled_output = compiled_trainer.train()
            self.assertAlmostEqual(compiled_output.training_loss, original_train_loss)

    @require_torch_accelerator
    @pytest.mark.torch_compile_test
    def test_torch_compile_eval(self):
        """The eval loss with `torch_compile=True` matches the uncompiled eval loss within 1e-6."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            eager_trainer = get_regression_trainer(output_dir=tmp_dir)
            eager_metrics = eager_trainer.evaluate()
            original_eval_loss = eager_metrics["eval_loss"]

            compiled_trainer = get_regression_trainer(torch_compile=True, output_dir=tmp_dir)
            compiled_metrics = compiled_trainer.evaluate()

            self.assertAlmostEqual(compiled_metrics["eval_loss"], original_eval_loss, delta=1e-6)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Early stopping tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerEarlyStoppingTest(TestCasePlus):
    """Tests for `EarlyStoppingCallback` used with the regression trainer."""

    def _early_stopping_trainer(self, output_dir, **overrides):
        """Build the 20-epoch regression trainer shared by all scenarios; `overrides` are passed through."""
        return get_regression_trainer(
            output_dir=output_dir,
            num_train_epochs=20,
            gradient_accumulation_steps=1,
            per_device_train_batch_size=16,
            eval_strategy=IntervalStrategy.EPOCH,
            compute_metrics=AlmostAccuracy(),
            metric_for_best_model="accuracy",
            **overrides,
        )

    def test_early_stopping_callback(self):
        """Exercise early stopping with and without `load_best_model_at_end` / `save_strategy`."""
        # Early stopping halts training before all num_train_epochs have run.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = self._early_stopping_trainer(
                tmp_dir,
                load_best_model_at_end=True,
                save_strategy=IntervalStrategy.EPOCH,
            )
            trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
            result = trainer.train()
            max_possible_steps = 20 * 64 / 16
            self.assertLess(result.global_step, max_possible_steps)

        # Invalid inputs to trainer with early stopping callback result in assertion error
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = self._early_stopping_trainer(tmp_dir)
            trainer.add_callback(EarlyStoppingCallback(1))
            self.assertEqual(trainer.state.global_step, 0)
            # NOTE(review): if `train()` raises no AssertionError, nothing is asserted here.
            try:
                trainer.train()
            except AssertionError:
                self.assertEqual(trainer.state.global_step, 0)

        # even if load_best_model_at_end is False, `best_model_checkpoint` should be set
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = self._early_stopping_trainer(
                tmp_dir,
                load_best_model_at_end=False,
                save_strategy=IntervalStrategy.EPOCH,
            )
            trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
            trainer.train()
            self.assertIsNotNone(trainer.state.best_model_checkpoint)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Liger kernel tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerLigerKernelTest(TestCasePlus):
    """
    Tests for the Liger kernel integration.

    Every test reloads `modeling_llama` afterwards so that the Liger monkey patches do not leak
    into other tests. The reload is registered with `addCleanup` before any patching happens, so
    it also runs when the test fails or raises part-way through.
    """

    @staticmethod
    def _make_tiny_llama():
        """Return a freshly initialised 3-layer Llama causal LM with a 100-token vocabulary."""
        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        return LlamaForCausalLM(config)

    @require_liger_kernel
    def test_use_liger_kernel_patching(self):
        """`apply_liger_kernel` with an empty config patches both the module and the model instance."""
        import importlib

        from liger_kernel.transformers import liger_rotary_pos_emb

        from transformers.integrations.liger import apply_liger_kernel
        from transformers.models.llama import modeling_llama

        # Restore the original module to avoid leaking monkey patches to other tests,
        # even if one of the assertions below fails.
        self.addCleanup(importlib.reload, modeling_llama)

        tiny_llama = self._make_tiny_llama()

        # Spot check that modeling code and model instance variables are not yet patched
        self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
        self.assertNotIn("LigerRMSNorm", repr(tiny_llama.model.norm))

        apply_liger_kernel(tiny_llama, {})

        # Spot check that modeling code and model instance variables are patched
        self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
        self.assertIn("LigerRMSNorm", repr(tiny_llama.model.norm))

    @require_liger_kernel
    def test_use_liger_kernel_custom_config_patching(self):
        """With `{"rms_norm": False}` the model's final norm is not replaced by `LigerRMSNorm`."""
        import importlib

        from liger_kernel.transformers import LigerRMSNorm

        from transformers.integrations.liger import apply_liger_kernel
        from transformers.models.llama import modeling_llama

        # Restore the original module to avoid leaking monkey patches to other tests,
        # even if the assertion below fails.
        self.addCleanup(importlib.reload, modeling_llama)

        tiny_llama = self._make_tiny_llama()

        apply_liger_kernel(tiny_llama, {"rms_norm": False})

        # Check that the RMSNorm kernel is not applied as specified in the config
        self.assertNotIsInstance(tiny_llama.model.norm, LigerRMSNorm)

    @require_liger_kernel
    @require_torch_accelerator
    @require_torch_non_multi_accelerator  # Don't work with DP
    def test_use_liger_kernel_trainer(self):
        """A 20-step training run with `use_liger_kernel=True` completes without raising."""
        import importlib

        from transformers.models.llama import modeling_llama

        # Restore the original module to avoid leaking monkey patches to other tests,
        # even if training raises.
        self.addCleanup(importlib.reload, modeling_llama)

        tiny_llama = self._make_tiny_llama()
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-2,
            logging_steps=5,
            max_steps=20,
            use_liger_kernel=True,
        )
        Trainer(tiny_llama, args, train_dataset=train_dataset).train()

    @require_liger_kernel
    @require_torch_accelerator
    @require_torch_non_multi_accelerator  # don't work with DP
    def test_use_liger_kernel_custom_config_trainer(self):
        """A 20-step training run with a custom `liger_kernel_config` completes without raising."""
        import importlib

        from transformers.models.llama import modeling_llama

        # Restore the original module to avoid leaking monkey patches to other tests,
        # even if training raises.
        self.addCleanup(importlib.reload, modeling_llama)

        tiny_llama = self._make_tiny_llama()
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-2,
            logging_steps=5,
            max_steps=20,
            use_liger_kernel=True,
            liger_kernel_config={"rms_norm": False, "cross_entropy": True, "fused_linear_cross_entropy": False},
        )
        Trainer(tiny_llama, args, train_dataset=train_dataset).train()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Miscellaneous integration tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
|
|
class TrainerIntegrationTest(TestCasePlus):
|
|
"""Integration tests: compatibility, and e2e."""
|
|
|
|
@slow
@run_first
@require_non_hpu
@require_torch_multi_accelerator
def test_end_to_end_example(self):
    """Run the `run_translation.py` example through `accelerate launch` and expect it to finish."""
    # Tests that `translation.py` will run without issues
    this_dir = os.path.dirname(__file__)
    script_path = os.path.abspath(
        os.path.join(this_dir, "..", "..", "examples", "pytorch", "translation", "run_translation.py")
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        launcher = ["accelerate", "launch", script_path]
        model_options = [
            "--model_name_or_path",
            "google-t5/t5-small",
            "--per_device_train_batch_size",
            "1",
            "--output_dir",
            tmpdir,
        ]
        train_options = ["--do_train", "--max_train_samples", "64", "--num_train_epochs", "1"]
        data_options = [
            "--dataset_name",
            "wmt16",
            "--dataset_config",
            "ro-en",
            "--source_lang",
            "en",
            "--target_lang",
            "ro",
        ]
        predict_options = ["--do_predict", "--max_predict_samples", "64", "--predict_with_generate"]
        distributed_options = ["--ddp_timeout", "60"]

        command = (
            launcher + model_options + train_options + data_options + predict_options + distributed_options
        )
        execute_subprocess_async(command)
        # Returning normally is the success criterion: any error would have surfaced as an
        # exception or a timeout in the sub-call.
|
|
|
|
def test_special_token_alignment(self):
    """
    Check that special tokens changed on the tokenizer are propagated to the model config by the
    trainer: the eos/pad/bos ids differ before `train()` and are equal after it.
    """
    checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM"
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Change the tokenizer's special tokens so there is something for the trainer to align.
    tokenizer.eos_token = "<|im_end|>"
    tokenizer.pad_token = "<|im_end|>"
    tokenizer.bos_token = "<|im_start|>"
    tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_end|>", "<|im_start|>"]})

    # the model needs to have its embedding layer resized accordingly
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)

    # create a random dataset from the **new** vocab size
    token_ids = torch.randint(0, len(tokenizer), (64,))
    dataset = RepeatDataset(token_ids, length=2)

    special_token_ids = ("eos_token_id", "pad_token_id", "bos_token_id")

    with tempfile.TemporaryDirectory() as tmpdir:
        training_args = TrainingArguments(output_dir=tmpdir, max_steps=1, per_device_train_batch_size=1)
        trainer = Trainer(
            model=model,
            args=training_args,
            processing_class=tokenizer,
            train_dataset=dataset,
        )

        # We haven't started training -> not yet aligned
        for attribute in special_token_ids:
            self.assertNotEqual(getattr(trainer.model.config, attribute), getattr(tokenizer, attribute))

        trainer.train()

        # Must be aligned as soon as we start training
        for attribute in special_token_ids:
            self.assertEqual(getattr(trainer.model.config, attribute), getattr(tokenizer, attribute))
|
|
|
|
def test_trainer_works_without_model_config(self):
    """
    Tests that models without a `config` parameter can still be trained.
    This is useful for preserving compatibility with third parties that train different models
    using the transformers Trainer.

    If this test fails, it doesn't imply that there are issues with transformers, but perhaps
    with third parties.
    """
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
    vocab_size = tokenizer.vocab_size
    model = BasicTextGenerationModel(vocab_size=vocab_size, hidden_size=32)

    train_dataset = get_dataset(PATH_SAMPLE_TEXT, tokenizer, 100)

    with tempfile.TemporaryDirectory() as tmpdir:
        training_args = TrainingArguments(
            output_dir=tmpdir,
            max_steps=5,
            per_device_train_batch_size=1,
            use_cpu=True,
        )
        trainer_kwargs = {
            "model": model,
            "args": training_args,
            "processing_class": tokenizer,
            "train_dataset": train_dataset,
        }
        trainer = Trainer(**trainer_kwargs)
        trainer.train()
|
|
|
|
def test_training_arguments_are_left_untouched(self):
    """After training, the trainer's arguments still equal freshly built default arguments."""
    output_dir = self.get_auto_remove_tmp_dir()
    trainer = get_regression_trainer(output_dir=output_dir)
    trainer.train()
    fresh_args = TrainingArguments(output_dir)
    expected = fresh_args.to_dict()
    actual = trainer.args.to_dict()
    # Only keys present in the fresh arguments are compared.
    for key, expected_value in expected.items():
        self.assertEqual(expected_value, actual[key])
|
|
|
|
def test_double_train_wrap_once(self):
    """Calling `train()` twice must leave `trainer.model_wrapped` as the very same object."""
    # Wrapping primarily happens on multi-gpu setups, where the failure mode would be e.g.
    # DataParallel(DataParallel(model)).
    output_dir = self.get_auto_remove_tmp_dir()
    trainer = get_regression_trainer(output_dir=output_dir)

    trainer.train()
    wrapped_after_first_run = trainer.model_wrapped

    trainer.train()
    wrapped_after_second_run = trainer.model_wrapped

    self.assertIs(wrapped_after_first_run, wrapped_after_second_run, "should be not wrapped twice")
|