Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
520 lines
22 KiB
Python
520 lines
22 KiB
Python
# Copyright 2018 the HuggingFace Inc. team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Trainer evaluation and prediction tests: evaluate, predict, batched metrics, dynamic shapes,
|
|
iterable datasets, early stopping, FP16/BF16 full eval memory, torch.compile, and MRPC/LM eval.
|
|
"""
|
|
|
|
import gc
|
|
import tempfile
|
|
|
|
import numpy as np
|
|
|
|
from transformers import (
|
|
AutoTokenizer,
|
|
TrainingArguments,
|
|
is_torch_available,
|
|
)
|
|
from transformers.testing_utils import (
|
|
TestCasePlus,
|
|
backend_device_count,
|
|
get_tests_dir,
|
|
require_torch,
|
|
require_torch_accelerator,
|
|
require_torch_bf16,
|
|
require_torch_fp16,
|
|
slow,
|
|
torch_device,
|
|
)
|
|
|
|
from .trainer_test_utils import (
|
|
PATH_SAMPLE_TEXT,
|
|
AlmostAccuracy,
|
|
AlmostAccuracyBatched,
|
|
RegressionDataset,
|
|
RegressionDictModel,
|
|
TrainerIntegrationCommon,
|
|
get_dataset,
|
|
get_regression_trainer,
|
|
)
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
from transformers import (
|
|
AutoModelForCausalLM,
|
|
AutoModelForSequenceClassification,
|
|
GlueDataset,
|
|
GlueDataTrainingArguments,
|
|
Trainer,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Core evaluate / predict tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerEvaluationTest(TestCasePlus, TrainerIntegrationCommon):
    """Core `Trainer.evaluate` / `Trainer.predict` tests built on the regression test trainer."""

    def setUp(self):
        """Record the epoch count and train batch size of a default `TrainingArguments`."""
        super().setUp()
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

    def test_evaluate(self):
        """`evaluate` must report the loss and accuracy expected for predictions `1.5 * x + 2.5`."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir)
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            # Reference loss is the mean squared error between the expected predictions and the labels.
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # With a number of elements not a round multiple of the batch size
            trainer = get_regression_trainer(
                a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # With logits preprocess
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                compute_metrics=AlmostAccuracy(),
                preprocess_logits_for_metrics=lambda logits, labels: logits + 1,
                output_dir=tmp_dir,
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            # The loss is still checked against the raw predictions ...
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            # ... while the metric is checked against the preprocessed (shifted by one) logits.
            expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

    def test_predict(self):
        """`predict` must return `1.5 * x + 2.5` for single/multiple outputs and the labels for multiple label names."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(a=1.5, b=2.5, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

            # With a number of elements not a round multiple of the batch size
            trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

            # With more than one output of the model
            trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))

            # With more than one output/label of the model
            trainer = get_regression_trainer(
                a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], output_dir=tmp_dir
            )
            outputs = trainer.predict(trainer.eval_dataset)
            preds = outputs.predictions
            labels = outputs.label_ids
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))
            self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0]))
            self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1]))

    def test_train_and_predict_loss_parity(self):
        """
        Tests that the loss computed during a training_step is the same as the one computed during
        prediction_step for the same inputs.
        """
        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
        # Create a dummy batch of four sequences with random lengths in [32, 64) and token ids in [1, 100)
        sequences = []
        for _ in range(4):
            seq_len = torch.randint(32, 64, (1,)).item()
            sequences.append(torch.randint(1, 100, (seq_len,)))
        inputs = {}
        # Token id 0 is never sampled above, so after padding it marks padding positions only.
        inputs["input_ids"] = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
        inputs["labels"] = inputs["input_ids"].clone()
        inputs["labels"][inputs["input_ids"] == 0] = -100
        # Count the non-ignored labels, skipping the first position of every row.
        num_items_in_batch = inputs["labels"][..., 1:].ne(-100).sum().item()

        def custom_loss_func(outputs, labels, num_items_in_batch=None):
            logits = outputs["logits"]
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            if num_items_in_batch is not None:
                # Divide by the item count when the caller supplies one.
                return loss / num_items_in_batch
            return loss

        # Use a temporary output directory, like the other tests in this file, instead of the Trainer default.
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir)
            trainer = Trainer(model, args, train_dataset=None, compute_loss_func=custom_loss_func)

            # Run one training step and one prediction step on the very same batch.
            train_loss = trainer.training_step(model, inputs, num_items_in_batch)
            predict_loss = trainer.prediction_step(model, inputs, prediction_loss_only=True)[0]

        torch.testing.assert_close(train_loss, predict_loss, atol=1e-6, rtol=0)

    def test_eval_use_gather_object(self):
        """Smoke test: train, evaluate and predict must run with `eval_use_gather_object=True`."""
        train_dataset = RegressionDataset()
        eval_dataset = RegressionDataset()
        model = RegressionDictModel()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, eval_use_gather_object=True)
            trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
            trainer.train()
            # Return values are irrelevant; the calls only have to complete without raising.
            _ = trainer.evaluate()
            _ = trainer.predict(eval_dataset)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Batch eval metrics tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerBatchEvalMetricsTest(TestCasePlus, TrainerIntegrationCommon):
    """Evaluate / predict checks with `batch_eval_metrics=True` and the batched accuracy metric."""

    def setUp(self):
        """Store the epoch count and train batch size of a default `TrainingArguments`."""
        super().setUp()
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def test_evaluate_with_batch_eval_metrics(self):
        """Batched metrics must give the same loss / accuracy as the reference `AlmostAccuracy`."""
        with tempfile.TemporaryDirectory() as tmp_dir:

            def make_trainer(**extra):
                # Every scenario gets its own trainer and its own fresh metric object.
                return get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    compute_metrics=AlmostAccuracyBatched(),
                    batch_eval_metrics=True,
                    output_dir=tmp_dir,
                    **extra,
                )

            # Default evaluation length
            batched_trainer = make_trainer()
            metrics = batched_trainer.evaluate()

            eval_set = batched_trainer.eval_dataset
            features, targets = eval_set.x, eval_set.ys[0]
            predicted = 1.5 * features + 2.5
            mse = ((predicted - targets) ** 2).mean()
            self.assertAlmostEqual(metrics["eval_loss"], mse)
            accuracy = AlmostAccuracy()((predicted, targets))["accuracy"]
            self.assertAlmostEqual(metrics["eval_accuracy"], accuracy)

            # With a number of elements not a round multiple of the batch size
            batched_trainer = make_trainer(eval_len=66)
            metrics = batched_trainer.evaluate()

            eval_set = batched_trainer.eval_dataset
            features, targets = eval_set.x, eval_set.ys[0]
            predicted = 1.5 * features + 2.5
            mse = ((predicted - targets) ** 2).mean()
            self.assertAlmostEqual(metrics["eval_loss"], mse)
            accuracy = AlmostAccuracy()((predicted, targets))["accuracy"]
            self.assertAlmostEqual(metrics["eval_accuracy"], accuracy)

            # With logits preprocess
            batched_trainer = make_trainer(preprocess_logits_for_metrics=lambda logits, labels: logits + 1)
            metrics = batched_trainer.evaluate()

            eval_set = batched_trainer.eval_dataset
            features, targets = eval_set.x, eval_set.ys[0]
            predicted = 1.5 * features + 2.5
            mse = ((predicted - targets) ** 2).mean()
            self.assertAlmostEqual(metrics["eval_loss"], mse)
            # The metric sees the shifted logits, hence the `+ 1` in the reference.
            accuracy = AlmostAccuracy()((predicted + 1, targets))["accuracy"]
            self.assertAlmostEqual(metrics["eval_accuracy"], accuracy)

    def test_predict_with_batch_eval_metrics(self):
        """`predict` with batched metrics must return the expected predictions, labels and accuracy."""
        with tempfile.TemporaryDirectory() as tmp_dir:

            def make_trainer(**extra):
                # Every scenario gets its own trainer and its own fresh metric object.
                return get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    compute_metrics=AlmostAccuracyBatched(),
                    batch_eval_metrics=True,
                    output_dir=tmp_dir,
                    **extra,
                )

            # Default evaluation length
            batched_trainer = make_trainer()
            prediction_output = batched_trainer.predict(batched_trainer.eval_dataset)
            predictions = prediction_output.predictions
            features, targets = batched_trainer.eval_dataset.x, batched_trainer.eval_dataset.ys[0]
            reference = 1.5 * features + 2.5
            self.assertTrue(np.allclose(predictions, reference))
            accuracy = AlmostAccuracy()((predictions, targets))["accuracy"]
            self.assertAlmostEqual(prediction_output.metrics["test_accuracy"], accuracy)

            # With a number of elements not a round multiple of the batch size
            batched_trainer = make_trainer(eval_len=66)
            prediction_output = batched_trainer.predict(batched_trainer.eval_dataset)
            predictions = prediction_output.predictions
            features, targets = batched_trainer.eval_dataset.x, batched_trainer.eval_dataset.ys[0]
            self.assertTrue(np.allclose(predictions, 1.5 * features + 2.5))
            accuracy = AlmostAccuracy()((predictions, targets))["accuracy"]
            self.assertAlmostEqual(prediction_output.metrics["test_accuracy"], accuracy)

            # With more than one output of the model
            batched_trainer = make_trainer(double_output=True)
            predictions = batched_trainer.predict(batched_trainer.eval_dataset).predictions
            features = batched_trainer.eval_dataset.x
            self.assertEqual(len(predictions), 2)
            self.assertTrue(np.allclose(predictions[0], 1.5 * features + 2.5))
            self.assertTrue(np.allclose(predictions[1], 1.5 * features + 2.5))

            # With more than one output/label of the model
            batched_trainer = make_trainer(double_output=True, label_names=["labels", "labels_2"])
            prediction_output = batched_trainer.predict(batched_trainer.eval_dataset)
            predictions = prediction_output.predictions
            label_ids = prediction_output.label_ids
            features = batched_trainer.eval_dataset.x
            self.assertEqual(len(predictions), 2)
            self.assertTrue(np.allclose(predictions[0], 1.5 * features + 2.5))
            self.assertTrue(np.allclose(predictions[1], 1.5 * features + 2.5))
            self.assertTrue(np.array_equal(label_ids[0], batched_trainer.eval_dataset.ys[0]))
            self.assertTrue(np.array_equal(label_ids[1], batched_trainer.eval_dataset.ys[1]))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# FP16 / BF16 full eval memory tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
class TrainerFullEvalMemoryTest(TestCasePlus):
    """Accelerator memory checks for evaluation with `fp16_full_eval` / `bf16_full_eval`."""

    def _check_half_precision_full_eval_memory(self, full_eval_flag: str, tag: str) -> None:
        """
        Compare the memory metrics of a plain fp32 evaluation with a full half-precision evaluation.

        Args:
            full_eval_flag: name of the boolean trainer argument to enable, i.e. `"fp16_full_eval"` or
                `"bf16_full_eval"`.
            tag: short prefix (`"fp16"` / `"bf16"`) used in the optional debug printouts.
        """
        # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
        # it's using pretty large safety margins, but small enough to detect broken functionality.
        debug = 0
        n_gpus = backend_device_count(torch_device)

        bs = 8
        eval_len = 16 * n_gpus
        # make the params somewhat big so that there will be enough RAM consumed to be able to
        # measure things. We should get about 64KB for a+b in fp32
        a = torch.ones(1000, bs) + 0.001
        b = torch.ones(1000, bs) - 0.001

        with tempfile.TemporaryDirectory() as tmp_dir:
            # 1. with the full-eval flag disabled (fp32 baseline)
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, skip_memory_metrics=False, output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            # Drop the baseline trainer before building the second one.
            del trainer
            gc.collect()

            fp32_init = metrics["init_mem_gpu_alloc_delta"]
            fp32_eval = metrics["eval_mem_gpu_alloc_delta"]

            if debug:
                print(f"fp32_init {fp32_init}")
                print(f"fp32_eval {fp32_eval}")

            # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram.
            # perfect world: fp32_init == 64<<10
            self.assertGreater(fp32_init, 59_000)
            # after eval should be no extra memory allocated - with a small margin (other than the peak
            # memory consumption for the forward calculation that gets recovered)
            # perfect world: fp32_eval == close to zero
            self.assertLess(fp32_eval, 5_000)

            # 2. with the full-eval flag enabled
            trainer = get_regression_trainer(
                a=a,
                b=b,
                eval_len=eval_len,
                skip_memory_metrics=False,
                output_dir=tmp_dir,
                **{full_eval_flag: True},
            )
            metrics = trainer.evaluate()
            half_init = metrics["init_mem_gpu_alloc_delta"]
            half_eval = metrics["eval_mem_gpu_alloc_delta"]

            if debug:
                print(f"{tag}_init {half_init}")
                print(f"{tag}_eval {half_eval}")

            # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it
            # should be close to 0
            # perfect world: half_init == close to zero
            self.assertLess(half_init, 5_000)
            # here we put the model on device in eval and only a half-precision copy of it, i.e. about 32K
            # (again we ignore the peak margin which gets returned back)
            # perfect world: half_eval == 32<<10
            self.assertGreater(half_eval, 27_000)

            # 3. relative comparison fp32 vs full half precision
            # half_eval should be about half of fp32_init
            # perfect world: fp32_init/2 == half_eval
            self.assertAlmostEqual(half_eval, fp32_init / 2, delta=5_000)

    @require_torch_fp16
    @require_torch_accelerator
    def test_fp16_full_eval(self):
        """With `fp16_full_eval=True` the eval memory delta must be about half of the fp32 init delta."""
        self._check_half_precision_full_eval_memory("fp16_full_eval", "fp16")

    @require_torch_accelerator
    @require_torch_bf16
    def test_bf16_full_eval(self):
        """Same check as `test_fp16_full_eval`, with `bf16_full_eval=True`."""
        self._check_half_precision_full_eval_memory("bf16_full_eval", "bf16")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Slow external model eval tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@require_torch
|
|
class TrainerSlowEvalTest(TestCasePlus):
|
|
@slow
|
|
def test_trainer_eval_mrpc(self):
    """Evaluating the fine-tuned MRPC checkpoint on the MRPC dev fixtures must give a loss below 0.2."""
    checkpoint = "google-bert/bert-base-cased-finetuned-mrpc"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    # Build the dev split from the MRPC samples under the tests directory, ignoring any cached features.
    mrpc_dir = f"{get_tests_dir()}/fixtures/tests_samples/MRPC"
    glue_args = GlueDataTrainingArguments(task_name="mrpc", data_dir=mrpc_dir, overwrite_cache=True)
    dev_set = GlueDataset(glue_args, tokenizer=tokenizer, mode="dev")

    with tempfile.TemporaryDirectory() as tmp_dir:
        cpu_args = TrainingArguments(output_dir=tmp_dir, use_cpu=True)
        metrics = Trainer(model=model, args=cpu_args, eval_dataset=dev_set).evaluate()
        self.assertLess(metrics["eval_loss"], 0.2)
|
|
|
|
@slow
|
|
def test_trainer_eval_multiple(self):
    """Evaluating with a dict of eval datasets must report one loss entry per dataset name."""
    checkpoint = "openai-community/gpt2"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint)

    text_dataset = get_dataset(PATH_SAMPLE_TEXT, tokenizer, 100)
    with tempfile.TemporaryDirectory() as tmp_dir:
        cpu_args = TrainingArguments(output_dir=tmp_dir, use_cpu=True, per_device_eval_batch_size=1)
        # The same dataset is registered under two names.
        named_eval_sets = {"data1": text_dataset, "data2": text_dataset}
        trainer = Trainer(model=model, args=cpu_args, eval_dataset=named_eval_sets)
        metrics = trainer.evaluate()
        self.assertIn("eval_data1_loss", metrics)
        self.assertIn("eval_data2_loss", metrics)
|
|
|
|
@slow
|
|
def test_trainer_eval_lm(self):
|
|
MODEL_ID = "distilbert/distilroberta-base"
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
|
dataset = get_dataset(PATH_SAMPLE_TEXT, tokenizer, 100)
|
|
self.assertEqual(len(dataset), 31)
|