# Copyright 2018 the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Trainer evaluation and prediction tests: evaluate, predict, batched metrics, dynamic shapes, iterable datasets, early stopping, FP16/BF16 full eval memory, torch.compile, and MRPC/LM eval. """ import gc import tempfile import numpy as np from transformers import ( AutoTokenizer, TrainingArguments, is_torch_available, ) from transformers.testing_utils import ( TestCasePlus, backend_device_count, get_tests_dir, require_torch, require_torch_accelerator, require_torch_bf16, require_torch_fp16, slow, torch_device, ) from .trainer_test_utils import ( PATH_SAMPLE_TEXT, AlmostAccuracy, AlmostAccuracyBatched, RegressionDataset, RegressionDictModel, TrainerIntegrationCommon, get_dataset, get_regression_trainer, ) if is_torch_available(): import torch from transformers import ( AutoModelForCausalLM, AutoModelForSequenceClassification, GlueDataset, GlueDataTrainingArguments, Trainer, ) # --------------------------------------------------------------------------- # Core evaluate / predict tests # --------------------------------------------------------------------------- @require_torch class TrainerEvaluationTest(TestCasePlus, TrainerIntegrationCommon): def setUp(self): super().setUp() args = TrainingArguments("..") self.n_epochs = args.num_train_epochs self.batch_size = args.train_batch_size def test_evaluate(self): with tempfile.TemporaryDirectory() as tmp_dir: trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir) results = trainer.evaluate() x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] pred = 1.5 * x + 2.5 expected_loss = ((pred - y) ** 2).mean() self.assertAlmostEqual(results["eval_loss"], expected_loss) expected_acc = AlmostAccuracy()((pred, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) # With a number of elements not a round multiple of the batch size trainer = get_regression_trainer( a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir ) results = trainer.evaluate() x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] pred = 1.5 * x + 2.5 expected_loss = ((pred - y) ** 2).mean() self.assertAlmostEqual(results["eval_loss"], expected_loss) expected_acc = AlmostAccuracy()((pred, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) # With logits preprocess trainer = get_regression_trainer( a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), preprocess_logits_for_metrics=lambda logits, labels: logits + 1, output_dir=tmp_dir, ) results = trainer.evaluate() x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] pred = 1.5 * x + 2.5 expected_loss = ((pred - y) ** 2).mean() self.assertAlmostEqual(results["eval_loss"], expected_loss) expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) def test_predict(self): with tempfile.TemporaryDirectory() as tmp_dir: trainer = get_regression_trainer(a=1.5, b=2.5, output_dir=tmp_dir) preds = trainer.predict(trainer.eval_dataset).predictions x = trainer.eval_dataset.x self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) # With a number of elements not a round multiple of the batch size trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, output_dir=tmp_dir) preds = trainer.predict(trainer.eval_dataset).predictions x = trainer.eval_dataset.x self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) # With more than one output of the model trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, output_dir=tmp_dir) preds = trainer.predict(trainer.eval_dataset).predictions x = trainer.eval_dataset.x self.assertEqual(len(preds), 2) self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) # With more than one output/label of the model trainer = get_regression_trainer( a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], output_dir=tmp_dir ) outputs = trainer.predict(trainer.eval_dataset) preds = outputs.predictions labels = outputs.label_ids x = trainer.eval_dataset.x self.assertEqual(len(preds), 2) self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) def test_train_and_predict_loss_parity(self): """ Tests that the loss computed during a training_step is the same as the one computed during prediction_step. for the same inputs """ model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM") # Create a dummy batch of inputs inputs = {} inputs["input_ids"] = [] for row_ind in range(4): seq_len = torch.randint(32, 64, (1,)).item() x = torch.randint(1, 100, (seq_len,)) inputs["input_ids"].append(x) inputs["input_ids"] = torch.nn.utils.rnn.pad_sequence(inputs["input_ids"], batch_first=True, padding_value=0) inputs["labels"] = inputs["input_ids"].clone() inputs["labels"][inputs["input_ids"] == 0] = -100 num_items_in_batch = inputs["labels"][..., 1:].ne(-100).sum().item() def custom_loss_func(outputs, labels, num_items_in_batch=None): logits = outputs["logits"] loss_fct = torch.nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) if num_items_in_batch is not None: return loss / num_items_in_batch # multiply by number of items to get the sum return loss trainer = Trainer(model, train_dataset=None, compute_loss_func=custom_loss_func) # creating log history of trainer, results don't matter train_loss = trainer.training_step(model, inputs, num_items_in_batch) predict_loss = trainer.prediction_step(model, inputs, prediction_loss_only=True)[0] torch.testing.assert_close(train_loss, predict_loss, atol=1e-6, rtol=0) def test_eval_use_gather_object(self): train_dataset = RegressionDataset() eval_dataset = RegressionDataset() model = RegressionDictModel() with tempfile.TemporaryDirectory() as tmp_dir: args = TrainingArguments(tmp_dir, eval_use_gather_object=True) trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) trainer.train() _ = trainer.evaluate() _ = trainer.predict(eval_dataset) # --------------------------------------------------------------------------- # Batch eval metrics tests # --------------------------------------------------------------------------- @require_torch class TrainerBatchEvalMetricsTest(TestCasePlus, TrainerIntegrationCommon): def setUp(self): super().setUp() args = TrainingArguments("..") self.n_epochs = args.num_train_epochs self.batch_size = args.train_batch_size def test_evaluate_with_batch_eval_metrics(self): with tempfile.TemporaryDirectory() as tmp_dir: trainer = get_regression_trainer( a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir ) results = trainer.evaluate() x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] pred = 1.5 * x + 2.5 expected_loss = ((pred - y) ** 2).mean() self.assertAlmostEqual(results["eval_loss"], expected_loss) expected_acc = AlmostAccuracy()((pred, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) # With a number of elements not a round multiple of the batch size trainer = get_regression_trainer( a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir, ) results = trainer.evaluate() x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] pred = 1.5 * x + 2.5 expected_loss = ((pred - y) ** 2).mean() self.assertAlmostEqual(results["eval_loss"], expected_loss) expected_acc = AlmostAccuracy()((pred, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) # With logits preprocess trainer = get_regression_trainer( a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, preprocess_logits_for_metrics=lambda logits, labels: logits + 1, output_dir=tmp_dir, ) results = trainer.evaluate() x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] pred = 1.5 * x + 2.5 expected_loss = ((pred - y) ** 2).mean() self.assertAlmostEqual(results["eval_loss"], expected_loss) expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"] self.assertAlmostEqual(results["eval_accuracy"], expected_acc) def test_predict_with_batch_eval_metrics(self): with tempfile.TemporaryDirectory() as tmp_dir: trainer = get_regression_trainer( a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir ) results = trainer.predict(trainer.eval_dataset) preds = results.predictions x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] gt = 1.5 * x + 2.5 self.assertTrue(np.allclose(preds, gt)) expected_acc = AlmostAccuracy()((preds, y))["accuracy"] self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) # With a number of elements not a round multiple of the batch size trainer = get_regression_trainer( a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir, ) results = trainer.predict(trainer.eval_dataset) preds = results.predictions x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) expected_acc = AlmostAccuracy()((preds, y))["accuracy"] self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc) # With more than one output of the model trainer = get_regression_trainer( a=1.5, b=2.5, double_output=True, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir, ) preds = trainer.predict(trainer.eval_dataset).predictions x = trainer.eval_dataset.x self.assertEqual(len(preds), 2) self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) # With more than one output/label of the model trainer = get_regression_trainer( a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir, ) outputs = trainer.predict(trainer.eval_dataset) preds = outputs.predictions labels = outputs.label_ids x = trainer.eval_dataset.x self.assertEqual(len(preds), 2) self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) # --------------------------------------------------------------------------- # FP16 / BF16 full eval memory tests # --------------------------------------------------------------------------- @require_torch class TrainerFullEvalMemoryTest(TestCasePlus): @require_torch_fp16 @require_torch_accelerator def test_fp16_full_eval(self): # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis. # it's using pretty large safety margins, but small enough to detect broken functionality. debug = 0 n_gpus = backend_device_count(torch_device) with tempfile.TemporaryDirectory() as tmp_dir: bs = 8 eval_len = 16 * n_gpus # make the params somewhat big so that there will be enough RAM consumed to be able to # measure things. We should get about 64KB for a+b in fp32 a = torch.ones(1000, bs) + 0.001 b = torch.ones(1000, bs) - 0.001 # 1. with fp16_full_eval disabled trainer = get_regression_trainer( a=a, b=b, eval_len=eval_len, skip_memory_metrics=False, output_dir=tmp_dir ) metrics = trainer.evaluate() del trainer gc.collect() fp32_init = metrics["init_mem_gpu_alloc_delta"] fp32_eval = metrics["eval_mem_gpu_alloc_delta"] if debug: print(f"fp32_init {fp32_init}") print(f"fp32_eval {fp32_eval}") # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram. # perfect world: fp32_init == 64<<10 self.assertGreater(fp32_init, 59_000) # after eval should be no extra memory allocated - with a small margin (other than the peak # memory consumption for the forward calculation that gets recovered) # perfect world: fp32_eval == close to zero self.assertLess(fp32_eval, 5_000) # 2. with fp16_full_eval enabled trainer = get_regression_trainer( a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False, output_dir=tmp_dir ) metrics = trainer.evaluate() fp16_init = metrics["init_mem_gpu_alloc_delta"] fp16_eval = metrics["eval_mem_gpu_alloc_delta"] if debug: print(f"fp16_init {fp16_init}") print(f"fp16_eval {fp16_eval}") # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0 # perfect world: fp16_init == close to zero self.assertLess(fp16_init, 5_000) # here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back) # perfect world: fp32_init == 32<<10 self.assertGreater(fp16_eval, 27_000) # 3. relative comparison fp32 vs full fp16 # should be about half of fp16_init # perfect world: fp32_init/2 == fp16_eval self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000) @require_torch_accelerator @require_torch_bf16 def test_bf16_full_eval(self): # note: most of the logic is the same as test_fp16_full_eval # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis. # it's using pretty large safety margins, but small enough to detect broken functionality. debug = 0 n_gpus = backend_device_count(torch_device) bs = 8 eval_len = 16 * n_gpus # make the params somewhat big so that there will be enough RAM consumed to be able to # measure things. We should get about 64KB for a+b in fp32 a = torch.ones(1000, bs) + 0.001 b = torch.ones(1000, bs) - 0.001 with tempfile.TemporaryDirectory() as tmp_dir: # 1. with bf16_full_eval disabled trainer = get_regression_trainer( a=a, b=b, eval_len=eval_len, skip_memory_metrics=False, output_dir=tmp_dir ) metrics = trainer.evaluate() del trainer gc.collect() fp32_init = metrics["init_mem_gpu_alloc_delta"] fp32_eval = metrics["eval_mem_gpu_alloc_delta"] if debug: print(f"fp32_init {fp32_init}") print(f"fp32_eval {fp32_eval}") # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram. # perfect world: fp32_init == 64<<10 self.assertGreater(fp32_init, 59_000) # after eval should be no extra memory allocated - with a small margin (other than the peak # memory consumption for the forward calculation that gets recovered) # perfect world: fp32_eval == close to zero self.assertLess(fp32_eval, 5_000) # 2. with bf16_full_eval enabled trainer = get_regression_trainer( a=a, b=b, eval_len=eval_len, bf16_full_eval=True, skip_memory_metrics=False, output_dir=tmp_dir ) metrics = trainer.evaluate() bf16_init = metrics["init_mem_gpu_alloc_delta"] bf16_eval = metrics["eval_mem_gpu_alloc_delta"] if debug: print(f"bf16_init {bf16_init}") print(f"bf16_eval {bf16_eval}") # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0 # perfect world: bf16_init == close to zero self.assertLess(bf16_init, 5_000) # here we put the model on device in eval and only `half()` of it, i.e. about 32K,(again we ignore the peak margin which gets returned back) # perfect world: fp32_init == 32<<10 self.assertGreater(bf16_eval, 27_000) # 3. relative comparison fp32 vs full bf16 # should be about half of bf16_init # perfect world: fp32_init/2 == bf16_eval self.assertAlmostEqual(bf16_eval, fp32_init / 2, delta=5_000) # --------------------------------------------------------------------------- # Slow external model eval tests # --------------------------------------------------------------------------- @require_torch class TrainerSlowEvalTest(TestCasePlus): @slow def test_trainer_eval_mrpc(self): MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID) data_args = GlueDataTrainingArguments( task_name="mrpc", data_dir=f"{get_tests_dir()}/fixtures/tests_samples/MRPC", overwrite_cache=True ) eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev") with tempfile.TemporaryDirectory() as tmp_dir: training_args = TrainingArguments(output_dir=tmp_dir, use_cpu=True) trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset) result = trainer.evaluate() self.assertLess(result["eval_loss"], 0.2) @slow def test_trainer_eval_multiple(self): MODEL_ID = "openai-community/gpt2" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForCausalLM.from_pretrained(MODEL_ID) dataset = get_dataset(PATH_SAMPLE_TEXT, tokenizer, 100) with tempfile.TemporaryDirectory() as tmp_dir: training_args = TrainingArguments( output_dir=tmp_dir, use_cpu=True, per_device_eval_batch_size=1, ) trainer = Trainer( model=model, args=training_args, eval_dataset={ "data1": dataset, "data2": dataset, }, ) result = trainer.evaluate() self.assertIn("eval_data1_loss", result) self.assertIn("eval_data2_loss", result) @slow def test_trainer_eval_lm(self): MODEL_ID = "distilbert/distilroberta-base" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) dataset = get_dataset(PATH_SAMPLE_TEXT, tokenizer, 100) self.assertEqual(len(dataset), 31)