# Copyright 2018 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Core Trainer integration tests: reproducibility, gradient accumulation, gradient checkpointing,
mixed precision, logging, NEFTune, memory metrics, and end-to-end training.
"""

import math
import os
import tempfile
from functools import partial

import datasets
import numpy as np
import pytest
import torch
from torch import nn

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    GPT2Config,
    GPT2LMHeadModel,
    IntervalStrategy,
    LlamaConfig,
    LlamaForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
    logging,
)
from transformers.integrations import activate_neftune
from transformers.loss.loss_utils import ForCausalLMLoss
from transformers.testing_utils import (
    CaptureLogger,
    LoggingLevel,
    TestCasePlus,
    backend_device_count,
    execute_subprocess_async,
    require_bitsandbytes,
    require_liger_kernel,
    require_non_hpu,
    require_peft,
    require_torch,
    require_torch_accelerator,
    require_torch_bf16,
    require_torch_fp16,
    require_torch_gpu,
    require_torch_multi_accelerator,
    require_torch_non_multi_accelerator,
    require_torch_tf32,
    run_first,
    slow,
    torch_device,
)

from .trainer_test_utils import (
    ATOL,
    PATH_SAMPLE_TEXT,
    RTOL,
    AlmostAccuracy,
    BasicTextGenerationModel,
    RegressionDataset,
    RegressionModel,
    RepeatDataset,
    StoreLossCallback,
    TrainerIntegrationCommon,
    get_dataset,
    get_regression_trainer,
)


# ---------------------------------------------------------------------------
# Mixed precision tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerMixedPrecisionTest(TestCasePlus, TrainerIntegrationCommon):
    """Tests for FP16, BF16, and TF32 mixed precision training."""

    def setUp(self):
        super().setUp()

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            trainer.train()
            self.default_trained_model = (trainer.model.a, trainer.model.b)

    def check_trained_model(self, model, **kwargs):
        (a, b) = self.default_trained_model
        torch.testing.assert_close(model.a, a, **kwargs)
        torch.testing.assert_close(model.b, b, **kwargs)

    @require_torch_fp16
    @require_torch_accelerator
    def test_mixed_fp16(self):
        # very basic test
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, fp16=True, logging_steps=1, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)
            log_0 = trainer.state.log_history[:-1][0]
            # check that the grads were properly clipped due to the grad scaler. Otherwise, we get huge values
            self.assertEqual(log_0["grad_norm"] < 100, True)

    @require_torch_bf16
    @require_torch_accelerator
    def test_mixed_bf16(self):
        # very basic test
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, bf16=True, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)

    @require_torch_gpu
    @require_torch_tf32
    def test_tf32(self):
        # very basic test
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, tf32=True, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model)


# ---------------------------------------------------------------------------
# DDP kwargs forwarding tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerDDPKwargsTest(TestCasePlus):
    """The `ddp_*` TrainingArguments fields must reach DistributedDataParallelKwargs."""

    def _get_ddp_kwargs(self, **training_args_overrides):
        """Build a Trainer, run _build_accelerator_args, return the DDP kwargs dict."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(output_dir=tmp_dir, max_steps=1, **training_args_overrides)
            trainer = Trainer(model=RegressionModel(), args=args, train_dataset=RegressionDataset())
            accelerator_args = trainer._build_accelerator_args()
            (handler,) = accelerator_args["kwargs_handlers"]
            return handler

    def test_ddp_static_graph_true_reaches_accelerator(self):
        """ddp_static_graph=True is forwarded as static_graph=True to DistributedDataParallelKwargs."""
        handler = self._get_ddp_kwargs(ddp_static_graph=True)
        self.assertTrue(handler.static_graph)

    def test_ddp_static_graph_false_reaches_accelerator(self):
        """ddp_static_graph=False is forwarded as static_graph=False."""
        handler = self._get_ddp_kwargs(ddp_static_graph=False)
        self.assertFalse(handler.static_graph)

    def test_ddp_static_graph_none_preserves_default(self):
        """ddp_static_graph=None (default) must NOT override DistributedDataParallelKwargs' own default (False).

        Regression guard: the conditional in _build_accelerator_args must keep static_graph out of ddp_kwargs
        when the flag is unset, otherwise clusters not configured for it would silently switch behavior.
        """
        handler = self._get_ddp_kwargs()  # ddp_static_graph unset
        # DistributedDataParallelKwargs default is False. If our conditional is broken and we always injected
        # the attribute, this would still be False only by coincidence. Cross-check with ddp_static_graph=True
        # (above) that the kwarg IS plumbed when set — together these tests pin both directions.
        self.assertFalse(handler.static_graph)


# ---------------------------------------------------------------------------
# Gradient accumulation tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerGradientAccumulationTest(TestCasePlus, TrainerIntegrationCommon):
    """Tests for gradient accumulation loss alignment and batch counting."""

    def test_gradient_accumulation_steps_not_leaked_to_accelerator(self):
        """The Trainer must not pass its gradient_accumulation_steps to the Accelerator. See #45305."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                output_dir=tmp_dir,
                per_device_train_batch_size=4,
                gradient_accumulation_steps=4,
                max_steps=1,
            )
            trainer = Trainer(model=RegressionModel(), args=args, train_dataset=RegressionDataset())
            self.assertEqual(trainer.accelerator.gradient_accumulation_steps, 1)

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls._ga_model_name = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        tokenizer = AutoTokenizer.from_pretrained(cls._ga_model_name)
        tokenizer.pad_token = tokenizer.eos_token
        dataset = datasets.load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1", split="train[:200]")
        # Filter empty samples to avoid nan losses with small batch sizes
        dataset = dataset.filter(lambda ex: len(ex["text"].strip()) > 0)
        cls._ga_dataset = dataset.map(
            lambda ex: tokenizer(ex["text"], max_length=16, padding="max_length", truncation=True), batched=True
        )
        cls._ga_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    def _check_gradient_accumulation(
        self,
        base_batch_size,
        gas_batch_size,
        gas_steps,
        loss_tolerance,
        model_accepts_loss_kwargs=True,
        compute_loss_func=None,
    ):
        """
        Train twice with the same effective batch (base_batch_size vs gas_batch_size * gas_steps)
        and assert grad norms and losses match.
        """
        model_name = self._ga_model_name
        args_kwargs = {"logging_steps": 1, "max_steps": 3, "learning_rate": 1e-4, "max_grad_norm": 0.0}
        trainer_kwargs = {"train_dataset": self._ga_dataset, "data_collator": self._ga_data_collator}
        if compute_loss_func is not None:
            trainer_kwargs["compute_loss_func"] = compute_loss_func

        with tempfile.TemporaryDirectory() as tmp_dir:
            model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32)
            args = TrainingArguments(
                tmp_dir, per_device_train_batch_size=base_batch_size, gradient_accumulation_steps=1, **args_kwargs
            )
            base_callback = StoreLossCallback()
            trainer = Trainer(model, args, callbacks=[base_callback], **trainer_kwargs)
            if not model_accepts_loss_kwargs:
                trainer.model_accepts_loss_kwargs = False
            trainer.train()
            base_grad_norms = [h["grad_norm"] for h in trainer.state.log_history if "grad_norm" in h]

            model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32)
            args = TrainingArguments(
                tmp_dir,
                per_device_train_batch_size=gas_batch_size,
                gradient_accumulation_steps=gas_steps,
                **args_kwargs,
            )
            gas_callback = StoreLossCallback()
            trainer = Trainer(model, args, callbacks=[gas_callback], **trainer_kwargs)
            if not model_accepts_loss_kwargs:
                trainer.model_accepts_loss_kwargs = False
            trainer.train()
            gas_grad_norms = [h["grad_norm"] for h in trainer.state.log_history if "grad_norm" in h]

        for step, (base_gn, gas_gn) in enumerate(zip(base_grad_norms, gas_grad_norms)):
            ratio = gas_gn / base_gn if base_gn > 0 else float("inf")
            self.assertAlmostEqual(
                ratio, 1.0, delta=0.1, msg=f"Step {step}: grad_norm ratio {ratio:.2f} — GAS leak suspected"
            )
        loss_diff = [abs(b - g) for b, g in zip(base_callback.losses, gas_callback.losses)]
        self.assertLess(max(loss_diff), loss_tolerance, f"Loss difference {max(loss_diff)} exceeds {loss_tolerance}")

    def test_gradient_accumulation_grad_norm_with_num_items_in_batch(self):
        """
        With model_accepts_loss_kwargs=True the model handles loss averaging via
        num_items_in_batch. Grad norms and losses must match between a large-batch
        baseline and an equivalent GAS run.
        """
        # Tight tolerance: num_items_in_batch properly averages loss across micro-batches
        self._check_gradient_accumulation(base_batch_size=8, gas_batch_size=1, gas_steps=8, loss_tolerance=0.001)
        self._check_gradient_accumulation(base_batch_size=8, gas_batch_size=4, gas_steps=2, loss_tolerance=0.001)

    def test_gradient_accumulation_grad_norm_without_num_items_in_batch(self):
        """
        With model_accepts_loss_kwargs=False the Trainer scales loss by GAS
        itself. Grad norms and losses must still match between a large-batch
        baseline and an equivalent GAS run.
        """
        # Looser tolerance: without num_items_in_batch each micro-batch is independently
        # mean-reduced, so losses won't match as tightly.
        self._check_gradient_accumulation(
            base_batch_size=8,
            gas_batch_size=4,
            gas_steps=2,
            loss_tolerance=0.1,
            model_accepts_loss_kwargs=False,
        )

    def test_gradient_accumulation_grad_norm_with_compute_loss_func(self):
        """
        With a custom compute_loss_func that uses num_items_in_batch, grad norms
        and losses must match between a large-batch baseline and an equivalent GAS run.
        """
        vocab_size = AutoModelForCausalLM.from_pretrained(self._ga_model_name, dtype=torch.float32).config.vocab_size

        def compute_loss(logits, labels, vocab_size, num_items_in_batch):
            return ForCausalLMLoss(logits["logits"], labels, vocab_size, num_items_in_batch)

        # Tight tolerance: compute_loss_func uses num_items_in_batch to properly average loss
        self._check_gradient_accumulation(
            base_batch_size=8,
            gas_batch_size=1,
            gas_steps=8,
            loss_tolerance=0.001,
            compute_loss_func=partial(compute_loss, vocab_size=vocab_size),
        )

    def test_num_items_in_batch_causal_lm(self):
        """
        For a causal LM, `_get_num_items_in_batch` must count over `labels[..., 1:]` because
        ForCausalLMLoss shifts labels (position 0 is never a prediction target). When the
        batch already exposes `shift_labels`, that tensor must be used as-is.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            model = AutoModelForCausalLM.from_pretrained(self._ga_model_name, dtype=torch.float32)
            trainer = Trainer(
                model=model,
                args=TrainingArguments(output_dir=tmp_dir, per_device_train_batch_size=2),
                train_dataset=self._ga_dataset,
                data_collator=self._ga_data_collator,
            )
            self.assertTrue(trainer._loss_shifts_labels)

            # batch[0]: 5 valid label positions, 3 padding (-100) → 5 - 1 = 4 after the shift.
            # batch[1]: 8 valid label positions, 0 padding         → 8 - 1 = 7 after the shift.
            # Trainer must not count position 0 of each row → expected total = 4 + 7 = 11.
            batch_samples = [
                {"labels": torch.tensor([[1, 2, 3, 4, 5, -100, -100, -100]])},
                {"labels": torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8]])},
            ]
            num_items = trainer._get_num_items_in_batch(batch_samples, torch.device("cpu"))
            self.assertEqual(int(num_items), 11)

            # If the collator already pre-shifts labels (`shift_labels` present), use it as-is and
            # do NOT slice again. Each row here has 4 valid positions → expected total = 8.
            batch_samples = [
                {
                    "labels": torch.tensor([[1, 2, 3, 4, 5]]),
                    "shift_labels": torch.tensor([[2, 3, 4, 5, -100]]),
                },
                {
                    "labels": torch.tensor([[1, 2, 3, 4, 5]]),
                    "shift_labels": torch.tensor([[2, 3, 4, 5, -100]]),
                },
            ]
            num_items = trainer._get_num_items_in_batch(batch_samples, torch.device("cpu"))
            self.assertEqual(int(num_items), 8)

    def test_num_items_in_batch_non_causal_lm(self):
        """For non-causal-LM losses, `_get_num_items_in_batch` must count the full label tensor."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            config = LlamaConfig(
                vocab_size=64, hidden_size=16, intermediate_size=32, num_hidden_layers=2, num_attention_heads=2
            )
            # ForTokenClassification → LOSS_MAPPING entry is not ForCausalLMLoss → no shift.
            from transformers import LlamaForTokenClassification

            model = LlamaForTokenClassification(config)
            trainer = Trainer(
                model=model,
                args=TrainingArguments(output_dir=tmp_dir, per_device_train_batch_size=2),
            )
            self.assertFalse(trainer._loss_shifts_labels)

            # 5 valid + 8 valid = 13 (no shift).
            batch_samples = [
                {"labels": torch.tensor([[1, 2, 3, 4, 5, -100, -100, -100]])},
                {"labels": torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8]])},
            ]
            num_items = trainer._get_num_items_in_batch(batch_samples, torch.device("cpu"))
            self.assertEqual(int(num_items), 13)

    @require_torch_multi_accelerator
    def test_num_batches_in_training_with_gradient_accumulation(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            for num_train_epochs in [1, 2]:
                for train_len in [123, 120]:
                    trainer = get_regression_trainer(
                        train_len=train_len,
                        per_device_train_batch_size=4,
                        gradient_accumulation_steps=5,
                        num_train_epochs=num_train_epochs,
                        output_dir=tmp_dir,
                    )

                    total_batch_samples = []

                    def wrap_get_batch_samples(fn):
                        def wrapped_fn(epoch_iterator, num_batches, device):
                            self.assertGreater(num_batches, 0)
                            batch_samples, num_items_in_batch = fn(epoch_iterator, num_batches, device)
                            self.assertEqual(len(batch_samples), num_batches)
                            total_batch_samples.append(num_batches)
                            return batch_samples, num_items_in_batch

                        return wrapped_fn

                    trainer.get_batch_samples = wrap_get_batch_samples(trainer.get_batch_samples)

                    trainer.train()

                    self.assertEqual(len(trainer.get_train_dataloader()) * num_train_epochs, sum(total_batch_samples))


# ---------------------------------------------------------------------------
# Gradient checkpointing tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerGradientCheckpointingTest(TestCasePlus):
    """Tests for gradient checkpointing during training."""

    def test_gradient_checkpointing(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                per_device_train_batch_size=1,
                learning_rate=0.1,
                gradient_checkpointing=True,
                output_dir=tmp_dir,
            )
            previous_params = {k: v.detach().clone() for k, v in trainer.model.named_parameters()}

            trainer.train()

            # Check if model weights have been updated
            for k, v in trainer.model.named_parameters():
                self.assertFalse(
                    torch.allclose(previous_params[k], v, rtol=1e-4, atol=1e-4),
                    f"Model weights for {k} have not been updated",
                )


# ---------------------------------------------------------------------------
# NEFTune tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerNEFTuneTest(TestCasePlus):
    """Tests for NEFTune noise injection during training."""

    def test_neftune(self):
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        # Trainer without inf/nan filter
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            neftune_noise_alpha=0.4,
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

        activate_neftune(trainer.model, trainer.args.neftune_noise_alpha)

        dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)

        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)

        self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!")

        # redefine the model
        tiny_gpt2 = GPT2LMHeadModel(config)
        # Trainer without inf/nan filter
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            neftune_noise_alpha=0.4,
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

        # Check that it trains without errors
        trainer.train()

        # Make sure forward pass works fine
        _ = trainer.model(dummy_input)
        self.assertTrue(len(trainer.model.get_input_embeddings()._forward_hooks) == 0)

        trainer.model.eval()

        # Check that we get identical embeddings just in case
        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)
        torch.testing.assert_close(emb1, emb2)


# ---------------------------------------------------------------------------
# Logging tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerLoggingTest(TestCasePlus):
    """Tests for logging behavior: inf/nan filter and log levels."""

    def test_logging_inf_nan_filter(self):
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        # Trainer without inf/nan filter
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e9,
            logging_steps=5,
            logging_nan_inf_filter=False,
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
        trainer.train()
        log_history_no_filter = trainer.state.log_history

        # Trainer with inf/nan filter
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e9,
            logging_steps=5,
            logging_nan_inf_filter=True,
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
        trainer.train()
        log_history_filter = trainer.state.log_history

        def is_any_loss_nan_or_inf(log_history):
            losses = [l["loss"] for l in log_history[:-1]]
            return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses)

        self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter))
        self.assertFalse(is_any_loss_nan_or_inf(log_history_filter))

    def test_log_level(self):
        # testing only --log_level (--log_level_replica requires multiple gpus and DDP and is tested elsewhere)
        logger = logging.get_logger()
        log_info_string = "Running training"

        # test with the default log_level - should be the same as before and thus we test depending on is_info
        is_info = logging.get_verbosity() <= 20

        with tempfile.TemporaryDirectory() as tmp_dir:
            with CaptureLogger(logger) as cl:
                trainer = get_regression_trainer(output_dir=tmp_dir)
                trainer.train()
            if is_info:
                self.assertIn(log_info_string, cl.out)
            else:
                self.assertNotIn(log_info_string, cl.out)

            with LoggingLevel(logging.INFO):
                # test with low log_level - lower than info
                with CaptureLogger(logger) as cl:
                    trainer = get_regression_trainer(log_level="debug", output_dir=tmp_dir)
                    trainer.train()
                self.assertIn(log_info_string, cl.out)

            with LoggingLevel(logging.INFO):
                # test with high log_level - should be quiet
                with CaptureLogger(logger) as cl:
                    trainer = get_regression_trainer(log_level="error", output_dir=tmp_dir)
                    trainer.train()
                self.assertNotIn(log_info_string, cl.out)


# ---------------------------------------------------------------------------
# Metrics tests (FLOS, memory, input tokens)
# ---------------------------------------------------------------------------


@require_torch
class TrainerMetricsTest(TestCasePlus):
    """Tests for FLOS extraction, memory metrics, and input token counting."""

    def test_flos_extraction(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)

            def assert_flos_extraction(trainer, wrapped_model_to_check):
                self.assertEqual(trainer.model, trainer.accelerator.unwrap_model(wrapped_model_to_check))
                self.assertGreaterEqual(
                    getattr(trainer.accelerator.unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0
                )

            # with plain model
            assert_flos_extraction(trainer, trainer.model)

            # with enforced DataParallel
            assert_flos_extraction(trainer, nn.DataParallel(trainer.model))

            trainer.train()
            self.assertTrue(isinstance(trainer.state.total_flos, float))

    def check_mem_metrics(self, trainer, check_func):
        metrics = trainer.train().metrics
        check_func("init_mem_cpu_alloc_delta", metrics)
        check_func("train_mem_cpu_alloc_delta", metrics)
        if backend_device_count(torch_device) > 0:
            check_func("init_mem_gpu_alloc_delta", metrics)
            check_func("train_mem_gpu_alloc_delta", metrics)

        metrics = trainer.evaluate()
        check_func("eval_mem_cpu_alloc_delta", metrics)
        if backend_device_count(torch_device) > 0:
            check_func("eval_mem_gpu_alloc_delta", metrics)

        metrics = trainer.predict(RegressionDataset()).metrics
        check_func("test_mem_cpu_alloc_delta", metrics)
        if backend_device_count(torch_device) > 0:
            check_func("test_mem_gpu_alloc_delta", metrics)

    def test_mem_metrics(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            # with mem metrics enabled
            trainer = get_regression_trainer(skip_memory_metrics=False, output_dir=tmp_dir)
            self.check_mem_metrics(trainer, self.assertIn)

            # with mem metrics disabled
            trainer = get_regression_trainer(skip_memory_metrics=True, output_dir=tmp_dir)
            self.check_mem_metrics(trainer, self.assertNotIn)

    def test_include_num_input_tokens_seen(self):
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        tokenizer.pad_token = "[PAD]"
        model.config.pad_token_id = tokenizer.pad_token_id

        sentences = ["This is a short sentence.", "This is a much longer sentence that will require padding."]
        labels = torch.tensor([0, 1])

        # 1. Test with attention_mask
        tokenized_dataset_with_mask = tokenizer(sentences, truncation=True, padding="longest", return_tensors="pt")
        tokenized_dataset_with_mask["labels"] = labels
        dataset_with_mask = datasets.Dataset.from_dict(tokenized_dataset_with_mask)

        # 2. Test without attention_mask
        tokenized_dataset_no_mask = {k: v for k, v in tokenized_dataset_with_mask.items() if k != "attention_mask"}
        dataset_no_mask = datasets.Dataset.from_dict(tokenized_dataset_no_mask)

        # 3. Test with no padding information
        tokenizer_no_pad = AutoTokenizer.from_pretrained("bert-base-cased")
        tokenizer_no_pad.pad_token = None

        data_collator = default_data_collator

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Test case 1: "non_padding" with attention_mask
            args = TrainingArguments(
                output_dir=tmp_dir,
                include_num_input_tokens_seen="non_padding",
                per_device_train_batch_size=2,
                max_steps=1,
            )
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=dataset_with_mask,
                data_collator=data_collator,
                processing_class=tokenizer,
            )
            trainer.train()
            attention_mask = tokenized_dataset_with_mask["attention_mask"]
            non_padded_tokens_with_mask = attention_mask.sum().item()
            self.assertEqual(trainer.state.num_input_tokens_seen, non_padded_tokens_with_mask)

            # Test case 2: "non_padding" without attention_mask (fallback to pad_token_id)
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=dataset_no_mask,
                data_collator=data_collator,
                processing_class=tokenizer,
            )
            trainer.train()
            input_ids = tokenized_dataset_with_mask["input_ids"]  # use original to compute expected
            non_padded_tokens_no_mask = (input_ids != tokenizer.pad_token_id).sum().item()
            self.assertEqual(trainer.state.num_input_tokens_seen, non_padded_tokens_no_mask)

            # Test case 3: "non_padding" with no padding info (fallback to numel)
            with self.assertLogs("transformers.trainer", level="WARNING") as cm:
                trainer = Trainer(
                    model=model,
                    args=args,
                    train_dataset=dataset_no_mask,  # still has input_ids
                    data_collator=data_collator,
                    processing_class=tokenizer_no_pad,  # tokenizer without pad token
                )
                trainer.train()
                self.assertTrue(
                    any("Could not determine method to count non-padding tokens" in log for log in cm.output)
                )
            total_tokens = input_ids.numel()
            self.assertEqual(trainer.state.num_input_tokens_seen, total_tokens)

            # Test case 4: "all"
            args.include_num_input_tokens_seen = "all"
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=dataset_with_mask,
                data_collator=data_collator,
                processing_class=tokenizer,
            )
            trainer.train()
            self.assertEqual(trainer.state.num_input_tokens_seen, total_tokens)

            # Test case 5: True (backward compatibility)
            args.include_num_input_tokens_seen = True
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=dataset_with_mask,
                data_collator=data_collator,
                processing_class=tokenizer,
            )
            trainer.train()
            self.assertEqual(trainer.state.num_input_tokens_seen, total_tokens)

    def test_get_num_trainable_parameters(self):
        model = nn.Sequential(nn.Linear(128, 64), nn.Linear(64, 32))
        # in_features * out_features + bias
        layer_1 = 128 * 64 + 64
        layer_2 = 64 * 32 + 32
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir))
            self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2)
            # Freeze the last layer
            for param in model[-1].parameters():
                param.requires_grad = False
            self.assertEqual(trainer.get_num_trainable_parameters(), layer_1)


# ---------------------------------------------------------------------------
# Step counting and training loss tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerStepCountingTest(TestCasePlus):
    """Tests for training loss computation, step counting, and epoch handling."""

    def setUp(self):
        super().setUp()
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

    def test_training_loss(self):
        n_gpus = max(1, backend_device_count(torch_device))

        # With even logs
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(logging_steps=64 / (8 * n_gpus), output_dir=tmp_dir)
            trainer.train()
            log_history = trainer.state.log_history

            losses = [log["loss"] for log in log_history if "loss" in log]
            train_loss = log_history[-1]["train_loss"]
            self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4)

        # With uneven logs
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(logging_steps=5, output_dir=tmp_dir)
            trainer.train()
            log_history = trainer.state.log_history

            # Training loss should be the same as before
            new_train_loss = log_history[-1]["train_loss"]
            self.assertAlmostEqual(train_loss, new_train_loss, places=4)

    def test_number_of_steps_in_training(self):
        # Regular training has n_epochs * len(train_dl) steps
        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)

        # Check passing num_train_epochs works (and a float version too):
        trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5, output_dir=tmp_dir)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size))

        # If we pass a max_steps, num_train_epochs is ignored
        trainer = get_regression_trainer(learning_rate=0.1, max_steps=10, output_dir=tmp_dir)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, 10)

    def test_num_train_epochs_in_training(self):
        # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when ``max_steps`` is given.
        # It should give 1 update step for each epoch.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                max_steps=3,
                train_len=64,
                per_device_train_batch_size=16,
                gradient_accumulation_steps=5,
                output_dir=tmp_dir,
            )
            train_output = trainer.train()
            self.assertEqual(train_output.global_step, 3)

            # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if
            # len(train_dl) < gradient_accumulation_steps.
            trainer = get_regression_trainer(
                train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5, output_dir=tmp_dir
            )
            train_output = trainer.train()
            self.assertEqual(train_output.global_step, int(self.n_epochs))


# ---------------------------------------------------------------------------
# Reproducibility tests (pre-run training to check determinism across configs)
# ---------------------------------------------------------------------------


@require_torch
class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
    """
    Only tests that want to tap into the auto-pre-run 2 trainings:
    - self.default_trained_model
    - self.alternate_trained_model
    directly, or via check_trained_model
    """

    def setUp(self):
        super().setUp()
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            trainer.train()
            self.default_trained_model = (trainer.model.a, trainer.model.b)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir)
            trainer.train()
            self.alternate_trained_model = (trainer.model.a, trainer.model.b)

    def check_trained_model(self, model, alternate_seed=False, **kwargs):
        (a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model
        torch.testing.assert_close(model.a, a, **kwargs)
        torch.testing.assert_close(model.b, b, **kwargs)

    def test_reproducible_training(self):
        # Checks that training worked, model trained and seed made a reproducible training.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model)

        # Checks that a different seed gets different (reproducible) results.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model, alternate_seed=True)

    def test_trainer_with_datasets(self):
        np.random.seed(42)
        x = np.random.normal(size=(64,)).astype(np.float32)
        y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,)).astype(np.float32)
        train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})

        # Base training. Should have the same results as test_reproducible_training
        model = RegressionModel()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, learning_rate=0.1)
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

            # Can return tensors.
            train_dataset.set_format(type="torch", dtype=torch.float32)
            model = RegressionModel()
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

            # Adding one column not used by the model should have no impact
            z = np.random.normal(size=(64,)).astype(np.float32)
            train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
            model = RegressionModel()
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

    def test_model_init(self):
        train_dataset = RegressionDataset()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, learning_rate=0.1)
            trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel())
            trainer.train()
            self.check_trained_model(trainer.model)

            # Re-training should restart from scratch, thus lead the same results.
            trainer.train()
            self.check_trained_model(trainer.model)

            # Re-training should restart from scratch, thus lead the same results and new seed should be used.
            trainer.args.seed = 314
            trainer.train()
            self.check_trained_model(trainer.model, alternate_seed=True)


# ---------------------------------------------------------------------------
# Torch compile tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerTorchCompileTest(TestCasePlus):
    @pytest.mark.torch_compile_test
    def test_torch_compile_loss_func_compatibility(self):
        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)

        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            per_device_train_batch_size=2,
            torch_compile=True,
            max_steps=1,  # compile happens on the first step
        )
        trainer = Trainer(model=tiny_llama, args=args, train_dataset=train_dataset)  # noqa
        trainer.train()

    @require_peft
    @require_bitsandbytes
    @pytest.mark.torch_compile_test
    def test_bnb_compile(self):
        from peft import LoraConfig, get_peft_model

        # Simply tests if initializing a Trainer with a PEFT + compiled model works out of the box
        # QLoRA + torch compile is not really supported yet, but we should at least support the model
        # loading and let torch throw the
        tiny_model = AutoModelForCausalLM.from_pretrained(
            "hf-internal-testing/tiny-random-LlamaForCausalLM",
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        )

        peft_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        tiny_model = get_peft_model(tiny_model, peft_config)

        tiny_model = torch.compile(tiny_model)

        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
        )
        with self.assertRaises(ValueError):
            _ = Trainer(tiny_model, args, train_dataset=train_dataset)  # noqa

    @require_torch_accelerator
    @pytest.mark.torch_compile_test
    def test_torch_compile_train(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(output_dir=tmp_dir)
            metrics = trainer.train()
            original_train_loss = metrics.training_loss

            trainer = get_regression_trainer(torch_compile=True, output_dir=tmp_dir)
            metrics = trainer.train()
            self.assertAlmostEqual(metrics.training_loss, original_train_loss)

    @require_torch_accelerator
    @pytest.mark.torch_compile_test
    def test_torch_compile_eval(self):
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(output_dir=tmp_dir)
            metrics = trainer.evaluate()
            original_eval_loss = metrics["eval_loss"]

            trainer = get_regression_trainer(torch_compile=True, output_dir=tmp_dir)
            metrics = trainer.evaluate()

            self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss, delta=1e-6)


# ---------------------------------------------------------------------------
# Early stopping tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerEarlyStoppingTest(TestCasePlus):
    def test_early_stopping_callback(self):
        # early stopping stops training before num_training_epochs
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                num_train_epochs=20,
                gradient_accumulation_steps=1,
                per_device_train_batch_size=16,
                load_best_model_at_end=True,
                eval_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                compute_metrics=AlmostAccuracy(),
                metric_for_best_model="accuracy",
            )
            trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
            train_output = trainer.train()
            self.assertLess(train_output.global_step, 20 * 64 / 16)

        # Invalid inputs to trainer with early stopping callback result in assertion error
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                num_train_epochs=20,
                gradient_accumulation_steps=1,
                per_device_train_batch_size=16,
                eval_strategy=IntervalStrategy.EPOCH,
                compute_metrics=AlmostAccuracy(),
                metric_for_best_model="accuracy",
            )
            trainer.add_callback(EarlyStoppingCallback(1))
            self.assertEqual(trainer.state.global_step, 0)
            try:
                trainer.train()
            except AssertionError:
                self.assertEqual(trainer.state.global_step, 0)

        # even if load_best_model_at_end is False, `best_model_checkpoint` should be set
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                num_train_epochs=20,
                gradient_accumulation_steps=1,
                per_device_train_batch_size=16,
                load_best_model_at_end=False,
                eval_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                compute_metrics=AlmostAccuracy(),
                metric_for_best_model="accuracy",
            )
            trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
            train_output = trainer.train()
            self.assertIsNotNone(trainer.state.best_model_checkpoint)


# ---------------------------------------------------------------------------
# Liger kernel tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerLigerKernelTest(TestCasePlus):
    @require_liger_kernel
    def test_use_liger_kernel_patching(self):
        import importlib

        from liger_kernel.transformers import liger_rotary_pos_emb

        from transformers.integrations.liger import apply_liger_kernel
        from transformers.models.llama import modeling_llama

        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)

        # Spot check that modeling code and model instance variables are not yet patched
        self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
        self.assertFalse("LigerRMSNorm" in tiny_llama.model.norm.__repr__())

        apply_liger_kernel(tiny_llama, {})

        # Spot check that modeling code and model instance variables are patched
        self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
        self.assertTrue("LigerRMSNorm" in tiny_llama.model.norm.__repr__())

        # Restore the original module to avoid leaking monkey patches to other tests
        importlib.reload(modeling_llama)

    @require_liger_kernel
    def test_use_liger_kernel_custom_config_patching(self):
        import importlib

        from liger_kernel.transformers import LigerRMSNorm

        from transformers.integrations.liger import apply_liger_kernel
        from transformers.models.llama import modeling_llama

        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)

        apply_liger_kernel(tiny_llama, {"rms_norm": False})

        # Check that the RMSNorm kernel is not applied as specified in the config
        self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))

        # Restore the original module to avoid leaking monkey patches to other tests
        importlib.reload(modeling_llama)

    @require_liger_kernel
    @require_torch_accelerator
    @require_torch_non_multi_accelerator  # Don't work with DP
    def test_use_liger_kernel_trainer(self):
        import importlib

        from transformers.models.llama import modeling_llama

        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-2,
            logging_steps=5,
            max_steps=20,
            use_liger_kernel=True,
        )
        Trainer(tiny_llama, args, train_dataset=train_dataset).train()

        # Restore the original module to avoid leaking monkey patches to other tests
        importlib.reload(modeling_llama)

    @require_liger_kernel
    @require_torch_accelerator
    @require_torch_non_multi_accelerator  # don't work with DP
    def test_use_liger_kernel_custom_config_trainer(self):
        import importlib

        from transformers.models.llama import modeling_llama

        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-2,
            logging_steps=5,
            max_steps=20,
            use_liger_kernel=True,
            liger_kernel_config={"rms_norm": False, "cross_entropy": True, "fused_linear_cross_entropy": False},
        )
        Trainer(tiny_llama, args, train_dataset=train_dataset).train()

        # Restore the original module to avoid leaking monkey patches to other tests
        importlib.reload(modeling_llama)


# ---------------------------------------------------------------------------
# Miscellaneous integration tests
# ---------------------------------------------------------------------------


@require_torch
class TrainerIntegrationTest(TestCasePlus):
    """Integration tests: compatibility, and e2e."""

    @slow
    @run_first
    @require_non_hpu
    @require_torch_multi_accelerator
    def test_end_to_end_example(self):
        # Tests that `translation.py` will run without issues
        script_path = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "..", "..", "examples", "pytorch", "translation", "run_translation.py"
            )
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            command = [
                "accelerate",
                "launch",
                script_path,
                "--model_name_or_path",
                "google-t5/t5-small",
                "--per_device_train_batch_size",
                "1",
                "--output_dir",
                tmpdir,
                "--do_train",
                "--max_train_samples",
                "64",
                "--num_train_epochs",
                "1",
                "--dataset_name",
                "wmt16",
                "--dataset_config",
                "ro-en",
                "--source_lang",
                "en",
                "--target_lang",
                "ro",
                "--do_predict",
                "--max_predict_samples",
                "64",
                "--predict_with_generate",
                "--ddp_timeout",
                "60",
            ]
            execute_subprocess_async(command)
            # successful return here == success - any errors would have caused an error or a timeout in the sub-call

    def test_special_token_alignment(self):
        """
        Tests that special token changes in the tokenizer result in model configs updates when using the trainer, to
        ensure special tokens are aligned across configs
        """

        model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")

        # add new special tokens to tokenizer, so we can test that trainer aligns the model configs with the tokenizer
        tokenizer.eos_token = "<|im_end|>"
        tokenizer.pad_token = "<|im_end|>"
        tokenizer.bos_token = "<|im_start|>"
        tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_end|>", "<|im_start|>"]})

        # the model needs to have its embedding layer resized accordingly
        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)

        # create a random dataset from the **new** vocab size
        x = torch.randint(0, len(tokenizer), (64,))
        dataset = RepeatDataset(x, length=2)

        with tempfile.TemporaryDirectory() as tmpdir:
            training_args = TrainingArguments(output_dir=tmpdir, max_steps=1, per_device_train_batch_size=1)
            trainer = Trainer(
                model=model,
                args=training_args,
                processing_class=tokenizer,
                train_dataset=dataset,
            )

            # We haven't started training -> not yet aligned
            self.assertNotEqual(trainer.model.config.eos_token_id, tokenizer.eos_token_id)
            self.assertNotEqual(trainer.model.config.pad_token_id, tokenizer.pad_token_id)
            self.assertNotEqual(trainer.model.config.bos_token_id, tokenizer.bos_token_id)

            trainer.train()

            # Must be aligned as soon as we start training
            self.assertEqual(trainer.model.config.eos_token_id, tokenizer.eos_token_id)
            self.assertEqual(trainer.model.config.pad_token_id, tokenizer.pad_token_id)
            self.assertEqual(trainer.model.config.bos_token_id, tokenizer.bos_token_id)

    def test_trainer_works_without_model_config(self):
        """
        Tests that models without a `config` parameter can still be trained.
        This is useful for preserving compatibility with third parties that train different models using the
        transformers Trainer.

        If this test fails, it doesn't imply that there's issues with transformers, but perhaps with third
        parties.
        """

        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
        model = BasicTextGenerationModel(vocab_size=tokenizer.vocab_size, hidden_size=32)

        train_dataset = get_dataset(PATH_SAMPLE_TEXT, tokenizer, 100)

        with tempfile.TemporaryDirectory() as tmpdir:
            training_args = TrainingArguments(
                output_dir=tmpdir, max_steps=5, per_device_train_batch_size=1, use_cpu=True
            )
            trainer = Trainer(
                model=model,
                args=training_args,
                processing_class=tokenizer,
                train_dataset=train_dataset,
            )
            trainer.train()

    def test_training_arguments_are_left_untouched(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=tmp_dir)
        trainer.train()
        args = TrainingArguments(tmp_dir)
        dict1, dict2 = args.to_dict(), trainer.args.to_dict()
        for key in dict1:
            self.assertEqual(dict1[key], dict2[key])

    def test_double_train_wrap_once(self):
        # test that we don't wrap the model more than once
        # since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for
        # example DataParallel(DataParallel(model))

        trainer = get_regression_trainer(output_dir=self.get_auto_remove_tmp_dir())
        trainer.train()
        model_wrapped_before = trainer.model_wrapped
        trainer.train()
        model_wrapped_after = trainer.model_wrapped
        self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice")