Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
87 lines
3.1 KiB
Python
87 lines
3.1 KiB
Python
import json
|
|
import os
|
|
import subprocess
|
|
import unittest
|
|
from ast import literal_eval
|
|
|
|
import pytest
|
|
from parameterized import parameterized_class
|
|
|
|
from . import is_sagemaker_available
|
|
|
|
|
|
if is_sagemaker_available():
|
|
from sagemaker import Session, TrainingJobAnalytics
|
|
from sagemaker.huggingface import HuggingFace
|
|
|
|
|
|
@pytest.mark.skipif(
    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
    reason="Skipping test because should only be run when releasing minor transformers version",
)
@pytest.mark.usefixtures("sm_env")
@parameterized_class(
    [
        {
            "script": "run_glue.py",
            "model_name_or_path": "distilbert/distilbert-base-cased",
            "instance_type": "ml.g4dn.xlarge",
            "results": {"train_runtime": 650, "eval_accuracy": 0.6, "eval_loss": 0.9},
        },
    ]
)
class SingleNodeTest(unittest.TestCase):
    """Run the GLUE example as a single-instance SageMaker training job and check its KPIs.

    The test is skipped unless the ``TEST_SAGEMAKER`` environment variable
    evaluates to the literal ``True``. Each parameter set supplies ``script``,
    ``model_name_or_path``, ``instance_type`` and the ``results`` thresholds as
    class attributes. ``self.env`` is presumably injected by the ``sm_env``
    fixture -- confirm against the fixture definition.
    """

    def setUp(self):
        """Copy the GLUE example script into the directory used as the estimator's ``source_dir``."""
        # Check for the fixture-provided environment *before* touching it, so a
        # missing fixture produces a clear failure instead of an AttributeError.
        assert hasattr(self, "env"), "the 'env' attribute is missing; is the 'sm_env' fixture active?"

        # Pass argv as an explicit list: splitting a formatted command string on
        # whitespace would break as soon as test_path contains a space.
        subprocess.run(
            [
                "cp",
                "./examples/pytorch/text-classification/run_glue.py",
                f"{self.env.test_path}/run_glue.py",
            ],
            encoding="utf-8",
            check=True,
        )

    def create_estimator(self, instance_count=1):
        """Build the ``HuggingFace`` estimator for this parameter set.

        Args:
            instance_count: Number of training instances to request (defaults to 1).

        Returns:
            A configured, not-yet-fitted ``HuggingFace`` estimator.
        """
        # The model under test is layered on top of the shared hyperparameters.
        hyperparameters = {**self.env.hyperparameters, "model_name_or_path": self.model_name_or_path}

        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=f"{self.env.base_job_name}-single",
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters=hyperparameters,
            metric_definitions=self.env.metric_definitions,
            # NOTE(review): confirm "py36" still matches the image referenced by image_uri.
            py_version="py36",
        )

    def save_results_as_csv(self, job_name):
        """Export the metrics of training job ``job_name`` to ``<test_path>/<job_name>_metrics.csv``."""
        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")

    def test_glue(self):
        """Train on SageMaker, assert the KPI thresholds, and dump the measured values to JSON."""
        # create estimator
        estimator = self.create_estimator()

        # run training
        estimator.fit()
        job_name = estimator.latest_training_job.name

        # result dataframe
        result_metrics_df = TrainingJobAnalytics(job_name).dataframe()

        # extract kpis
        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
        # get train time from SageMaker job, this includes starting, preprocessing, stopping;
        # a missing value falls back to 999999 so the runtime assertion below fails.
        train_runtime = Session().describe_training_job(job_name).get("TrainingTimeInSeconds", 999999)

        # assert kpis
        # NOTE(review): all() over an empty list is True, so these two metric checks
        # pass vacuously if no eval metrics were recorded -- confirm that is acceptable.
        assert train_runtime <= self.results["train_runtime"]
        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
        assert all(t <= self.results["eval_loss"] for t in eval_loss)

        # dump tests result into json file to share in PR
        with open(f"{job_name}.json", "w", encoding="utf-8") as outfile:
            json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
|