# transformers/.github/workflows/self-scheduled-intel-gaudi.yml

# Reusable workflow (`workflow_call`) that runs the single CI job selected by
# the `job` input on Intel Gaudi runners and then calls the Slack report workflow.
name: Self-hosted runner (scheduled-intel-gaudi)

on:
  # This workflow is only ever started by another workflow; all inputs are mandatory.
  workflow_call:
    inputs:
      # Name of the job to run; each job below compares it in its own `if`.
      job:
        type: string
        required: true
      # Forwarded unchanged to the Slack report workflow.
      slack_report_channel:
        type: string
        required: true
      # Prefix of the runner group; `-<machine_type>` is appended per matrix entry.
      runner_scale_set:
        type: string
        required: true
      # Forwarded unchanged to the Slack report workflow.
      ci_event:
        type: string
        required: true
      # Forwarded unchanged to the Slack report workflow.
      report_repo_id:
        type: string
        required: true

# Environment shared by every job in this workflow. Environment variables are
# always strings, so boolean-looking and numeric values are quoted to keep the
# YAML type explicit instead of relying on the loader's implicit typing.
env:
  # Number of test slices; read by the `setup` job when building the matrix.
  NUM_SLICES: "2"
  RUN_SLOW: "yes"
  PT_HPU_LAZY_MODE: "0"
  TRANSFORMERS_IS_CI: "yes"
  PT_ENABLE_INT64_SUPPORT: "1"
  HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  # Same path that the job containers mount from the host as a volume.
  HF_HOME: /mnt/cache/.cache/huggingface

# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Builds the test matrix (slice ids plus the folders belonging to each slice)
  # that the sliced jobs consume through `needs.setup.outputs`. The job only
  # runs for the two job kinds that are split into slices.
  setup:
    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
    name: Setup
    runs-on: ubuntu-latest
    outputs:
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      # This output is written by the `set-matrix-quantization` step, so it
      # must be read from that step id rather than from `set-matrix`.
      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
        with:
          fetch-depth: 0
          persist-credentials: false

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.10"

      - id: set-matrix
        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
        name: Identify models to test
        working-directory: tests
        env:
          # Passed through the environment so the input is never spliced into the script text.
          JOB: ${{ inputs.job }}
        # The Python one-liner is single-quoted inside `$(...)`, where backslashes
        # are literal; the dictionary key therefore uses plain double quotes.
        run: |
          if [ "$JOB" = "run_models_gpu" ]; then
            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits "$NUM_SLICES")" >> "$GITHUB_OUTPUT"
            echo "slice_ids=$(python3 -c 'import os; print(list(range(int(os.environ["NUM_SLICES"]))))')" >> "$GITHUB_OUTPUT"
          elif [ "$JOB" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> "$GITHUB_OUTPUT"
            echo "slice_ids=[0, 1]" >> "$GITHUB_OUTPUT"
          fi

      # NOTE(review): the job-level `if` above does not list
      # `run_quantization_torch_gpu`, so this step is skipped whenever the job
      # runs and `quantization_matrix` stays empty. Confirm whether that is intended.
      - id: set-matrix-quantization
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
        name: Identify quantization method to test
        working-directory: tests
        run: |
          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ;  print(d)')" >> "$GITHUB_OUTPUT"

  # Fans out over every (machine type, slice id) pair and delegates each
  # combination to the reusable Gaudi model-jobs workflow.
  run_models_gpu:
    name: " "
    needs: setup
    if: ${{ inputs.job == 'run_models_gpu' }}
    strategy:
      fail-fast: false
      matrix:
        machine_type:
          - 1gaudi
          - 2gaudi
        # Slice ids are produced by the `setup` job as a JSON list.
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
    secrets: inherit
    with:
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      # Runner name is the scale-set prefix followed by the machine type.
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}

  # Same fan-out as the model job, but over the trainer/fsdp slices emitted by
  # `setup`, with a dedicated report name prefix.
  run_trainer_and_fsdp_gpu:
    name: " "
    needs: setup
    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
    strategy:
      fail-fast: false
      matrix:
        machine_type:
          - 1gaudi
          - 2gaudi
        # Slice ids are produced by the `setup` job as a JSON list.
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
    secrets: inherit
    with:
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      # Runner name is the scale-set prefix followed by the machine type.
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_trainer_and_fsdp_gpu

  # Runs the pipeline tests (tests/pipelines) inside the Gaudi PyTorch
  # container, once per machine type, and uploads the generated reports.
  run_pipelines_torch_gpu:
    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: Pipelines
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
    runs-on:
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Folded scalar: the lines below are joined with single spaces into one
      # option string. Keep them at the same indentation so they keep folding.
      options: >-
        --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
        with:
          fetch-depth: 0
          persist-credentials: false

      - name: Install dependencies
        # The extras spec is quoted so the shell never treats `[...]` as a glob.
        run: |
          pip install -e ".[testing,torch]" "numpy<2.0.0" scipy scikit-learn librosa soundfile

      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Environment
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        run: |
          pip freeze

      - name: Set `machine_type` for report and artifact names
        shell: bash
        env:
          # Passed through the environment rather than spliced into the script text.
          MATRIX_MACHINE_TYPE: ${{ matrix.machine_type }}
        # Maps the Gaudi machine type onto the single-gpu/multi-gpu names used
        # in report and artifact names; unknown values pass through unchanged.
        run: |
          case "$MATRIX_MACHINE_TYPE" in
            1gaudi) machine_type=single-gpu ;;
            2gaudi) machine_type=multi-gpu ;;
            *) machine_type="$MATRIX_MACHINE_TYPE" ;;
          esac
          echo "machine_type=$machine_type" >> "$GITHUB_ENV"

      - name: Run all pipeline tests on Intel Gaudi
        run: |
          python3 -m pytest -v --make-reports="${machine_type}_run_pipelines_torch_gpu_test_reports" tests/pipelines -m "not not_device_test"

      # Best effort: print the short failure summary without affecting the job result.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat "reports/${machine_type}_run_pipelines_torch_gpu_test_reports/failures_short.txt"

      # Upload the reports even when the test step failed.
      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports

  # Runs the PyTorch example tests (examples/pytorch) inside the Gaudi PyTorch
  # container on the single-device machine type and uploads the reports.
  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
    name: Examples directory
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi]
    runs-on:
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Folded scalar: the lines below are joined with single spaces into one
      # option string. Keep them at the same indentation so they keep folding.
      options: >-
        --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
        with:
          fetch-depth: 0
          persist-credentials: false

      - name: Install dependencies
        # The extras spec is quoted so the shell never treats `[...]` as a glob.
        run: |
          pip install -e ".[testing,torch]" "numpy<2.0.0" scipy scikit-learn librosa soundfile

      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Environment
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        run: |
          pip freeze

      - name: Set `machine_type` for report and artifact names
        shell: bash
        env:
          # Passed through the environment rather than spliced into the script text.
          MATRIX_MACHINE_TYPE: ${{ matrix.machine_type }}
        # Maps the Gaudi machine type onto the single-gpu/multi-gpu names used
        # in report and artifact names; unknown values pass through unchanged.
        run: |
          case "$MATRIX_MACHINE_TYPE" in
            1gaudi) machine_type=single-gpu ;;
            2gaudi) machine_type=multi-gpu ;;
            *) machine_type="$MATRIX_MACHINE_TYPE" ;;
          esac
          echo "machine_type=$machine_type" >> "$GITHUB_ENV"

      - name: Run examples tests on Intel Gaudi
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports="${machine_type}_run_examples_gpu_test_reports" examples/pytorch -m "not not_device_test"

      # Best effort: print the short failure summary without affecting the job result.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat "reports/${machine_type}_run_examples_gpu_test_reports/failures_short.txt"

      # Upload the reports even when the test step failed.
      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports

  # Runs the DeepSpeed tests (tests/deepspeed) inside the Gaudi PyTorch
  # container, once per machine type, and uploads the generated reports.
  run_torch_cuda_extensions_gpu:
    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
    name: Intel Gaudi deepspeed tests
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
    runs-on:
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Folded scalar: the lines below are joined with single spaces into one
      # option string. Keep them at the same indentation so they keep folding.
      options: >-
        --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
        with:
          fetch-depth: 0
          persist-credentials: false

      - name: Install dependencies
        # The extras spec is quoted so the shell never treats `[...]` as a glob.
        # NOTE(review): the DeepSpeed fork is pinned to 1.20.0 while the
        # container image path says 1.21.1 - confirm the versions are meant to differ.
        run: |
          pip install -e ".[testing,torch]" "numpy<2.0.0" scipy scikit-learn librosa soundfile
          pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0

      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Environment
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        run: |
          pip freeze

      - name: Set `machine_type` for report and artifact names
        shell: bash
        env:
          # Passed through the environment rather than spliced into the script text.
          MATRIX_MACHINE_TYPE: ${{ matrix.machine_type }}
        # Maps the Gaudi machine type onto the single-gpu/multi-gpu names used
        # in report and artifact names; unknown values pass through unchanged.
        run: |
          case "$MATRIX_MACHINE_TYPE" in
            1gaudi) machine_type=single-gpu ;;
            2gaudi) machine_type=multi-gpu ;;
            *) machine_type="$MATRIX_MACHINE_TYPE" ;;
          esac
          echo "machine_type=$machine_type" >> "$GITHUB_ENV"

      - name: Run all deepspeed tests on intel Gaudi
        run: |
          python3 -m pytest -v --make-reports="${machine_type}_run_torch_cuda_extensions_gpu_test_reports" tests/deepspeed -m "not not_device_test"

      # Best effort: print the short failure summary without affecting the job result.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat "reports/${machine_type}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt"

      # Upload the reports even when the test step failed.
      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

  # Waits for every job in this workflow and, regardless of their outcome
  # (`always()`), calls the Slack report workflow with the setup result, the
  # matrix outputs and the caller-provided reporting inputs.
  send_results:
    name: Slack Report
    if: ${{ always() }}
    needs:
      - setup
      - run_models_gpu
      - run_examples_gpu
      - run_torch_cuda_extensions_gpu
      - run_pipelines_torch_gpu
      - run_trainer_and_fsdp_gpu
    uses: ./.github/workflows/slack-report.yml
    secrets: inherit
    with:
      # Values forwarded from this workflow's own inputs.
      job: ${{ inputs.job }}
      ci_event: ${{ inputs.ci_event }}
      report_repo_id: ${{ inputs.report_repo_id }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      # Values taken from the `setup` job.
      setup_status: ${{ needs.setup.result }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}