# transformers/.github/workflows/check_failed_tests.yml

name: Process failed tests

# Reusable workflow: it is only started through `workflow_call` from another workflow.
on:
  workflow_call:
    inputs:
      # Docker image used as the container of the two GPU jobs.
      docker:
        required: true
        type: string
      # Name of the CI job whose `ci_results_<job>` artifact (containing `new_failures.json`) is processed.
      job:
        required: true
        type: string
      # Slack channel (without the leading `#`) that receives the processed report.
      slack_report_channel:
        required: true
        type: string
      # Name of the CI event; only used to build the title of the Slack report.
      ci_event:
        required: true
        type: string
      # Exported as `REPORT_REPO_ID` to `utils/process_bad_commit_report.py`.
      report_repo_id:
        required: true
        type: string
      # Commit to check out and to use as `START_SHA`; falls back to `github.sha` when empty.
      commit_sha:
        required: false
        type: string
      # When non-empty, `END_SHA` is the first parent of `commit_sha` instead of the commit of a previous CI run.
      pr_number:
        required: false
        type: string
      # Upper bound on the number of matrix runners used by `check_new_failures`.
      max_num_runners:
        required: false
        type: number
        default: 4
    outputs:
      is_check_failures_ok:
        description: "Whether the failure checking infrastructure succeeded"
        # Only an actual `failure` result counts: a job skipped because there is nothing to process is still OK.
        # The setup job is included so that a crash while building the matrix (which skips the two later jobs)
        # is not reported as a success.
        value: ${{ jobs.setup_check_new_failures.result != 'failure' && jobs.check_new_failures.result != 'failure' && jobs.process_new_failures_with_commit_info.result != 'failure' }}

# Environment variables shared by every job of this workflow.
# Scalars that look like booleans or numbers are quoted so that every YAML parser yields the same strings
# (unquoted `yes` / `true` / `8` are implicit-typing traps).
env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: "yes"
  OMP_NUM_THREADS: "8"
  MKL_NUM_THREADS: "8"
  RUN_SLOW: "yes"
  # For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: "true"
  CUDA_VISIBLE_DEVICES: "0,1"


# Workflow-level token permissions: only read access to the repository contents is requested.
permissions:
  contents: read

jobs:
  # Decides whether there is anything to process and, if so, how many matrix runners `check_new_failures` gets.
  setup_check_new_failures:
    name: "Setup matrix for finding commits"
    runs-on: ubuntu-22.04
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      n_runners: ${{ steps.set-matrix.outputs.n_runners }}
      process: ${{ steps.set-matrix.outputs.process }}
    steps:
      # The artifact may not exist; the next step handles a missing `new_failures.json` itself.
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        continue-on-error: true
        with:
          name: ci_results_${{ inputs.job }}
          path: ci_results_${{ inputs.job }}

      - name: Set matrix
        id: set-matrix
        env:
          job: ${{ inputs.job }}
          max_num_runners: ${{ inputs.max_num_runners }}
        run: |
          python3 - << 'EOF'
          import json, os, math

          # Number of failing tests a single runner is expected to handle.
          TESTS_PER_RUNNER = 10


          def emit_outputs(lines):
              # Append the given `key=value` lines to the step output file.
              with open(os.environ["GITHUB_OUTPUT"], "a") as out:
                  out.writelines(lines)


          print("Script started")

          job_name = os.environ["job"]
          report_path = f"ci_results_{job_name}/new_failures.json"
          has_report = os.path.isfile(report_path)

          print(f"Looking for file: {report_path}")
          print(f"File exists: {has_report}")

          # Nothing to process: only `process=false` is written and the step ends successfully.
          if not has_report:
              print("File not found, setting process=false")
              emit_outputs(["process=false\n"])
              raise SystemExit(0)

          with open(report_path) as fh:
              reports = json.load(fh)

          print(f"Loaded reports with {len(reports)} models")

          # An entry either wraps its per-device lists in a "failures" key or holds them directly.
          n_tests = 0
          for model_data in reports.values():
              per_device = model_data.get("failures", model_data)
              n_tests += len(per_device.get("single-gpu", []))

          print(f"n_tests: {n_tests}")

          runner_cap = int(os.environ["max_num_runners"])

          # At least one runner, at most `max_num_runners`.
          wanted = math.ceil(n_tests / TESTS_PER_RUNNER)
          n_runners = max(1, min(runner_cap, wanted))

          print(f"n_runners: {n_runners}")

          emit_outputs(
              [
                  f"matrix={json.dumps(list(range(n_runners)))}\n",
                  f"n_runners={n_runners}\n",
                  "process=true\n",
              ]
          )

          print("Done")
          EOF


  # One matrix entry per runner index computed by `setup_check_new_failures`. Each entry runs
  # `utils/check_bad_commit.py` with `--start_commit` / `--end_commit` on `new_failures.json` and uploads its own
  # `new_failures_with_bad_commit_<job>_<run_idx>.json` report.
  check_new_failures:
    name: "Find commits for new failing tests"
    needs: setup_check_new_failures
    if: needs.setup_check_new_failures.outputs.process == 'true'
    strategy:
      matrix:
        run_idx: ${{ fromJson(needs.setup_check_new_failures.outputs.matrix) }}
    runs-on:
      group: aws-g5-4xlarge-cache
    outputs:
      # Forwarded so that the next job can check it through `needs.check_new_failures`.
      process: ${{ needs.setup_check_new_failures.outputs.process }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
          name: ci_results_${{ inputs.job }}
          path: /transformers/ci_results_${{ inputs.job }}

      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        env:
          ACTIONS_ARTIFACT_MAX_ARTIFACT_COUNT: 2000
        with:
          pattern: setup_values*
          path: setup_values
          merge-multiple: true
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # `PREV_WORKFLOW_RUN_ID` is always defined (possibly empty) because a later step reads it unconditionally.
      - name: Prepare some setup values
        run: |
          if [ -f setup_values/prev_workflow_run_id.txt ]; then
            echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> "$GITHUB_ENV"
          else
            echo "PREV_WORKFLOW_RUN_ID=" >> "$GITHUB_ENV"
          fi

      - name: Update clone
        working-directory: /transformers
        env:
          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
          git fetch origin "$commit_sha" && git checkout "$commit_sha"

      - name: Get `START_SHA`
        working-directory: /transformers/utils
        env:
          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
          echo "START_SHA=$commit_sha" >> "$GITHUB_ENV"

      # This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
      - name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
        id: pr_info
        if: ${{ inputs.pr_number != '' }}
        uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1
        env:
          PR_NUMBER: ${{ inputs.pr_number }}
          COMMIT_SHA: ${{ inputs.commit_sha }}
        with:
          script: |
            const pull_number = parseInt(process.env.PR_NUMBER, 10);
            const commit_sha = process.env.COMMIT_SHA;

            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number,
            });

            const { data: merge_commit } = await github.rest.repos.getCommit({
              owner: pr.base.repo.owner.login,
              repo: pr.base.repo.name,
              ref: commit_sha,
            });

            core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);

      # Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow.
      # (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
      - name: Get `END_SHA` from previous CI runs of the same workflow
        working-directory: /transformers/utils
        if: ${{ inputs.pr_number == '' }}
        env:
          ACCESS_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
        run: |
          echo "END_SHA=$(TOKEN="$ACCESS_TOKEN" python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> "$GITHUB_ENV"

      # However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the
      # parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
      # see if a reported failing test is actually ONLY failing on the `merge_commit`.
      - name: Set `END_SHA`
        if: ${{ inputs.pr_number != '' }}
        env:
          merge_commit_base_sha: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
        run: |
          echo "END_SHA=$merge_commit_base_sha" >> "$GITHUB_ENV"

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Install pytest-flakefinder
        run: python3 -m pip install pytest-flakefinder

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      # `n_runners`, `run_idx` and `pr_number` are only passed through the environment.
      # NOTE(review): presumably the script uses them to select this runner's share of the tests - confirm in
      # `utils/check_bad_commit.py`.
      - name: Check failed tests
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
          n_runners: ${{ needs.setup_check_new_failures.outputs.n_runners }}
          run_idx: ${{ matrix.run_idx }}
          pr_number: ${{ inputs.pr_number }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"

      - name: Show results
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
          run_idx: ${{ matrix.run_idx }}
        run: |
          ls -l "new_failures_with_bad_commit_${job}_${run_idx}.json"
          cat "new_failures_with_bad_commit_${job}_${run_idx}.json"

      # One artifact per matrix entry; the next job downloads all of them by name pattern.
      - name: Upload artifacts
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}
          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

  # Merges the per-runner reports of `check_new_failures`, runs `utils/process_bad_commit_report.py` on the result,
  # uploads the merged report and posts the processed text to Slack.
  process_new_failures_with_commit_info:
    name: "process bad commit reports"
    needs: check_new_failures
    if: needs.check_new_failures.outputs.process == 'true'
    runs-on:
      group: aws-g5-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
          name: ci_results_${{ inputs.job }}
          path: /transformers/ci_results_${{ inputs.job }}

      # Download the report of every matrix entry of `check_new_failures` into a single directory.
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        env:
          ACTIONS_ARTIFACT_MAX_ARTIFACT_COUNT: 2000
        with:
          pattern: new_failures_with_bad_commit_${{ inputs.job }}*
          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}
          merge-multiple: true
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Check files
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
        run: |
          ls -la /transformers
          ls -la "/transformers/new_failures_with_bad_commit_${job}"

      # `check_new_failures` runs on one or more matrix runners and each of them uploads its own report. The reports
      # are merged here by concatenating, for each model and GPU type, the failure lists of all the runners.
      - name: Merge files
        shell: bash
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
        run: |
          python3 - << 'EOF'
          import json
          import glob
          import os

          job = os.environ["job"]
          pattern = f"/transformers/new_failures_with_bad_commit_{job}/new_failures_with_bad_commit_{job}_*.json"
          files = sorted(glob.glob(pattern))

          # Without any per-runner report there is nothing to process: fail the step.
          if not files:
              print(f"No files found matching: {pattern}")
              exit(1)

          print(f"Found {len(files)} file(s) to merge: {files}")

          merged = {}
          for filepath in files:
              with open(filepath) as f:
                  data = json.load(f)

              for model, model_results in data.items():
                  if model not in merged:
                      merged[model] = {}
                  for gpu_type, failures in model_results.items():
                      if gpu_type not in merged[model]:
                          merged[model][gpu_type] = []
                      merged[model][gpu_type].extend(failures)

              print(f"filepath: {filepath}")
              print(len(data))

          output_path = "/transformers/new_failures_with_bad_commit.json"
          with open(output_path, "w") as f:
              json.dump(merged, f, indent=4)

          print(f"Merged {len(files)} file(s) into {output_path}")
          print(f"n_items: {len(merged)}")
          print(merged)
          EOF

      - name: Update clone
        working-directory: /transformers
        env:
          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
          git fetch origin "$commit_sha" && git checkout "$commit_sha"

      # The standard output of the script becomes the multi-line environment variable `REPORT_TEXT`.
      - name: Process report
        shell: bash
        working-directory: /transformers
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
          JOB_NAME: ${{ inputs.job }}
          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
        run: |
          # Use a unique delimiter: with a fixed one, a report line equal to the delimiter would end the value early.
          delimiter="REPORT_TEXT_EOF_$(python3 -c 'import uuid; print(uuid.uuid4().hex)')"
          {
            echo "REPORT_TEXT<<${delimiter}"
            python3 utils/process_bad_commit_report.py
            echo "${delimiter}"
          } >> "$GITHUB_ENV"

      - name: Show results
        working-directory: /transformers
        run: |
          ls -l new_failures_with_bad_commit.json
          cat new_failures_with_bad_commit.json

      - name: Upload artifacts
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: new_failures_with_bad_commit_${{ inputs.job }}
          path: |
            /transformers/new_failures_with_bad_commit.json
            /transformers/new_failures_with_bad_commit_url.txt

      # The title is exported as the environment variable `title` for the Slack step below.
      - name: Prepare Slack report title
        working-directory: /transformers
        env:
          ci_event: ${{ inputs.ci_event }}
          job: ${{ inputs.job }}
        run: |
          pip install slack_sdk
          echo "title=$(python3 -c 'import sys; import os; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = os.environ["ci_event"]; job = os.environ["job"]; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> "$GITHUB_ENV"

      # Nothing is sent when the processed report ends with `{}`.
      - name: Send processed report
        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
          channel-id: '#${{ inputs.slack_report_channel }}'
          # For posting a rich message using Block Kit
          # NOTE(review): `title` and `REPORT_TEXT` are interpolated verbatim into this JSON document; presumably
          # the scripts only emit JSON-safe text - confirm.
          payload: |
            {
              "blocks": [
                {
                  "type": "header",
                  "text": {
                    "type": "plain_text",
                    "text": "${{ env.title }}"
                  }
                },
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "${{ env.REPORT_TEXT }}"
                  }
                }
              ]
            }
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}