# Reusable workflow (`workflow_call`) that post-processes the `new_failures.json`
# report of a CI job:
#   1. `setup_check_new_failures` decides whether there is anything to process
#      and how many runners to use.
#   2. `check_new_failures` runs `utils/check_bad_commit.py` once per runner.
#   3. `process_new_failures_with_commit_info` merges the per-runner reports,
#      builds the report text, uploads it and posts it to Slack.
name: Process failed tests

on:
  workflow_call:
    inputs:
      docker:
        required: true
        type: string
      job:
        required: true
        type: string
      slack_report_channel:
        required: true
        type: string
      ci_event:
        required: true
        type: string
      report_repo_id:
        required: true
        type: string
      commit_sha:
        required: false
        type: string
      pr_number:
        required: false
        type: string
      max_num_runners:
        required: false
        type: number
        default: 4
    outputs:
      is_check_failures_ok:
        description: "Whether the failure checking infrastructure succeeded"
        # Only an explicit `failure` counts as not OK; `skipped` (nothing to process) is fine.
        value: ${{ jobs.check_new_failures.result != 'failure' && jobs.process_new_failures_with_commit_info.result != 'failure' }}

# Environment values are always consumed as strings, so they are quoted to avoid
# any YAML implicit typing (`yes`/`true` as booleans, `8` as an integer).
env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: "yes"
  OMP_NUM_THREADS: "8"
  MKL_NUM_THREADS: "8"
  RUN_SLOW: "yes"
  # For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: "true"
  CUDA_VISIBLE_DEVICES: "0,1"

permissions:
  contents: read

jobs:
  # Reads `ci_results_<job>/new_failures.json` (if the artifact exists) and emits:
  #   - `process`:   'true' when the report file was found, 'false' otherwise
  #   - `n_runners`: number of runners to use (between 1 and `max_num_runners`)
  #   - `matrix`:    JSON list `[0, ..., n_runners - 1]` used as `run_idx` values
  setup_check_new_failures:
    name: "Setup matrix for finding commits"
    runs-on: ubuntu-22.04
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      n_runners: ${{ steps.set-matrix.outputs.n_runners }}
      process: ${{ steps.set-matrix.outputs.process }}
    steps:
      # A missing artifact must not fail the job: the next step handles the
      # "file not found" case by setting `process=false`.
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
        continue-on-error: true
        with:
          name: ci_results_${{ inputs.job }}
          path: ci_results_${{ inputs.job }}

      - name: Set matrix
        id: set-matrix
        env:
          job: ${{ inputs.job }}
          max_num_runners: ${{ inputs.max_num_runners }}
        run: |
          python3 - << 'EOF'
          import json, os, math, sys

          print("Script started")

          job = os.environ["job"]
          filepath = f"ci_results_{job}/new_failures.json"

          print(f"Looking for file: {filepath}")
          print(f"File exists: {os.path.isfile(filepath)}")

          if not os.path.isfile(filepath):
              print("File not found, setting process=false")
              with open(os.environ["GITHUB_OUTPUT"], "a") as f:
                  f.write("process=false\n")
              sys.exit(0)

          with open(filepath) as f:
              reports = json.load(f)

          print(f"Loaded reports with {len(reports)} models")

          # Each model entry either nests its results under "failures" or holds them
          # directly; only the "single-gpu" entries are counted.
          n_tests = sum(
              len(model_data.get("failures", model_data).get("single-gpu", []))
              for model_data in reports.values()
          )

          print(f"n_tests: {n_tests}")

          max_num_runners = int(os.environ["max_num_runners"])

          # One runner per 10 tests, clamped to [1, max_num_runners].
          TESTS_PER_RUNNER = 10
          n_runners = max(1, min(max_num_runners, math.ceil(n_tests / TESTS_PER_RUNNER)))

          print(f"n_runners: {n_runners}")

          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
              f.write(f"matrix={json.dumps(list(range(n_runners)))}\n")
              f.write(f"n_runners={n_runners}\n")
              f.write("process=true\n")

          print("Done")
          EOF

  # Runs `utils/check_bad_commit.py` on GPU runners, one matrix entry per `run_idx`.
  # Each entry uploads its own `new_failures_with_bad_commit_<job>_<run_idx>` artifact.
  check_new_failures:
    name: "Find commits for new failing tests"
    needs: setup_check_new_failures
    if: needs.setup_check_new_failures.outputs.process == 'true'
    strategy:
      matrix:
        run_idx: ${{ fromJson(needs.setup_check_new_failures.outputs.matrix) }}
    runs-on:
      group: aws-g5-4xlarge-cache
    outputs:
      # Forwarded so that the downstream job can gate on it through `needs.check_new_failures`.
      process: ${{ needs.setup_check_new_failures.outputs.process }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
        with:
          name: ci_results_${{ inputs.job }}
          path: /transformers/ci_results_${{ inputs.job }}

      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
        env:
          ACTIONS_ARTIFACT_MAX_ARTIFACT_COUNT: "2000"
        with:
          pattern: setup_values*
          path: setup_values
          merge-multiple: true
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # `PREV_WORKFLOW_RUN_ID` is always defined (possibly empty) because a later
      # step reads it unconditionally through `os.environ[...]`.
      - name: Prepare some setup values
        run: |
          if [ -f setup_values/prev_workflow_run_id.txt ]; then
            echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> "$GITHUB_ENV"
          else
            echo "PREV_WORKFLOW_RUN_ID=" >> "$GITHUB_ENV"
          fi

      - name: Update clone
        working-directory: /transformers
        env:
          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
          git fetch origin "$commit_sha" && git checkout "$commit_sha"

      - name: Get `START_SHA`
        working-directory: /transformers/utils
        env:
          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
          echo "START_SHA=$commit_sha" >> "$GITHUB_ENV"

      # This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
      - name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
        id: pr_info
        if: ${{ inputs.pr_number != '' }}
        uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410  # v6.4.1
        env:
          PR_NUMBER: ${{ inputs.pr_number }}
          COMMIT_SHA: ${{ inputs.commit_sha }}
        with:
          script: |
            const pull_number = parseInt(process.env.PR_NUMBER, 10);
            const commit_sha = process.env.COMMIT_SHA;

            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number,
            });

            const { data: merge_commit } = await github.rest.repos.getCommit({
              owner: pr.base.repo.owner.login,
              repo: pr.base.repo.name,
              ref: commit_sha,
            });

            // The first parent of the commit is exposed as the base commit.
            core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);

      # Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow.
      # (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
      - name: Get `END_SHA` from previous CI runs of the same workflow
        working-directory: /transformers/utils
        if: ${{ inputs.pr_number == '' }}
        env:
          ACCESS_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
        run: |
          echo "END_SHA=$(TOKEN="$ACCESS_TOKEN" python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> "$GITHUB_ENV"

      # However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the
      # parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
      # see if a reported failing test is actually ONLY failing on the `merge_commit`.
      - name: Set `END_SHA`
        if: ${{ inputs.pr_number != '' }}
        env:
          merge_commit_base_sha: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
        run: |
          echo "END_SHA=$merge_commit_base_sha" >> "$GITHUB_ENV"

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Install pytest-flakefinder
        run: python3 -m pip install pytest-flakefinder

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      # `n_runners`, `run_idx`, `pr_number` and `GITHUB_TOKEN` are not passed on the
      # command line; presumably the script reads them from the environment
      # (e.g. to select this runner's share of the tests) - TODO confirm.
      - name: Check failed tests
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
          n_runners: ${{ needs.setup_check_new_failures.outputs.n_runners }}
          run_idx: ${{ matrix.run_idx }}
          pr_number: ${{ inputs.pr_number }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"

      - name: Show results
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
          run_idx: ${{ matrix.run_idx }}
        run: |
          ls -l "new_failures_with_bad_commit_${job}_${run_idx}.json"
          cat "new_failures_with_bad_commit_${job}_${run_idx}.json"

      - name: Upload artifacts
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
        with:
          name: new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}
          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

  # Merges the per-runner reports into a single `new_failures_with_bad_commit.json`,
  # builds `REPORT_TEXT`, uploads the result and posts it to Slack.
  process_new_failures_with_commit_info:
    name: "process bad commit reports"
    needs: check_new_failures
    if: needs.check_new_failures.outputs.process == 'true'
    runs-on:
      group: aws-g5-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
        with:
          name: ci_results_${{ inputs.job }}
          path: /transformers/ci_results_${{ inputs.job }}

      # Collect every per-runner report into one directory.
      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
        env:
          ACTIONS_ARTIFACT_MAX_ARTIFACT_COUNT: "2000"
        with:
          pattern: new_failures_with_bad_commit_${{ inputs.job }}*
          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}
          merge-multiple: true
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Check files
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
        run: |
          ls -la /transformers
          ls -la "/transformers/new_failures_with_bad_commit_${job}"

      # `check_new_failures` may run on several runners (one report per `run_idx`).
      # The reports are merged by concatenating, for each model and GPU type, the
      # failure lists of all files (in sorted file-name order). No de-duplication
      # is performed.
      - name: Merge files
        shell: bash
        working-directory: /transformers
        env:
          job: ${{ inputs.job }}
        run: |
          python3 - << 'EOF'
          import json
          import glob
          import os
          import sys

          job = os.environ["job"]
          pattern = f"/transformers/new_failures_with_bad_commit_{job}/new_failures_with_bad_commit_{job}_*.json"
          files = sorted(glob.glob(pattern))

          if not files:
              print(f"No files found matching: {pattern}")
              sys.exit(1)

          print(f"Found {len(files)} file(s) to merge: {files}")

          merged = {}
          for filepath in files:
              with open(filepath) as f:
                  data = json.load(f)

              for model, model_results in data.items():
                  # A model listed with no GPU types still gets an (empty) entry.
                  merged_model = merged.setdefault(model, {})
                  for gpu_type, failures in model_results.items():
                      merged_model.setdefault(gpu_type, []).extend(failures)

              print(f"filepath: {filepath}")
              print(len(data))

          output_path = "/transformers/new_failures_with_bad_commit.json"
          with open(output_path, "w") as f:
              json.dump(merged, f, indent=4)

          print(f"Merged {len(files)} file(s) into {output_path}")
          print(f"n_items: {len(merged)}")
          print(merged)
          EOF

      - name: Update clone
        working-directory: /transformers
        env:
          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
          git fetch origin "$commit_sha" && git checkout "$commit_sha"

      # Stores the (possibly multi-line) report in `REPORT_TEXT` using the
      # `NAME<<DELIMITER ... DELIMITER` syntax of `GITHUB_ENV`.
      # NOTE(review): in the reviewed copy of this file, the text between
      # `REPORT_TEXT<` and `> "$GITHUB_ENV"` was lost. The delimiter lines and the
      # report command below are a reconstruction; `utils/process_bad_commit_report.py`
      # is assumed to be the report generator - confirm against repository history.
      - name: Process report
        shell: bash
        working-directory: /transformers
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
          JOB_NAME: ${{ inputs.job }}
          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
        run: |
          {
            echo 'REPORT_TEXT<<EOF'
            python3 utils/process_bad_commit_report.py
            echo EOF
          } >> "$GITHUB_ENV"

      - name: Show results
        working-directory: /transformers
        run: |
          ls -l new_failures_with_bad_commit.json
          cat new_failures_with_bad_commit.json

      - name: Upload artifacts
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
        with:
          name: new_failures_with_bad_commit_${{ inputs.job }}
          path: |
            /transformers/new_failures_with_bad_commit.json
            /transformers/new_failures_with_bad_commit_url.txt

      # Builds `title` as "New failed tests of <ci_event>: <test name>", where the
      # test name is looked up in `job_to_test_map` from `utils/notification_service.py`.
      - name: Prepare Slack report title
        working-directory: /transformers
        env:
          ci_event: ${{ inputs.ci_event }}
          job: ${{ inputs.job }}
        run: |
          pip install slack_sdk
          echo "title=$(python3 -c 'import sys; import os; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = os.environ["ci_event"]; job = os.environ["job"]; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> "$GITHUB_ENV"

      # Nothing is posted when the report text ends with `{}` (presumably an empty report).
      - name: Send processed report
        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
          channel-id: '#${{ inputs.slack_report_channel }}'
          # For posting a rich message using Block Kit
          payload: |
            {
              "blocks": [
                {
                  "type": "header",
                  "text": {
                    "type": "plain_text",
                    "text": "${{ env.title }}"
                  }
                },
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "${{ env.REPORT_TEXT }}"
                  }
                }
              ]
            }
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}