From d649129aab66427eee0988005aae9a33bcdcc009 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Wed, 22 Jan 2025 15:14:46 +0100 Subject: [PATCH 01/21] Run PR gpu utests/relvals on both CUDA and ROCm GPUs --- cleanup-cmssdt | 1 + pr_testing/run-pr-relvals.sh | 2 +- pr_testing/run-pr-unittests.sh | 53 +++++++++++++++++---------------- pr_testing/test_multiple_prs.sh | 29 +++++++++++++++--- report-pull-request-results.py | 12 ++++++-- run-ib-pr-matrix.sh | 4 +-- 6 files changed, 65 insertions(+), 36 deletions(-) diff --git a/cleanup-cmssdt b/cleanup-cmssdt index 4a3f197206a6..d0ffc37bd71f 100755 --- a/cleanup-cmssdt +++ b/cleanup-cmssdt @@ -51,6 +51,7 @@ DIRS="lizard flawfinder invalid-includes cmssw-afs-eos-comparison ubsan_logs ib- DIRS="${DIRS} check_headers valgrind HLT-Validation ib-static-analysis ib-baseline-tests ib-dqm-tests profiling igprof" DIRS="${DIRS} iwyu material-budget das_query build-any-ib check-unused-cmsdist-packages class_versions" DIRS="${DIRS} test-os-alma8 test-os-cs8 test-os-ubi8 test-os-lxplus8 test-os-rhel8 test-os-rocky8 test-os-el8 cms-containers-run-cmssw-test" +DIRS="${DIRS} baseLineComparisonsCUDA baseLineComparisonsROCM" for dir in ${DIRS}; do [ -d ${JENKINS_ARTIFACTS}/$dir ] || continue DIRS_PROCESSED="${DIRS_PROCESSED} ${dir}" diff --git a/pr_testing/run-pr-relvals.sh b/pr_testing/run-pr-relvals.sh index b63b04f34f22..83040c7dd0a3 100755 --- a/pr_testing/run-pr-relvals.sh +++ b/pr_testing/run-pr-relvals.sh @@ -17,7 +17,7 @@ echo "${MATRIX_ARGS}" | tr ';' '\n' | while IFS= read -r args; do if [ $(echo "${args}" | sed 's|.*-l ||;s| .*||' | tr ',' '\n' | grep '^all$' | wc -l) -gt 0 ] ; then OPTS="" case "${TEST_FLAVOR}" in - gpu ) OPTS="-w gpu" ;; + cuda | rocm ) OPTS="-w gpu" ;; high_stats ) ;; nano ) OPTS="-w nano" ;; * ) ;; diff --git a/pr_testing/run-pr-unittests.sh b/pr_testing/run-pr-unittests.sh index a7202e5ec173..a8a0bad4fc33 100755 --- a/pr_testing/run-pr-unittests.sh +++ b/pr_testing/run-pr-unittests.sh @@ -10,57 +10,58 @@ cd $WORKSPACE/${CMSSW_VERSION} CMSSW_PKG_COUNT=$(ls -d $LOCALRT/src/*/* | wc -l) REPORT_OPTS="--report-url ${PR_RESULT_URL} $NO_POST" -rm -f ${RESULTS_DIR}/unittestGPU.txt -mark_commit_status_all_prs 'unittests/gpu' 'pending' -u "${BUILD_URL}" -d "Running tests" || true +rm -f ${RESULTS_DIR}/unittest${GPU_FLAVOR}.txt +mark_commit_status_all_prs "unittests/${GPU_FLAVOR}" 'pending' -u "${BUILD_URL}" -d "Running tests" || true echo '--------------------------------------' -mkdir -p $WORKSPACE/gpuUnitTests +mkdir -p $WORKSPACE/${GPU_FLAVOR}UnitTests let UT_TIMEOUT=7200+${CMSSW_PKG_COUNT}*20 -UTESTS_CMD="USER_UNIT_TESTS=cuda timeout ${UT_TIMEOUT} scram b -v -k -j ${NCPU} unittests " +gpu_t_lc=$(echo ${GPU_T} | tr '[A-Z]' '[a-z]') +UTESTS_CMD="USER_UNIT_TESTS=${gpu_t_lc} timeout ${UT_TIMEOUT} scram b -v -k -j ${NCPU} unittests " echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" scram build echo_LD_LIBRARY_PATH || true scram build -r echo_CXX || true cms_major=$(echo ${CMSSW_IB} | cut -d_ -f2) cms_minor=$(echo ${CMSSW_IB} | cut -d_ -f3) cms_ver="$(echo 00${cms_major} | sed -E 's|^.*(..)$|\1|')$(echo 00${cms_minor} | sed -E 's|^.*(..)$|\1|')" -echo $UTESTS_CMD > $WORKSPACE/gpuUnitTests/log.txt -(eval $UTESTS_CMD && echo 'ALL_OK') > $WORKSPACE/gpuUnitTests/log.txt 2>&1 || true +echo $UTESTS_CMD > $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt +(eval $UTESTS_CMD && echo 'ALL_OK') > $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt 2>&1 || true echo 'END OF UNIT TESTS' echo '--------------------------------------' -TEST_ERRORS=$(grep -ai 'had errors\|recipe for target' $WORKSPACE/gpuUnitTests/log.txt | sed "s|'||g;s|.*recipe for target *||;s|.*unittests_|---> test |;s| failed$| timeout|" || true) -TEST_ERRORS=`grep -ai "had errors" $WORKSPACE/gpuUnitTests/log.txt` || true -GENERAL_ERRORS=`grep -a "ALL_OK" $WORKSPACE/gpuUnitTests/log.txt` || true +TEST_ERRORS=$(grep -ai 'had errors\|recipe for target' $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt | sed "s|'||g;s|.*recipe for target *||;s|.*unittests_|---> test |;s| failed$| timeout|" || true) +TEST_ERRORS=`grep -ai "had errors" $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt` || true +GENERAL_ERRORS=`grep -a "ALL_OK" $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt` || true if [ "X$TEST_ERRORS" != "X" -o "X$GENERAL_ERRORS" = "X" ]; then - echo "Errors in the gpu unit tests" - echo 'GPU_UNIT_TEST_RESULTS;ERROR,GPU Unit Tests,See Log,gpuUnitTests' >> ${RESULTS_DIR}/unittestGPU.txt + echo "Errors in the ${GPU_FLAVOR} unit tests" + echo "${GPU_FLAVOR}_UNIT_TEST_RESULTS;ERROR,GPU Unit Tests,See Log,${GPU_FLAVOR}UnitTests" >> ${RESULTS_DIR}/unittest${GPU_FLAVOR}.txt ALL_OK=false UNIT_TESTS_OK=false - $CMS_BOT_DIR/report-pull-request-results PARSE_GPU_UNIT_TESTS_FAIL -f $WORKSPACE/gpuUnitTests/log.txt --report-file ${RESULTS_DIR}/14-unittestGPU-report.res ${REPORT_OPTS} - echo "GpuUnitTests" > ${RESULTS_DIR}/14-failed.res + $CMS_BOT_DIR/report-pull-request-results PARSE_${GPU_FLAVOR}_UNIT_TESTS_FAIL -f $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt --report-file ${RESULTS_DIR}/14-unittest${GPU_FLAVOR}-report.res ${REPORT_OPTS} + echo "${GPU_FLAVOR}UnitTests" > ${RESULTS_DIR}/14-failed.res else - echo 'GPU_UNIT_TEST_RESULTS;OK,GPU Unit Tests,See Log,gpuUnitTests' >> ${RESULTS_DIR}/unittestGPU.txt + echo "${GPU_FLAVOR}_UNIT_TEST_RESULTS;OK,GPU Unit Tests,See Log,${GPU_FLAVOR}UnitTests" >> ${RESULTS_DIR}/unittest${GPU_FLAVOR}.txt fi -echo "" > $WORKSPACE/gpuUnitTests/success.html -cp $WORKSPACE/gpuUnitTests/success.html $WORKSPACE/gpuUnitTests/failed.html +echo "" > $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html +cp $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html UT_ERR=false utlog="testing.log" for t in $(find $WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/src -name ${utlog} -type f | sed "s|$WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/||;s|/${utlog}$||") ; do - mkdir -p $WORKSPACE/gpuUnitTests/${t} - mv $WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/${t}/${utlog} $WORKSPACE/gpuUnitTests/${t}/ - if [ $(grep -a '^\-\-\-> test *[^ ]* *succeeded$' $WORKSPACE/gpuUnitTests/${t}/${utlog} | wc -l) -gt 0 ] ; then - echo "${t}
" >> $WORKSPACE/gpuUnitTests/success.html + mkdir -p $WORKSPACE/${GPU_FLAVOR}UnitTests/${t} + mv $WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/${t}/${utlog} $WORKSPACE/${GPU_FLAVOR}UnitTests/${t}/ + if [ $(grep -a '^\-\-\-> test *[^ ]* *succeeded$' $WORKSPACE/${GPU_FLAVOR}UnitTests/${t}/${utlog} | wc -l) -gt 0 ] ; then + echo "${t}
" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html else - echo "${t}
" >> $WORKSPACE/gpuUnitTests/failed.html + echo "${t}
" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html UT_ERR=true fi done -if ! $UT_ERR ; then echo "No unit test failed" >> $WORKSPACE/gpuUnitTests/failed.html ; fi -echo "" >> $WORKSPACE/gpuUnitTests/success.html -echo "" >> $WORKSPACE/gpuUnitTests/failed.html +if ! $UT_ERR ; then echo "No unit test failed" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html ; fi +echo "" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html +echo "" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html prepare_upload_results if $UNIT_TESTS_OK ; then - mark_commit_status_all_prs 'unittests/gpu' 'success' -u "${BUILD_URL}" -d "Passed" + mark_commit_status_all_prs "unittests/${GPU_FLAVOR}" 'success' -u "${BUILD_URL}" -d "Passed" else - mark_commit_status_all_prs 'unittests/gpu' 'error' -u "${BUILD_URL}" -d "Some unit tests were failed." + mark_commit_status_all_prs "unittests/${GPU_FLAVOR}" 'error' -u "${BUILD_URL}" -d "Some unit tests were failed." fi diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index 318363049cd4..e90f0077558c 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -1,4 +1,4 @@ -#!/bin/bash -ex + #!/bin/bash -ex # This script will be called by Jenkins job 'ib-run-pr-tests' # and # 1) will merge multiple PRs for multiple repos @@ -162,6 +162,7 @@ if [ $(echo "${CONFIG_LINE}" | grep "PROD_ARCH=1" | wc -l) -gt 0 ] ; then fi fi fi +ALL_GPU_TYPES=("cuda" "rocm") # ---------- # -- MAIN -- @@ -380,6 +381,16 @@ if $DO_COMPARISON ; then grep -v '^\(WORKFLOWS\|MATRIX_ARGS\)=' run-baseline-${BUILD_ID}-01.${ex_type_lc} > run-baseline-${BUILD_ID}-02.${ex_type_lc} echo "WORKFLOWS=-l ${WF_LIST}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc} echo "MATRIX_ARGS=${WF_ARGS}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc} + if [ X"${ex_type_lc}" = X"gpu" ]; then + for GPU_T in ${ALL_GPU_TYPES[@]}; do + cp run-baseline-${BUILD_ID}-01.${ex_type_lc} run-baseline-${BUILD_ID}-01.${GPU_T} + sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-01.${GPU_T} + + cp run-baseline-${BUILD_ID}-02.${ex_type_lc} run-baseline-${BUILD_ID}-02.${GPU_T} + sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-02.${GPU_T} + done + rm run-baseline-${BUILD_ID}-01.${ex_type_lc} run-baseline-${BUILD_ID}-02.${ex_type_lc} + fi done popd send_jenkins_artifacts $WORKSPACE/ib-baseline-tests/ ib-baseline-tests/ @@ -1326,7 +1337,9 @@ if [ "X$BUILD_OK" = Xtrue -a "$RUN_TESTS" = "true" ]; then fi if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^GPU$' | wc -l) -gt 0 -a X"${DISABLE_GPU_TESTS}" != X"true" ] ; then DO_GPU_TESTS=true - mark_commit_status_all_prs 'unittests/gpu' 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start" + for GPU_T in ${ALL_GPU_TYPES[@]} ; do + mark_commit_status_all_prs 'unittests/${GPU_T}' 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start" + done fi if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^HLT_P2_TIMING$' | wc -l) -gt 0 ] ; then if [ $(echo ${ARCHITECTURE} | grep "_amd64_" | wc -l) -gt 0 ] ; then @@ -1468,6 +1481,12 @@ if [ "X$DO_SHORT_MATRIX" = Xtrue ]; then ex_type_lc=$(echo ${ex_type} | tr '[A-Z]' '[a-z]') grep -v '^MATRIX_ARGS=' $WORKSPACE/run-relvals.prop > $WORKSPACE/run-relvals-${ex_type_lc}.prop echo "MATRIX_ARGS=$(get_pr_relval_args $DO_COMPARISON _${ex_type})" >> $WORKSPACE/run-relvals-${ex_type_lc}.prop + if [ "${ex_type_lc}" = "gpu" ]; then + for GPU_T in ${ALL_GPU_TYPES[@]}; do + cp $WORKSPACE/run-relvals-${ex_type_lc}.prop $WORKSPACE/run-relvals-${GPU_T}.prop + done + rm $WORKSPACE/run-relvals-${ex_type_lc}.prop + fi done if [ $(runTheMatrix.py --help | grep '^ *--maxSteps' | wc -l) -eq 0 ] ; then mark_commit_status_all_prs "relvals/input" 'success' -u "${BUILD_URL}" -d "Not ran, runTheMatrix does not support --maxSteps flag" -e @@ -1499,7 +1518,9 @@ if [ "X$DO_ADDON_TESTS" = Xtrue ]; then fi if [ "X$DO_GPU_TESTS" = Xtrue ]; then - cp $WORKSPACE/test-env.txt $WORKSPACE/run-unittests.prop + for GPU_T in ${ALL_GPU_TYPES[@]}; do + cp $WORKSPACE/test-env.txt $WORKSPACE/run-unittests-${GPU_T}.prop + done fi if ${BUILD_EXTERNAL} ; then @@ -1510,7 +1531,7 @@ fi if [ "${DO_PROFILING}" = "true" ] ; then PROFILING_WORKFLOWS=$($CMS_BOT_DIR/cmssw-pr-test-config _PROFILING | tr ',' ' ') - for wf in ${PROFILING_WORKFLOWS};do + for wf in ${PROFILING_WORKFLOWS}; do cp $WORKSPACE/test-env.txt $WORKSPACE/run-profiling-$wf.prop echo "PROFILING_WORKFLOWS=${wf}" >> $WORKSPACE/run-profiling-$wf.prop done diff --git a/report-pull-request-results.py b/report-pull-request-results.py index 487030ceaec9..e229222b0d75 100755 --- a/report-pull-request-results.py +++ b/report-pull-request-results.py @@ -411,7 +411,7 @@ def read_unit_tests_file(unit_tests_file): send_message_pr(message) -def read_gpu_tests_file(unit_tests_file): +def read_gpu_tests_file(unit_tests_file, gpu_flavor="GPU"): errors_found = "" err_cnt = 0 for line in openlog(unit_tests_file): @@ -423,8 +423,8 @@ def read_gpu_tests_file(unit_tests_file): continue errors_found += line message = ( - "\n## GPU Unit Tests\n\nI found %s errors in the following unit tests:\n\n
%s
" - % (err_cnt, errors_found) + "\n## %s Unit Tests\n\nI found %s errors in the following unit tests:\n\n
%s
" + % (gpu_flavor, err_cnt, errors_found) ) send_message_pr(message) @@ -606,6 +606,8 @@ def complain_missing_param(param_name): GITLOG_FILE_BASE_URL = "%s/git-recent-commits.json" % options.report_url GIT_CMS_MERGE_TOPIC_BASE_URL = "%s/git-merge-result" % options.report_url +ACTION = ACTION.upper() + if ACTION == "GET_BASE_MESSAGE": get_base_message() elif ACTION == "PARSE_UNIT_TESTS_FAIL": @@ -630,6 +632,10 @@ def complain_missing_param(param_name): read_material_budget_log_file(options.unit_tests_file) elif ACTION == "MERGE_COMMITS": add_to_report(get_recent_merges_message()) +elif ACTION == "PARSE_CUDA_UNIT_TESTS_FAIL": + read_gpu_tests_file(options.unit_tests_file, "CUDA") +elif ACTION == "PARSE_ROCM_UNIT_TESTS_FAIL": + read_gpu_tests_file(options.unit_tests_file, "ROCm") elif ACTION == "PARSE_GPU_UNIT_TESTS_FAIL": read_gpu_tests_file(options.unit_tests_file) else: diff --git a/run-ib-pr-matrix.sh b/run-ib-pr-matrix.sh index 75e6d86747a8..4789e53797b8 100755 --- a/run-ib-pr-matrix.sh +++ b/run-ib-pr-matrix.sh @@ -9,7 +9,7 @@ if [ "${CHECK_WORKFLOWS}" = "true" ] ; then send_jenkins_artifacts ${WORKSPACE}/workflows-${BUILD_ID}.log ${ARTIFACT_DIR}/workflows-${BUILD_ID}.log OPTS="" case "${TEST_FLAVOR}" in - gpu ) OPTS="-w gpu" ;; + cuda | rocm ) OPTS="-w gpu" ;; high_stats ) ;; nano ) OPTS="-w nano" ;; * ) ;; @@ -49,7 +49,7 @@ pushd "$WORKSPACE/matrix-results" CMD_OPTS="" if ${PRODUCTION_RELEASE} && cmsDriver.py --help | grep -q '\-\-maxmem_profile' ; then CMD_OPTS="--maxmem_profile" ; fi case "${TEST_FLAVOR}" in - gpu ) MATRIX_ARGS="-w gpu ${MATRIX_ARGS}" ;; + cuda | rocm ) MATRIX_ARGS="-w gpu ${MATRIX_ARGS}" ;; high_stats ) CMD_OPTS="-n 500" ; MATRIX_ARGS="-i all ${MATRIX_ARGS}" ;; threading ) MATRIX_ARGS="-i all -t 4 ${MATRIX_ARGS}" ; let NJOBS=(${NJOBS}/4)+1 ;; nano ) MATRIX_ARGS="-w nano -i all ${MATRIX_ARGS}" ;; From 6f71537b920a38424b6b4e9954789f892345a2a3 Mon Sep 17 00:00:00 2001 From: iarspider Date: Tue, 4 Feb 2025 16:11:28 +0100 Subject: [PATCH 02/21] Update test_multiple_prs.sh --- pr_testing/test_multiple_prs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index e90f0077558c..7c41fac6cd79 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -1,4 +1,4 @@ - #!/bin/bash -ex +#!/bin/bash -ex # This script will be called by Jenkins job 'ib-run-pr-tests' # and # 1) will merge multiple PRs for multiple repos From 52e4bbedaa2919fb11c883af7613ae3721c05469 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Fri, 31 Jan 2025 14:46:45 +0100 Subject: [PATCH 03/21] Kill stuck pr testing jobs --- parse_jenkins_builds.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/parse_jenkins_builds.py b/parse_jenkins_builds.py index 4d267eb04663..92d342deee6b 100755 --- a/parse_jenkins_builds.py +++ b/parse_jenkins_builds.py @@ -169,6 +169,27 @@ def grep(filename, pattern, verbose=False): payload["wait_time"] = current_time - queue_time payload["start_time"] = 0 + kill_index = 0 + + # Abort stuck rocm jobs + if job_name in ("ib-run-pr-unittests", "ib-run-pr-relvals") and reason.endswith("-offline") and (payload["wait_time"] / 1000 / 60 > 60): + params = element["params"].strip().split("\n") + main_params = "" + other_params = [] + for _ in params: + k, v = _.split("=") + if k == "PULL_REQUEST": + main_params = _ + else: + other_params.append(_) + + with open("abort-{0}.prop".format(kill_index), "w") as f: + f.write("JENKINS_PROJECT_TO_KILL={0}\n".format(job_name)) + f.write("JENKINS_PROJECT_PARAMS={0}\n".format(main_params)) + f.write("EXTRA_PARAMS={0}\n".format(";".join(other_params))) + + kill_index += 1 + unique_id = ( JENKINS_PREFIX + ":/build/builds/" + job_name + "/" + str(queue_id) ) # Not a real path From 2afa23f89bb1007a31398f74888a2257e7e3bbaf Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Tue, 4 Feb 2025 16:49:12 +0100 Subject: [PATCH 04/21] Black --- parse_jenkins_builds.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parse_jenkins_builds.py b/parse_jenkins_builds.py index 92d342deee6b..9a20986ae9cd 100755 --- a/parse_jenkins_builds.py +++ b/parse_jenkins_builds.py @@ -172,12 +172,16 @@ def grep(filename, pattern, verbose=False): kill_index = 0 # Abort stuck rocm jobs - if job_name in ("ib-run-pr-unittests", "ib-run-pr-relvals") and reason.endswith("-offline") and (payload["wait_time"] / 1000 / 60 > 60): + if ( + job_name in ("ib-run-pr-unittests", "ib-run-pr-relvals") + and reason.endswith("-offline") + and (payload["wait_time"] / 1000 / 60 > 60) + ): params = element["params"].strip().split("\n") main_params = "" other_params = [] for _ in params: - k, v = _.split("=") + k, v = _.split("=") if k == "PULL_REQUEST": main_params = _ else: From 65617b63e6756b4fc239fcaf4fdaaf4e7703f632 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Tue, 4 Feb 2025 16:53:50 +0100 Subject: [PATCH 05/21] Only kill rocm jobs --- parse_jenkins_builds.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/parse_jenkins_builds.py b/parse_jenkins_builds.py index 9a20986ae9cd..ec57c14779de 100755 --- a/parse_jenkins_builds.py +++ b/parse_jenkins_builds.py @@ -187,12 +187,14 @@ def grep(filename, pattern, verbose=False): else: other_params.append(_) - with open("abort-{0}.prop".format(kill_index), "w") as f: - f.write("JENKINS_PROJECT_TO_KILL={0}\n".format(job_name)) - f.write("JENKINS_PROJECT_PARAMS={0}\n".format(main_params)) - f.write("EXTRA_PARAMS={0}\n".format(";".join(other_params))) - - kill_index += 1 + if "GPU_FLAVOR=rocm" in other_params or "TEST_FLAVOR=rocm" in other_params: + with open("abort-{0}.prop".format(kill_index), "w") as f: + f.write("JENKINS_PROJECT_TO_KILL={0}\n".format(job_name)) + f.write("JENKINS_PROJECT_PARAMS={0}\n".format(main_params)) + f.write("EXTRA_PARAMS={0}\n".format(";".join(other_params))) + + kill_index += 1 + # TODO: set commit status unique_id = ( JENKINS_PREFIX + ":/build/builds/" + job_name + "/" + str(queue_id) From 5cdc8af5150bf4bbeef26f978141654daa58469c Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Wed, 5 Feb 2025 09:09:00 +0100 Subject: [PATCH 06/21] Update commit status for killed jobs --- parse_jenkins_builds.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/parse_jenkins_builds.py b/parse_jenkins_builds.py index ec57c14779de..1db0cc3404ab 100755 --- a/parse_jenkins_builds.py +++ b/parse_jenkins_builds.py @@ -6,6 +6,13 @@ from es_utils import send_payload, get_payload, resend_payload, get_payload_wscroll from cmsutils import epoch2week +from github_utils import ( + api_rate_limits, + mark_commit_status, + get_combined_statuses, + get_pr_latest_commit, +) + JENKINS_PREFIX = "jenkins" try: JENKINS_PREFIX = os.environ["JENKINS_URL"].strip("/").split("/")[-1] @@ -173,18 +180,21 @@ def grep(filename, pattern, verbose=False): # Abort stuck rocm jobs if ( - job_name in ("ib-run-pr-unittests", "ib-run-pr-relvals") + job_name in ("ib-run-pr-unittests", "ib-run-pr-relvals", "ib-run-baseline") and reason.endswith("-offline") and (payload["wait_time"] / 1000 / 60 > 60) ): params = element["params"].strip().split("\n") main_params = "" other_params = [] + context = "" for _ in params: k, v = _.split("=") if k == "PULL_REQUEST": main_params = _ else: + if k == "CONTEXT_PREFIX": + context = v other_params.append(_) if "GPU_FLAVOR=rocm" in other_params or "TEST_FLAVOR=rocm" in other_params: @@ -194,7 +204,20 @@ def grep(filename, pattern, verbose=False): f.write("EXTRA_PARAMS={0}\n".format(";".join(other_params))) kill_index += 1 - # TODO: set commit status + + repository, pr = main_params.split("#", 1) + commit = get_pr_latest_commit(pr, repository) + mark_commit_status( + commit, + repository, + context, + "error", + "", + "Timed out waiting for ROCm node", + reset=False, + ) + + continue unique_id = ( JENKINS_PREFIX + ":/build/builds/" + job_name + "/" + str(queue_id) From aed9e8bc3a5ba1b07a6599031f583ca8e69b1785 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Wed, 5 Feb 2025 10:36:17 +0100 Subject: [PATCH 07/21] Changes from review --- parse_jenkins_builds.py | 41 ++++++++++++++++++++++++--------- pr_testing/test_multiple_prs.sh | 2 +- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/parse_jenkins_builds.py b/parse_jenkins_builds.py index 1db0cc3404ab..66251e77fe24 100755 --- a/parse_jenkins_builds.py +++ b/parse_jenkins_builds.py @@ -5,6 +5,7 @@ import subprocess from es_utils import send_payload, get_payload, resend_payload, get_payload_wscroll from cmsutils import epoch2week +import urllib.request from github_utils import ( api_rate_limits, @@ -188,13 +189,21 @@ def grep(filename, pattern, verbose=False): main_params = "" other_params = [] context = "" + upload_unique_id = "" + pull_request = "" + commit = "" + for _ in params: k, v = _.split("=") if k == "PULL_REQUEST": main_params = _ + pull_request = v else: if k == "CONTEXT_PREFIX": context = v + if k == "UPLOAD_UNIQ_ID": + upload_unique_id = v + other_params.append(_) if "GPU_FLAVOR=rocm" in other_params or "TEST_FLAVOR=rocm" in other_params: @@ -205,17 +214,27 @@ def grep(filename, pattern, verbose=False): kill_index += 1 - repository, pr = main_params.split("#", 1) - commit = get_pr_latest_commit(pr, repository) - mark_commit_status( - commit, - repository, - context, - "error", - "", - "Timed out waiting for ROCm node", - reset=False, - ) + repository, pr = pull_request.split("#", 1) + + if context: + with urllib.request.urlopen( + "http://localhost/SDT/jenkins-artifacts/pull-request-integration/{0}/prs_commits.txt".format( + upload_unique_id + ) + ) as f: + commits = f.read().decode("ascii", "ignore").splitlines() + + for _ in commits: + if _.startswith(pull_request): + commit = _.split("='", 1)[1] + break + + with open("commit-status-{0}.prop", "w") as f: + f.write("REPOSITORY={0}\n".format(repository)) + f.write("PULL_REQUEST={0}\n".format(commit)) + f.write("CONTEXT={0}\n".format(context)) + f.write("STATUS=error\n") + f.write("STATUS_MESSAGE=Timed out waiting for ROCm node\n") continue diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index 7c41fac6cd79..f67619e51493 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -1338,7 +1338,7 @@ if [ "X$BUILD_OK" = Xtrue -a "$RUN_TESTS" = "true" ]; then if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^GPU$' | wc -l) -gt 0 -a X"${DISABLE_GPU_TESTS}" != X"true" ] ; then DO_GPU_TESTS=true for GPU_T in ${ALL_GPU_TYPES[@]} ; do - mark_commit_status_all_prs 'unittests/${GPU_T}' 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start" + mark_commit_status_all_prs "unittests/${GPU_T}" 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start" done fi if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^HLT_P2_TIMING$' | wc -l) -gt 0 ] ; then From 8708cd529a7f29f04cd82a1f3bd7e661f4bc748a Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Fri, 7 Feb 2025 15:07:00 +0100 Subject: [PATCH 08/21] Fix --- pr_testing/test_multiple_prs.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index f67619e51493..af481ed4beef 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -1520,6 +1520,7 @@ fi if [ "X$DO_GPU_TESTS" = Xtrue ]; then for GPU_T in ${ALL_GPU_TYPES[@]}; do cp $WORKSPACE/test-env.txt $WORKSPACE/run-unittests-${GPU_T}.prop + echo "GPU_FLAVOR=${GPU_T}" >> $WORKSPACE/run-unittests-${GPU_T}.prop done fi @@ -1546,3 +1547,4 @@ if [ "${DO_HLT_P2_INTEGRATION}" = "true" ] ; then fi rm -f $WORKSPACE/test-env.txt + From 176cbffee8174c7c25413f15f82f3ae80307a0d6 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Wed, 12 Feb 2025 16:18:54 +0100 Subject: [PATCH 09/21] Revert changes to parse_jenkins_builds.py (split into #2440) --- parse_jenkins_builds.py | 69 ----------------------------------------- 1 file changed, 69 deletions(-) diff --git a/parse_jenkins_builds.py b/parse_jenkins_builds.py index 66251e77fe24..4d267eb04663 100755 --- a/parse_jenkins_builds.py +++ b/parse_jenkins_builds.py @@ -5,14 +5,6 @@ import subprocess from es_utils import send_payload, get_payload, resend_payload, get_payload_wscroll from cmsutils import epoch2week -import urllib.request - -from github_utils import ( - api_rate_limits, - mark_commit_status, - get_combined_statuses, - get_pr_latest_commit, -) JENKINS_PREFIX = "jenkins" try: @@ -177,67 +169,6 @@ def grep(filename, pattern, verbose=False): payload["wait_time"] = current_time - queue_time payload["start_time"] = 0 - kill_index = 0 - - # Abort stuck rocm jobs - if ( - job_name in ("ib-run-pr-unittests", "ib-run-pr-relvals", "ib-run-baseline") - and reason.endswith("-offline") - and (payload["wait_time"] / 1000 / 60 > 60) - ): - params = element["params"].strip().split("\n") - main_params = "" - other_params = [] - context = "" - upload_unique_id = "" - pull_request = "" - commit = "" - - for _ in params: - k, v = _.split("=") - if k == "PULL_REQUEST": - main_params = _ - pull_request = v - else: - if k == "CONTEXT_PREFIX": - context = v - if k == "UPLOAD_UNIQ_ID": - upload_unique_id = v - - other_params.append(_) - - if "GPU_FLAVOR=rocm" in other_params or "TEST_FLAVOR=rocm" in other_params: - with open("abort-{0}.prop".format(kill_index), "w") as f: - f.write("JENKINS_PROJECT_TO_KILL={0}\n".format(job_name)) - f.write("JENKINS_PROJECT_PARAMS={0}\n".format(main_params)) - f.write("EXTRA_PARAMS={0}\n".format(";".join(other_params))) - - kill_index += 1 - - repository, pr = pull_request.split("#", 1) - - if context: - with urllib.request.urlopen( - "http://localhost/SDT/jenkins-artifacts/pull-request-integration/{0}/prs_commits.txt".format( - upload_unique_id - ) - ) as f: - commits = f.read().decode("ascii", "ignore").splitlines() - - for _ in commits: - if _.startswith(pull_request): - commit = _.split("='", 1)[1] - break - - with open("commit-status-{0}.prop", "w") as f: - f.write("REPOSITORY={0}\n".format(repository)) - f.write("PULL_REQUEST={0}\n".format(commit)) - f.write("CONTEXT={0}\n".format(context)) - f.write("STATUS=error\n") - f.write("STATUS_MESSAGE=Timed out waiting for ROCm node\n") - - continue - unique_id = ( JENKINS_PREFIX + ":/build/builds/" + job_name + "/" + str(queue_id) ) # Not a real path From ddce8533e43d3b29bde6994d9fe2b81b79dad94b Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Mon, 24 Feb 2025 10:10:14 +0100 Subject: [PATCH 10/21] Support selecting which GPU flavors to run on --- gpu_flavors.txt | 2 ++ pr_testing/run-pr-relvals.sh | 21 +++++++++++++++++++-- pr_testing/test_multiple_prs.sh | 10 +++++----- process_pr.py | 24 +++++++++++++++++++++--- 4 files changed, 47 insertions(+), 10 deletions(-) create mode 100644 gpu_flavors.txt diff --git a/gpu_flavors.txt b/gpu_flavors.txt new file mode 100644 index 000000000000..8af5d069d204 --- /dev/null +++ b/gpu_flavors.txt @@ -0,0 +1,2 @@ +cuda +rocm \ No newline at end of file diff --git a/pr_testing/run-pr-relvals.sh b/pr_testing/run-pr-relvals.sh index 83040c7dd0a3..1377f18350dc 100755 --- a/pr_testing/run-pr-relvals.sh +++ b/pr_testing/run-pr-relvals.sh @@ -1,4 +1,19 @@ #!/bin/bash -ex +function is_in_array() { + local value="$1" + shift + local array=("$@") + + for item in "${array[@]}"; do + if [[ "$item" == "$value" ]]; then + return 0 # Found match + fi + done + return 1 # No match +} + +readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt + source $(dirname $0)/setup-pr-test-env.sh GH_CONTEXT="relvals" GH_COMP_CONTEXT="comparison" @@ -17,10 +32,12 @@ echo "${MATRIX_ARGS}" | tr ';' '\n' | while IFS= read -r args; do if [ $(echo "${args}" | sed 's|.*-l ||;s| .*||' | tr ',' '\n' | grep '^all$' | wc -l) -gt 0 ] ; then OPTS="" case "${TEST_FLAVOR}" in - cuda | rocm ) OPTS="-w gpu" ;; high_stats ) ;; nano ) OPTS="-w nano" ;; - * ) ;; + * ) if is_in_array "${TEST_FLAVOR}" "${ALL_GPU_TYPES[@]}" ; then + OPTS="-w gpu" + fi + ;; esac ALL_WFS=$(runTheMatrix.py -n ${OPTS} ${args} | grep -v ' workflows ' | grep '^[1-9][0-9]*\(.[0-9][0-9]*\|\)\s' | sed 's| .*||' | tr '\n' ',' | sed 's|,$||') args=$(echo "${args}" | sed "s|all|${ALL_WFS}|") diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index af481ed4beef..ad4c639aeaf4 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -162,7 +162,6 @@ if [ $(echo "${CONFIG_LINE}" | grep "PROD_ARCH=1" | wc -l) -gt 0 ] ; then fi fi fi -ALL_GPU_TYPES=("cuda" "rocm") # ---------- # -- MAIN -- @@ -382,7 +381,7 @@ if $DO_COMPARISON ; then echo "WORKFLOWS=-l ${WF_LIST}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc} echo "MATRIX_ARGS=${WF_ARGS}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc} if [ X"${ex_type_lc}" = X"gpu" ]; then - for GPU_T in ${ALL_GPU_TYPES[@]}; do + for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do cp run-baseline-${BUILD_ID}-01.${ex_type_lc} run-baseline-${BUILD_ID}-01.${GPU_T} sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-01.${GPU_T} @@ -1337,7 +1336,7 @@ if [ "X$BUILD_OK" = Xtrue -a "$RUN_TESTS" = "true" ]; then fi if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^GPU$' | wc -l) -gt 0 -a X"${DISABLE_GPU_TESTS}" != X"true" ] ; then DO_GPU_TESTS=true - for GPU_T in ${ALL_GPU_TYPES[@]} ; do + for GPU_T in ${ENABLE_GPU_FLAVORS[@]} ; do mark_commit_status_all_prs "unittests/${GPU_T}" 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start" done fi @@ -1482,8 +1481,9 @@ if [ "X$DO_SHORT_MATRIX" = Xtrue ]; then grep -v '^MATRIX_ARGS=' $WORKSPACE/run-relvals.prop > $WORKSPACE/run-relvals-${ex_type_lc}.prop echo "MATRIX_ARGS=$(get_pr_relval_args $DO_COMPARISON _${ex_type})" >> $WORKSPACE/run-relvals-${ex_type_lc}.prop if [ "${ex_type_lc}" = "gpu" ]; then - for GPU_T in ${ALL_GPU_TYPES[@]}; do + for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do cp $WORKSPACE/run-relvals-${ex_type_lc}.prop $WORKSPACE/run-relvals-${GPU_T}.prop + echo "GPU_FLAVOR=${GPU_T}" >> $WORKSPACE/run-relvals-${GPU_T}.prop done rm $WORKSPACE/run-relvals-${ex_type_lc}.prop fi @@ -1518,7 +1518,7 @@ if [ "X$DO_ADDON_TESTS" = Xtrue ]; then fi if [ "X$DO_GPU_TESTS" = Xtrue ]; then - for GPU_T in ${ALL_GPU_TYPES[@]}; do + for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do cp $WORKSPACE/test-env.txt $WORKSPACE/run-unittests-${GPU_T}.prop echo "GPU_FLAVOR=${GPU_T}" >> $WORKSPACE/run-unittests-${GPU_T}.prop done diff --git a/process_pr.py b/process_pr.py index 87afb240712c..e426fbea80a1 100644 --- a/process_pr.py +++ b/process_pr.py @@ -143,7 +143,8 @@ def format(s, **kwds): REGEX_IGNORE_FILE_COUNT = r"\+file-count" TEST_WAIT_GAP = 720 ALL_CHECK_FUNCTIONS = None -EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] +GPU_FLAVORS = open(join(dirname(__file__), "gpu_flavors.txt"), "r").read().splitlines() +EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] # + GPU_FLAVORS EXTRA_RELVALS_TESTS_OPTS = "_" + "|_".join(EXTRA_RELVALS_TESTS) EXTRA_TESTS = ( "|".join(EXTRA_RELVALS_TESTS) @@ -168,7 +169,7 @@ def format(s, **kwds): "disable_poison": ["true|false", "DISABLE_POISON"], "use_ib_tag": ["true|false", "USE_IB_TAG"], "baseline": ["self|default", "USE_BASELINE"], - "set_env": ["[A-Z][A-Z0-9_]+(\s*,\s*[A-Z][A-Z0-9_]+|)*", "CMSBOT_SET_ENV"], + "set_env": [r"[A-Z][A-Z0-9_]+(\s*,\s*[A-Z][A-Z0-9_]+|)*", "CMSBOT_SET_ENV"], "skip_test(s|)": [format(r"(%(tests)s)(\s*,\s*(%(tests)s))*", tests=SKIP_TESTS), "SKIP_TESTS"], "dry_run": ["true|false", "DRY_RUN"], "jenkins_(slave|node)": [JENKINS_NODES, "RUN_ON_SLAVE"], @@ -608,6 +609,7 @@ def check_ignore_bot_tests(first_line, *args): def check_enable_bot_tests(first_line, *args): tests = first_line.upper().replace(" ", "") + enable_gpus = [] if "NONE" in tests: tests = "NONE" return tests, None @@ -967,6 +969,7 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F ok_too_many_files = False warned_too_many_files = False is_draft_pr = False + enabled_gpu_flavors = [] if issue.pull_request: pr = repo.get_pull(prId) @@ -1315,7 +1318,7 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F elif re.match(REGEX_EX_ENABLE_TESTS, first_line, re.I): comment_emoji = "-1" if valid_commenter: - enable_tests, ignore = check_enable_bot_tests(first_line.split(" ", 1)[-1]) + enable_tests, _ = check_enable_bot_tests(first_line.split(" ", 1)[-1]) comment_emoji = "+1" elif re.match(r"^allow\s+@([^ ]+)\s+test\s+rights$", first_line, re.I): comment_emoji = "-1" @@ -1375,6 +1378,7 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F first_line, comment_lines, repository ) if test_params_m: + # Error in parameters test_params_msg = str(comment.id) + ":" + test_params_m test_params_comment = comment continue @@ -1589,6 +1593,20 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F set_comment_emoji_cache(dryRun, bot_cache, comment, repository) # end of parsing comments section + + # Extract enabled GPU flavors + for gpu_lc in GPU_FLAVORS: + gpu_uc = gpu_lc.upper() + if gpu_uc in enable_tests or gpu_uc in global_test_params["ENABLE_BOT_TESTS"]: + enabled_gpu_flavors.append(gpu_lc) + + if not enabled_gpu_flavors and ( + "GPU" in enable_tests or "GPU" in global_test_params["ENABLE_BOT_TESTS"] + ): + enabled_gpu_flavors = GPU_FLAVORS + + global_test_params["ENABLE_GPU_FLAVORS"] = " ".join(enabled_gpu_flavors) + # Check if it needs to be automatically closed. if mustClose: if issue.state == "open": From 2418fe59b6591ca934313d1c2e22f8185541fcc1 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Mon, 24 Feb 2025 10:18:08 +0100 Subject: [PATCH 11/21] Fix tests; don't set ENABLE_GPU_FLAVORS if gpu is not enabled --- process_pr.py | 7 ++++--- tests/test-requirements.txt | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/process_pr.py b/process_pr.py index e426fbea80a1..8b32ddf07de7 100644 --- a/process_pr.py +++ b/process_pr.py @@ -1597,15 +1597,16 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F # Extract enabled GPU flavors for gpu_lc in GPU_FLAVORS: gpu_uc = gpu_lc.upper() - if gpu_uc in enable_tests or gpu_uc in global_test_params["ENABLE_BOT_TESTS"]: + if gpu_uc in enable_tests or gpu_uc in global_test_params.get("ENABLE_BOT_TESTS", ""): enabled_gpu_flavors.append(gpu_lc) if not enabled_gpu_flavors and ( - "GPU" in enable_tests or "GPU" in global_test_params["ENABLE_BOT_TESTS"] + "GPU" in enable_tests or "GPU" in global_test_params.get("ENABLE_BOT_TESTS", "") ): enabled_gpu_flavors = GPU_FLAVORS - global_test_params["ENABLE_GPU_FLAVORS"] = " ".join(enabled_gpu_flavors) + if enabled_gpu_flavors: + global_test_params["ENABLE_GPU_FLAVORS"] = " ".join(enabled_gpu_flavors) # Check if it needs to be automatically closed. if mustClose: diff --git a/tests/test-requirements.txt b/tests/test-requirements.txt index 7b208eb2d000..329a3c9f9777 100644 --- a/tests/test-requirements.txt +++ b/tests/test-requirements.txt @@ -4,3 +4,4 @@ pytest>=5.3 pytest-cov>=2.8 PyGithub==1.56 pyyaml +urllib3<2.3.0 From 63a4b3d44e943d0da5bf25a2063894bfc3975ca6 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Mon, 24 Feb 2025 11:16:42 +0100 Subject: [PATCH 12/21] Fix for missing CMS_BOT_DIR --- pr_testing/run-pr-relvals.sh | 2 +- pr_testing/test_multiple_prs.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pr_testing/run-pr-relvals.sh b/pr_testing/run-pr-relvals.sh index 1377f18350dc..40e6ff2fdbb2 100755 --- a/pr_testing/run-pr-relvals.sh +++ b/pr_testing/run-pr-relvals.sh @@ -12,9 +12,9 @@ function is_in_array() { return 1 # No match } +source $(dirname $0)/setup-pr-test-env.sh readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt -source $(dirname $0)/setup-pr-test-env.sh GH_CONTEXT="relvals" GH_COMP_CONTEXT="comparison" UC_TEST_FLAVOR=$(echo ${TEST_FLAVOR} | tr '[a-z]' '[A-Z]') diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index ad4c639aeaf4..924401fdeada 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -382,13 +382,13 @@ if $DO_COMPARISON ; then echo "MATRIX_ARGS=${WF_ARGS}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc} if [ X"${ex_type_lc}" = X"gpu" ]; then for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do - cp run-baseline-${BUILD_ID}-01.${ex_type_lc} run-baseline-${BUILD_ID}-01.${GPU_T} + cp run-baseline-${BUILD_ID}-01.gpu run-baseline-${BUILD_ID}-01.${GPU_T} sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-01.${GPU_T} - cp run-baseline-${BUILD_ID}-02.${ex_type_lc} run-baseline-${BUILD_ID}-02.${GPU_T} + cp run-baseline-${BUILD_ID}-02.gpu run-baseline-${BUILD_ID}-02.${GPU_T} sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-02.${GPU_T} done - rm run-baseline-${BUILD_ID}-01.${ex_type_lc} run-baseline-${BUILD_ID}-02.${ex_type_lc} + rm run-baseline-${BUILD_ID}-01.gpu run-baseline-${BUILD_ID}-02.gpu fi done popd From f0974fa033f57e346523a0c661d5cbf3ac3ff84a Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Mon, 24 Feb 2025 13:27:27 +0100 Subject: [PATCH 13/21] Pass ENABLE_GPU_FLAVORS from ib-schedule-pr-tests to ib-run-pr-tests --- pr-schedule-tests | 1 + 1 file changed, 1 insertion(+) diff --git a/pr-schedule-tests b/pr-schedule-tests index 45d081f9a8cb..335f864805e6 100755 --- a/pr-schedule-tests +++ b/pr-schedule-tests @@ -118,6 +118,7 @@ if [ $(echo $CONFIG_LINE | tr ';' '\n' | grep SCRAM_ARCH= | wc -l) -eq 1 ] ; the echo "CONTEXT_PREFIX=${CONTEXT_PREFIX}" >> $OUTPUT_FILE echo "PROFILING_WORKFLOWS=${PROFILING_WORKFLOWS}" >> $OUTPUT_FILE echo "BUILD_VERBOSE=${BUILD_VERBOSE}" >> $OUTPUT_FILE + echo "ENABLE_GPU_FLAVORS=${ENABLE_GPU_FLAVORS}" >> $OUTPUT_FILE CMSSW_QUEUE="${RELEASE_QUEUE}" mark_commit_status_all_prs "${PR_COMMIT_STATUS}" 'pending' -d "Tests scheduled ${RELEASE_FORMAT}/${SCRAM_ARCH}" -u 'https://cmssdt.cern.ch/jenkins/job/ib-run-pr-tests/' || true COMMIT_STATUS_CONTEXT="unknown/release" From 5b59afef8a015040dbcb354f1f932a360f7fa451 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Tue, 25 Feb 2025 15:25:38 +0100 Subject: [PATCH 14/21] Avoid running maxmem profiling for ROCm RelVals --- pr_testing/test_multiple_prs.sh | 5 +++++ run-ib-pr-matrix.sh | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index 924401fdeada..cac8b03cfb0e 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -1484,6 +1484,11 @@ if [ "X$DO_SHORT_MATRIX" = Xtrue ]; then for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do cp $WORKSPACE/run-relvals-${ex_type_lc}.prop $WORKSPACE/run-relvals-${GPU_T}.prop echo "GPU_FLAVOR=${GPU_T}" >> $WORKSPACE/run-relvals-${GPU_T}.prop + if [ "$GPU_T" = "rocm" ]; then + grep -v "^RUN_THE_MATRIX_CMD_OPTS" $WORKSPACE/run-relvals-${GPU_T}.prop > $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.tmp + echo "RUN_THE_MATRIX_CMD_OPTS=${EXTRA_MATRIX_COMMAND_ARGS}" >> $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.tmp + mv $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.tmp $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.prop + fi done rm $WORKSPACE/run-relvals-${ex_type_lc}.prop fi diff --git a/run-ib-pr-matrix.sh b/run-ib-pr-matrix.sh index 4789e53797b8..e39286c75db5 100755 --- a/run-ib-pr-matrix.sh +++ b/run-ib-pr-matrix.sh @@ -47,7 +47,11 @@ UC_TEST_FLAVOR=$(echo ${TEST_FLAVOR} | tr '[a-z]' '[A-Z]') pushd "$WORKSPACE/matrix-results" NJOBS=$(nproc) CMD_OPTS="" - if ${PRODUCTION_RELEASE} && cmsDriver.py --help | grep -q '\-\-maxmem_profile' ; then CMD_OPTS="--maxmem_profile" ; fi + if ${PRODUCTION_RELEASE} && cmsDriver.py --help | grep -q '\-\-maxmem_profile' ; then + if [ "TEST_FLAVOR" != "rocm" ]; then + CMD_OPTS="--maxmem_profile" + fi + fi case "${TEST_FLAVOR}" in cuda | rocm ) MATRIX_ARGS="-w gpu ${MATRIX_ARGS}" ;; high_stats ) CMD_OPTS="-n 500" ; MATRIX_ARGS="-i all ${MATRIX_ARGS}" ;; From 2ddc6aebe0b2d8380d0d0feb6dd618af7c1e37f2 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Tue, 25 Feb 2025 15:39:31 +0100 Subject: [PATCH 15/21] Fix --- process_pr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/process_pr.py b/process_pr.py index 8b32ddf07de7..2bb9212101fe 100644 --- a/process_pr.py +++ b/process_pr.py @@ -144,7 +144,7 @@ def format(s, **kwds): TEST_WAIT_GAP = 720 ALL_CHECK_FUNCTIONS = None GPU_FLAVORS = open(join(dirname(__file__), "gpu_flavors.txt"), "r").read().splitlines() -EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] # + GPU_FLAVORS +EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] + GPU_FLAVORS EXTRA_RELVALS_TESTS_OPTS = "_" + "|_".join(EXTRA_RELVALS_TESTS) EXTRA_TESTS = ( "|".join(EXTRA_RELVALS_TESTS) @@ -609,7 +609,6 @@ def check_ignore_bot_tests(first_line, *args): def check_enable_bot_tests(first_line, *args): tests = first_line.upper().replace(" ", "") - enable_gpus = [] if "NONE" in tests: tests = "NONE" return tests, None From e7c5ac837ade396f04f81e370def462d67aad49b Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Wed, 26 Feb 2025 09:37:44 +0100 Subject: [PATCH 16/21] Always add 'GPU' to ENABLE_BOT_TESTS; remove gpu types from ENABLE_BOT_TESTS --- process_pr.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/process_pr.py b/process_pr.py index 2bb9212101fe..72feffdd8a24 100644 --- a/process_pr.py +++ b/process_pr.py @@ -143,12 +143,13 @@ def format(s, **kwds): REGEX_IGNORE_FILE_COUNT = r"\+file-count" TEST_WAIT_GAP = 720 ALL_CHECK_FUNCTIONS = None -GPU_FLAVORS = open(join(dirname(__file__), "gpu_flavors.txt"), "r").read().splitlines() -EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] + GPU_FLAVORS +ALL_GPU_FLAVORS = open(join(dirname(__file__), "gpu_flavors.txt"), "r").read().splitlines() +EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] EXTRA_RELVALS_TESTS_OPTS = "_" + "|_".join(EXTRA_RELVALS_TESTS) EXTRA_TESTS = ( "|".join(EXTRA_RELVALS_TESTS) - + "|hlt_p2_integration|hlt_p2_timing|profiling|none|multi-microarchs" + + "|hlt_p2_integration|hlt_p2_timing|profiling|none|multi-microarchs|" + + "|".join(ALL_GPU_FLAVORS) ) SKIP_TESTS = "|".join(["static", "header"]) ENABLE_TEST_PTRN = "enable(_test(s|)|)" @@ -1593,8 +1594,8 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F # end of parsing comments section - # Extract enabled GPU flavors - for gpu_lc in GPU_FLAVORS: + # Extract enabled GPU flavors and remove them from global_test_params and enable_tests + for gpu_lc in ALL_GPU_FLAVORS: gpu_uc = gpu_lc.upper() if gpu_uc in enable_tests or gpu_uc in global_test_params.get("ENABLE_BOT_TESTS", ""): enabled_gpu_flavors.append(gpu_lc) @@ -1602,10 +1603,23 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F if not enabled_gpu_flavors and ( "GPU" in enable_tests or "GPU" in global_test_params.get("ENABLE_BOT_TESTS", "") ): - enabled_gpu_flavors = GPU_FLAVORS + enabled_gpu_flavors = ALL_GPU_FLAVORS if enabled_gpu_flavors: global_test_params["ENABLE_GPU_FLAVORS"] = " ".join(enabled_gpu_flavors) + for gpu_lc in enabled_gpu_flavors: + gpu_uc = gpu_lc.upper() + enable_tests = enable_tests.replace(gpu_uc, "") + if "ENABLE_BOT_TESTS" in global_test_params: + global_test_params["ENABLE_BOT_TESTS"] = global_test_params[ + "ENABLE_BOT_TESTS" + ].replace(gpu_uc, "") + + enable_tests = re.sub(r"\s+", " ", enable_tests) + if "ENABLE_BOT_TESTS" in global_test_params: + global_test_params["ENABLE_BOT_TESTS"] = re.sub( + r"\s+", " ", global_test_params["ENABLE_BOT_TESTS"] + ) # Check if it needs to be automatically closed. if mustClose: From 4c3133ef7f0e8eb4bbc9450e545da62de9cb7688 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Wed, 26 Feb 2025 16:12:57 +0100 Subject: [PATCH 17/21] Changes from review --- cmssw-pr-test-config | 6 +++ pr-schedule-tests | 1 - pr_testing/test_multiple_prs.sh | 40 ++++++++--------- process_pr.py | 43 ++++++------------- run-ib-pr-matrix.sh | 26 +++++++++-- .../TestProcessPr.test_abort.json | 2 +- .../TestProcessPr.test_draft_pr_ready.json | 2 +- ...estProcessPr.test_draft_pr_start_test.json | 2 +- .../TestProcessPr.test_run_test_params.json | 2 +- .../TestProcessPr.test_start_tests.json | 2 +- .../TestProcessPr.test_tests_passed.json | 2 +- 11 files changed, 65 insertions(+), 63 deletions(-) diff --git a/cmssw-pr-test-config b/cmssw-pr-test-config index f09c3c6cbb38..62ae60179326 100755 --- a/cmssw-pr-test-config +++ b/cmssw-pr-test-config @@ -20,6 +20,12 @@ elif [ "$CMSSW_VER" -ge 1300 ] ; then else PR_TEST_MATRIX_EXTRAS_GPU=11634.586,11634.587 fi +if [ X"$PR_TEST_MATRIX_EXTRAS_GPU" != X"" ]; then + for GPU_T in ${ALL_GPU_TYPES[@]} ; do + GPU_T_UC = $(echo ${GPU_T} | tr '[a-z]' '[A-Z]') + eval "PR_TEST_MATRIX_EXTRAS_${GPU_T_UC}=$PR_TEST_MATRIX_EXTRAS_GPU" + done +fi PR_TEST_MATRIX_EXTRAS_PROFILING=29834.21,13034.21 PR_TEST_MATRIX_EXTRAS_HIGH_STATS=35034.0 PR_TEST_MATRIX_EXTRAS_NANO=all diff --git a/pr-schedule-tests b/pr-schedule-tests index 335f864805e6..45d081f9a8cb 100755 --- a/pr-schedule-tests +++ b/pr-schedule-tests @@ -118,7 +118,6 @@ if [ $(echo $CONFIG_LINE | tr ';' '\n' | grep SCRAM_ARCH= | wc -l) -eq 1 ] ; the echo "CONTEXT_PREFIX=${CONTEXT_PREFIX}" >> $OUTPUT_FILE echo "PROFILING_WORKFLOWS=${PROFILING_WORKFLOWS}" >> $OUTPUT_FILE echo "BUILD_VERBOSE=${BUILD_VERBOSE}" >> $OUTPUT_FILE - echo "ENABLE_GPU_FLAVORS=${ENABLE_GPU_FLAVORS}" >> $OUTPUT_FILE CMSSW_QUEUE="${RELEASE_QUEUE}" mark_commit_status_all_prs "${PR_COMMIT_STATUS}" 'pending' -d "Tests scheduled ${RELEASE_FORMAT}/${SCRAM_ARCH}" -u 'https://cmssdt.cern.ch/jenkins/job/ib-run-pr-tests/' || true COMMIT_STATUS_CONTEXT="unknown/release" diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index cac8b03cfb0e..71ab75ae2960 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -79,6 +79,22 @@ function process_changed_files() { sort -u "$directlyChangedFiles" $WORKSPACE/indirectly-changed-files.txt > "$allChangedFiles" } +function is_in_array() { + local value="$1" + shift + local array=("$@") + + for item in "${array[@]}"; do + if [[ "$item" == "$value" ]]; then + return 0 # Found match + fi + done + return 1 # No match +} + +readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt +export ALL_GPU_TYPES +ALL_GPU_TYPES_UC=( $(echo ${ALL_GPU_TYPES[@]} | tr '[a-z]' '[A-Z]') ) # Constants echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} || true ls ${LD_LIBRARY_PATH} || true @@ -91,7 +107,7 @@ PR_TESTING_DIR=${CMS_BOT_DIR}/pr_testing COMMON=${CMS_BOT_DIR}/common CONFIG_MAP=$CMS_BOT_DIR/config.map [ "${USE_IB_TAG}" != "true" ] && export USE_IB_TAG=false -[ "${EXTRA_RELVALS_TESTS}" = "" ] && EXTRA_RELVALS_TESTS="GPU THREADING HIGH_STATS NANO" +[ "${EXTRA_RELVALS_TESTS}" = "" ] && EXTRA_RELVALS_TESTS="THREADING HIGH_STATS NANO ${ALL_GPU_TYPES_UC[@]}" EXTRA_RELVALS_TESTS=$(echo ${EXTRA_RELVALS_TESTS} | tr ' ' '\n' | grep -v THREADING | tr '\n' ' ') # --- # doc: Input variable @@ -380,16 +396,6 @@ if $DO_COMPARISON ; then grep -v '^\(WORKFLOWS\|MATRIX_ARGS\)=' run-baseline-${BUILD_ID}-01.${ex_type_lc} > run-baseline-${BUILD_ID}-02.${ex_type_lc} echo "WORKFLOWS=-l ${WF_LIST}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc} echo "MATRIX_ARGS=${WF_ARGS}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc} - if [ X"${ex_type_lc}" = X"gpu" ]; then - for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do - cp run-baseline-${BUILD_ID}-01.gpu run-baseline-${BUILD_ID}-01.${GPU_T} - sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-01.${GPU_T} - - cp run-baseline-${BUILD_ID}-02.gpu run-baseline-${BUILD_ID}-02.${GPU_T} - sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-02.${GPU_T} - done - rm run-baseline-${BUILD_ID}-01.gpu run-baseline-${BUILD_ID}-02.gpu - fi done popd send_jenkins_artifacts $WORKSPACE/ib-baseline-tests/ ib-baseline-tests/ @@ -1480,18 +1486,6 @@ if [ "X$DO_SHORT_MATRIX" = Xtrue ]; then ex_type_lc=$(echo ${ex_type} | tr '[A-Z]' '[a-z]') grep -v '^MATRIX_ARGS=' $WORKSPACE/run-relvals.prop > $WORKSPACE/run-relvals-${ex_type_lc}.prop echo "MATRIX_ARGS=$(get_pr_relval_args $DO_COMPARISON _${ex_type})" >> $WORKSPACE/run-relvals-${ex_type_lc}.prop - if [ "${ex_type_lc}" = "gpu" ]; then - for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do - cp $WORKSPACE/run-relvals-${ex_type_lc}.prop $WORKSPACE/run-relvals-${GPU_T}.prop - echo "GPU_FLAVOR=${GPU_T}" >> $WORKSPACE/run-relvals-${GPU_T}.prop - if [ "$GPU_T" = "rocm" ]; then - grep -v "^RUN_THE_MATRIX_CMD_OPTS" $WORKSPACE/run-relvals-${GPU_T}.prop > $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.tmp - echo "RUN_THE_MATRIX_CMD_OPTS=${EXTRA_MATRIX_COMMAND_ARGS}" >> $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.tmp - mv $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.tmp $WORKSPACE/$WORKSPACE/run-relvals-${GPU_T}.prop - fi - done - rm $WORKSPACE/run-relvals-${ex_type_lc}.prop - fi done if [ $(runTheMatrix.py --help | grep '^ *--maxSteps' | wc -l) -eq 0 ] ; then mark_commit_status_all_prs "relvals/input" 'success' -u "${BUILD_URL}" -d "Not ran, runTheMatrix does not support --maxSteps flag" -e diff --git a/process_pr.py b/process_pr.py index 72feffdd8a24..7f23d1e5ebf2 100644 --- a/process_pr.py +++ b/process_pr.py @@ -144,12 +144,11 @@ def format(s, **kwds): TEST_WAIT_GAP = 720 ALL_CHECK_FUNCTIONS = None ALL_GPU_FLAVORS = open(join(dirname(__file__), "gpu_flavors.txt"), "r").read().splitlines() -EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] +EXTRA_RELVALS_TESTS = ["threading", "gpu", "high-stats", "nano"] + ALL_GPU_FLAVORS EXTRA_RELVALS_TESTS_OPTS = "_" + "|_".join(EXTRA_RELVALS_TESTS) EXTRA_TESTS = ( "|".join(EXTRA_RELVALS_TESTS) - + "|hlt_p2_integration|hlt_p2_timing|profiling|none|multi-microarchs|" - + "|".join(ALL_GPU_FLAVORS) + + "|hlt_p2_integration|hlt_p2_timing|profiling|none|multi-microarchs" ) SKIP_TESTS = "|".join(["static", "header"]) ENABLE_TEST_PTRN = "enable(_test(s|)|)" @@ -969,7 +968,6 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F ok_too_many_files = False warned_too_many_files = False is_draft_pr = False - enabled_gpu_flavors = [] if issue.pull_request: pr = repo.get_pull(prId) @@ -1594,32 +1592,19 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F # end of parsing comments section - # Extract enabled GPU flavors and remove them from global_test_params and enable_tests - for gpu_lc in ALL_GPU_FLAVORS: - gpu_uc = gpu_lc.upper() - if gpu_uc in enable_tests or gpu_uc in global_test_params.get("ENABLE_BOT_TESTS", ""): - enabled_gpu_flavors.append(gpu_lc) + # Extract enabled GPU flavors and remove them from enable_tests + new_enable_tests = [] + enabled_gpu_flavors = set() + for test in enable_tests.split(): + if test == "GPU": + enabled_gpu_flavors.update([x.upper() for x in ALL_GPU_FLAVORS]) + elif test.lower() in ALL_GPU_FLAVORS: + enabled_gpu_flavors.add(test) + else: + new_enable_tests.append(test) - if not enabled_gpu_flavors and ( - "GPU" in enable_tests or "GPU" in global_test_params.get("ENABLE_BOT_TESTS", "") - ): - enabled_gpu_flavors = ALL_GPU_FLAVORS - - if enabled_gpu_flavors: - global_test_params["ENABLE_GPU_FLAVORS"] = " ".join(enabled_gpu_flavors) - for gpu_lc in enabled_gpu_flavors: - gpu_uc = gpu_lc.upper() - enable_tests = enable_tests.replace(gpu_uc, "") - if "ENABLE_BOT_TESTS" in global_test_params: - global_test_params["ENABLE_BOT_TESTS"] = global_test_params[ - "ENABLE_BOT_TESTS" - ].replace(gpu_uc, "") - - enable_tests = re.sub(r"\s+", " ", enable_tests) - if "ENABLE_BOT_TESTS" in global_test_params: - global_test_params["ENABLE_BOT_TESTS"] = re.sub( - r"\s+", " ", global_test_params["ENABLE_BOT_TESTS"] - ) + new_enable_tests.extend(list(enabled_gpu_flavors)) + enable_tests = new_enable_tests # Check if it needs to be automatically closed. if mustClose: diff --git a/run-ib-pr-matrix.sh b/run-ib-pr-matrix.sh index e39286c75db5..4379119dcfe5 100755 --- a/run-ib-pr-matrix.sh +++ b/run-ib-pr-matrix.sh @@ -1,6 +1,20 @@ #!/bin/sh -ex +function is_in_array() { + local value="$1" + shift + local array=("$@") + + for item in "${array[@]}"; do + if [[ "$item" == "$value" ]]; then + return 0 # Found match + fi + done + return 1 # No match +} + TEST_FLAVOR=$1 CMS_BOT_DIR=$(cd $(dirname $0) >/dev/null 2>&1; pwd -P) +readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt ARTIFACT_DIR="ib-baseline-tests/${RELEASE_FORMAT}/${ARCHITECTURE}/${REAL_ARCH}/matrix${TEST_FLAVOR}-results" source $CMS_BOT_DIR/jenkins-artifacts #Run on any machine to see which workflows should be run @@ -9,10 +23,12 @@ if [ "${CHECK_WORKFLOWS}" = "true" ] ; then send_jenkins_artifacts ${WORKSPACE}/workflows-${BUILD_ID}.log ${ARTIFACT_DIR}/workflows-${BUILD_ID}.log OPTS="" case "${TEST_FLAVOR}" in - cuda | rocm ) OPTS="-w gpu" ;; high_stats ) ;; nano ) OPTS="-w nano" ;; - * ) ;; + * ) if is_in_array "${TEST_FLAVOR}" "${ALL_GPU_TYPES[@]}" ; then + OPTS="-w gpu" + fi + ;; esac REL_WFS="" if has_jenkins_artifacts ${ARTIFACT_DIR} -d ; then @@ -53,12 +69,14 @@ pushd "$WORKSPACE/matrix-results" fi fi case "${TEST_FLAVOR}" in - cuda | rocm ) MATRIX_ARGS="-w gpu ${MATRIX_ARGS}" ;; high_stats ) CMD_OPTS="-n 500" ; MATRIX_ARGS="-i all ${MATRIX_ARGS}" ;; threading ) MATRIX_ARGS="-i all -t 4 ${MATRIX_ARGS}" ; let NJOBS=(${NJOBS}/4)+1 ;; nano ) MATRIX_ARGS="-w nano -i all ${MATRIX_ARGS}" ;; input ) MATRIX_ARGS="-i all --maxSteps=2 ${MATRIX_ARGS}" ; CMD_OPTS="-n 1 --prefix ${CMS_BOT_DIR}/pr_testing/retry-command.sh" ; export CMS_BOT_RETRY_COUNT=3 ;; - * ) ;; + * ) if is_in_array "${TEST_FLAVOR}" "${ALL_GPU_TYPES[@]}" ; then + MATRIX_ARGS="-w gpu ${MATRIX_ARGS}" + fi + ;; esac [ $(runTheMatrix.py --help | grep 'job-reports' | wc -l) -gt 0 ] && MATRIX_ARGS="--job-reports $MATRIX_ARGS" [ -f ${CMSSW_RELEASE_BASE}/src/Validation/Performance/python/TimeMemoryJobReport.py ] && CMD_OPTS="${CMD_OPTS} --customise Validation/Performance/TimeMemoryJobReport.customiseWithTimeMemoryJobReport" diff --git a/tests/PRActionData/TestProcessPr.test_abort.json b/tests/PRActionData/TestProcessPr.test_abort.json index 5a0c691d6979..da16f4850fbb 100644 --- a/tests/PRActionData/TestProcessPr.test_abort.json +++ b/tests/PRActionData/TestProcessPr.test_abort.json @@ -110,7 +110,7 @@ "SKIP_TESTS": "header,static", "PULL_REQUESTS": "iarspider-cmssw/cmssw#17 cms-sw/cms-bot#2134", "RELEASE_FORMAT": "CMSSW_14_1_CPP20_X", - "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO", + "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO CUDA ROCM", "CONTEXT_PREFIX": "cms/17" } } diff --git a/tests/PRActionData/TestProcessPr.test_draft_pr_ready.json b/tests/PRActionData/TestProcessPr.test_draft_pr_ready.json index d67a0f460bc1..03ff35b06662 100644 --- a/tests/PRActionData/TestProcessPr.test_draft_pr_ready.json +++ b/tests/PRActionData/TestProcessPr.test_draft_pr_ready.json @@ -70,7 +70,7 @@ "filename": "trigger-tests-iarspider-cmssw-cmssw-21.properties", "data": { "PULL_REQUESTS": "iarspider-cmssw/cmssw#21", - "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO", + "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO CUDA ROCM", "CONTEXT_PREFIX": "cms/21" } } diff --git a/tests/PRActionData/TestProcessPr.test_draft_pr_start_test.json b/tests/PRActionData/TestProcessPr.test_draft_pr_start_test.json index 3ca01ee22741..3821f343465f 100644 --- a/tests/PRActionData/TestProcessPr.test_draft_pr_start_test.json +++ b/tests/PRActionData/TestProcessPr.test_draft_pr_start_test.json @@ -70,7 +70,7 @@ "filename": "trigger-tests-iarspider-cmssw-cmssw-21.properties", "data": { "PULL_REQUESTS": "iarspider-cmssw/cmssw#21", - "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO", + "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO CUDA ROCM", "CONTEXT_PREFIX": "cms/21" } } diff --git a/tests/PRActionData/TestProcessPr.test_run_test_params.json b/tests/PRActionData/TestProcessPr.test_run_test_params.json index 3ebc060870c1..0439808cf2dd 100644 --- a/tests/PRActionData/TestProcessPr.test_run_test_params.json +++ b/tests/PRActionData/TestProcessPr.test_run_test_params.json @@ -123,7 +123,7 @@ "SKIP_TESTS": "header,static", "PULL_REQUESTS": "iarspider-cmssw/cmssw#17 cms-sw/cms-bot#2134", "RELEASE_FORMAT": "CMSSW_14_1_CPP20_X", - "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO", + "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO CUDA ROCM", "CONTEXT_PREFIX": "cms/17" } } diff --git a/tests/PRActionData/TestProcessPr.test_start_tests.json b/tests/PRActionData/TestProcessPr.test_start_tests.json index e0b8ba29ce7a..33bf9b6fa4c9 100644 --- a/tests/PRActionData/TestProcessPr.test_start_tests.json +++ b/tests/PRActionData/TestProcessPr.test_start_tests.json @@ -88,7 +88,7 @@ "filename": "trigger-tests-iarspider-cmssw-cmssw-17.properties", "data": { "PULL_REQUESTS": "iarspider-cmssw/cmssw#17", - "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO", + "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO CUDA ROCM", "CONTEXT_PREFIX": "cms/17" } } diff --git a/tests/PRActionData/TestProcessPr.test_tests_passed.json b/tests/PRActionData/TestProcessPr.test_tests_passed.json index f48c3585c1d6..50a8647aac0e 100644 --- a/tests/PRActionData/TestProcessPr.test_tests_passed.json +++ b/tests/PRActionData/TestProcessPr.test_tests_passed.json @@ -87,7 +87,7 @@ "filename": "trigger-tests-iarspider-cmssw-cmssw-17.properties", "data": { "PULL_REQUESTS": "iarspider-cmssw/cmssw#17", - "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO", + "EXTRA_RELVALS_TESTS": "THREADING GPU HIGH_STATS NANO CUDA ROCM", "CONTEXT_PREFIX": "cms/17" } } From e4f9cfbb6c840658f5f9f180460909d25d9912a5 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Thu, 27 Feb 2025 11:08:10 +0100 Subject: [PATCH 18/21] Fix unittest scheduling --- pr_testing/test_multiple_prs.sh | 18 +++++++++++++----- process_pr.py | 14 -------------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index 71ab75ae2960..c0ca12ad338b 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -94,7 +94,15 @@ function is_in_array() { readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt export ALL_GPU_TYPES -ALL_GPU_TYPES_UC=( $(echo ${ALL_GPU_TYPES[@]} | tr '[a-z]' '[A-Z]') ) + +declare -a ENABLE_GPU_FLAVORS +for ex_type in ${EXTRA_RELVALS_TESTS} ; do + [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep "^${ex_type}$" | wc -l) -gt 0 ] || continue + ex_type_lc=$(echo $ex_type | tr '[A-Z]' '[a-z]') + if is_in_array "$ex_type_lc" "${ALL_GPU_TYPES[@]}" ; then + ENABLE_GPU_FLAVORS+=( $ex_type ) + fi +done # Constants echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} || true ls ${LD_LIBRARY_PATH} || true @@ -107,7 +115,7 @@ PR_TESTING_DIR=${CMS_BOT_DIR}/pr_testing COMMON=${CMS_BOT_DIR}/common CONFIG_MAP=$CMS_BOT_DIR/config.map [ "${USE_IB_TAG}" != "true" ] && export USE_IB_TAG=false -[ "${EXTRA_RELVALS_TESTS}" = "" ] && EXTRA_RELVALS_TESTS="THREADING HIGH_STATS NANO ${ALL_GPU_TYPES_UC[@]}" +[ "${EXTRA_RELVALS_TESTS}" = "" ] && EXTRA_RELVALS_TESTS="THREADING HIGH_STATS NANO $(echo ${ALL_GPU_TYPES[@]} | tr '[a-z]' '[A-Z]')" EXTRA_RELVALS_TESTS=$(echo ${EXTRA_RELVALS_TESTS} | tr ' ' '\n' | grep -v THREADING | tr '\n' ' ') # --- # doc: Input variable @@ -383,6 +391,7 @@ if $DO_COMPARISON ; then fi for ex_type in ${EXTRA_RELVALS_TESTS} ; do + [ $ex_type = "GPU" ] && continue [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep "^${ex_type}$" | wc -l) -gt 0 ] || continue WF_LIST=$(get_pr_baseline_worklflow "_${ex_type}") [ "$WF_LIST" != "" ] || continue @@ -1342,9 +1351,6 @@ if [ "X$BUILD_OK" = Xtrue -a "$RUN_TESTS" = "true" ]; then fi if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^GPU$' | wc -l) -gt 0 -a X"${DISABLE_GPU_TESTS}" != X"true" ] ; then DO_GPU_TESTS=true - for GPU_T in ${ENABLE_GPU_FLAVORS[@]} ; do - mark_commit_status_all_prs "unittests/${GPU_T}" 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start" - done fi if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^HLT_P2_TIMING$' | wc -l) -gt 0 ] ; then if [ $(echo ${ARCHITECTURE} | grep "_amd64_" | wc -l) -gt 0 ] ; then @@ -1480,6 +1486,7 @@ if [ "X$DO_SHORT_MATRIX" = Xtrue ]; then fi if $PRODUCTION_RELEASE ; then for ex_type in ${EXTRA_RELVALS_TESTS} ; do + [ $ex_type = "GPU" ] && continue [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep "^${ex_type}$" | wc -l) -gt 0 ] || continue WF_LIST=$(get_pr_baseline_worklflow "_${ex_type}") [ "$WF_LIST" != "" ] || continue @@ -1520,6 +1527,7 @@ if [ "X$DO_GPU_TESTS" = Xtrue ]; then for GPU_T in ${ENABLE_GPU_FLAVORS[@]}; do cp $WORKSPACE/test-env.txt $WORKSPACE/run-unittests-${GPU_T}.prop echo "GPU_FLAVOR=${GPU_T}" >> $WORKSPACE/run-unittests-${GPU_T}.prop + mark_commit_status_all_prs "unittests/${GPU_T}" 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start" done fi diff --git a/process_pr.py b/process_pr.py index 7f23d1e5ebf2..78882515e99b 100644 --- a/process_pr.py +++ b/process_pr.py @@ -1592,20 +1592,6 @@ def process_pr(repo_config, gh, repo, issue, dryRun, cmsbuild_user=None, force=F # end of parsing comments section - # Extract enabled GPU flavors and remove them from enable_tests - new_enable_tests = [] - enabled_gpu_flavors = set() - for test in enable_tests.split(): - if test == "GPU": - enabled_gpu_flavors.update([x.upper() for x in ALL_GPU_FLAVORS]) - elif test.lower() in ALL_GPU_FLAVORS: - enabled_gpu_flavors.add(test) - else: - new_enable_tests.append(test) - - new_enable_tests.extend(list(enabled_gpu_flavors)) - enable_tests = new_enable_tests - # Check if it needs to be automatically closed. if mustClose: if issue.state == "open": From 1136d9f37f6f934a5802db506e12736a91e07e32 Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Thu, 27 Feb 2025 11:33:14 +0100 Subject: [PATCH 19/21] Set CMS_BOT_DIR earlier --- pr_testing/test_multiple_prs.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index c0ca12ad338b..54dbd8b5ed24 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -92,6 +92,9 @@ function is_in_array() { return 1 # No match } +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" # Absolute path to script +CMS_BOT_DIR=$(dirname ${SCRIPTPATH}) # To get CMS_BOT dir path + readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt export ALL_GPU_TYPES @@ -106,8 +109,6 @@ done # Constants echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} || true ls ${LD_LIBRARY_PATH} || true -SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" # Absolute path to script -CMS_BOT_DIR=$(dirname ${SCRIPTPATH}) # To get CMS_BOT dir path export SCRAM_PREFIX_PATH=${CMS_BOT_DIR}/das-utils source ${CMS_BOT_DIR}/cmsrep.sh CACHED=${WORKSPACE}/CACHED # Where cached PR metada etc are kept From 5793e8619daed429a36052268d5088053bba4c3f Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Thu, 27 Feb 2025 12:15:46 +0100 Subject: [PATCH 20/21] Populate ALL_GPY_TYPES inside cmssw-pr-test-config --- cmssw-pr-test-config | 3 ++- pr_testing/test_multiple_prs.sh | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmssw-pr-test-config b/cmssw-pr-test-config index 62ae60179326..72540cdfedf1 100755 --- a/cmssw-pr-test-config +++ b/cmssw-pr-test-config @@ -1,4 +1,5 @@ #!/bin/bash +readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt CMSSW_VER=$CMSSW_VERSION [ "${CMSSW_VER}" != "" ] || CMSSW_VER=${RELEASE_FORMAT} CMSSW_MAJOR=0 @@ -22,7 +23,7 @@ else fi if [ X"$PR_TEST_MATRIX_EXTRAS_GPU" != X"" ]; then for GPU_T in ${ALL_GPU_TYPES[@]} ; do - GPU_T_UC = $(echo ${GPU_T} | tr '[a-z]' '[A-Z]') + GPU_T_UC=$(echo ${GPU_T} | tr '[a-z]' '[A-Z]') eval "PR_TEST_MATRIX_EXTRAS_${GPU_T_UC}=$PR_TEST_MATRIX_EXTRAS_GPU" done fi diff --git a/pr_testing/test_multiple_prs.sh b/pr_testing/test_multiple_prs.sh index 54dbd8b5ed24..6c5bee702b75 100755 --- a/pr_testing/test_multiple_prs.sh +++ b/pr_testing/test_multiple_prs.sh @@ -96,7 +96,6 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" # Absolute path to script CMS_BOT_DIR=$(dirname ${SCRIPTPATH}) # To get CMS_BOT dir path readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt -export ALL_GPU_TYPES declare -a ENABLE_GPU_FLAVORS for ex_type in ${EXTRA_RELVALS_TESTS} ; do From 0dabd52ce0f7b1be70fd971d715fc9a155991fff Mon Sep 17 00:00:00 2001 From: Ivan Razumov Date: Thu, 27 Feb 2025 13:46:05 +0100 Subject: [PATCH 21/21] Fix --- cmssw-pr-test-config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmssw-pr-test-config b/cmssw-pr-test-config index 72540cdfedf1..ef0eb2e61810 100755 --- a/cmssw-pr-test-config +++ b/cmssw-pr-test-config @@ -1,5 +1,6 @@ #!/bin/bash -readarray -t ALL_GPU_TYPES < ${CMS_BOT_DIR}/gpu_flavors.txt +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" +readarray -t ALL_GPU_TYPES < ${SCRIPTPATH}/gpu_flavors.txt CMSSW_VER=$CMSSW_VERSION [ "${CMSSW_VER}" != "" ] || CMSSW_VER=${RELEASE_FORMAT} CMSSW_MAJOR=0