Skip to content

Commit 7c6316c

Browse files
restore error checking to workflow and tweak some jobs to fail, to test the features
1 parent a3d4733 commit 7c6316c

File tree

5 files changed

+98
-15
lines changed

5 files changed

+98
-15
lines changed

tests/rt.conf

+14-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
2+
3+
# FIXME: CHANGES TO THIS FILE SHOULD NOT BE MERGED TO DEVELOP
4+
5+
16
### RT.CONF FORMATTING ###
27
# COMPILE Line ( Items separated by a | )
38
# Item 1: COMPILE - This tells rt.conf the following information is to be used in setting up a compile job
@@ -147,6 +152,9 @@ RUN | rrfs_v1beta |
147152
RUN | rrfs_v1nssl | | baseline |
148153
RUN | rrfs_v1nssl_nohailnoccn | | baseline |
149154

155+
# This variant of rrfs_v1beta should always fail.
156+
RUN | rrfs_v1beta_failing | | baseline |
157+
150158
COMPILE | csawmg | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16_csawmg,FV3_GFS_v16_ras | - noaacloud | fv3 |
151159
RUN | control_csawmg | - noaacloud | baseline |
152160
RUN | control_ras | - noaacloud acorn | baseline |
@@ -158,7 +166,9 @@ RUN | control_csawmg | + hera hercules
158166
COMPILE | wam | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v17_p8 -D32BIT=ON -DMULTI_GASES=ON | - noaacloud | fv3 |
159167
RUN | control_wam | - noaacloud | baseline |
160168

161-
COMPILE | atm_faster_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v17_p8,FV3_GFS_v15_thompson_mynn_lam3km -D32BIT=ON -DFASTER=ON | | fv3 |
169+
# Removing -DFASTER=ON here ensures the results change while the tests still run. The workflow jobs should complete
170+
# for all three of these tests, but the results should change.
171+
COMPILE | atm_faster_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v17_p8,FV3_GFS_v15_thompson_mynn_lam3km -D32BIT=ON | | fv3 |
162172
RUN | control_p8_faster | - noaacloud | baseline |
163173
RUN | regional_control_faster | | baseline |
164174

@@ -235,7 +245,9 @@ COMPILE | rrfs_dyn64_phy32_debug | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_H
235245
RUN | rap_control_dyn64_phy32_debug | - noaacloud | baseline |
236246

237247
### HAFS tests ###
238-
COMPILE | hafsw | intel | -DAPP=HAFSW -DMOVING_NEST=ON -DCCPP_SUITES=FV3_HAFS_v1_gfdlmp_tedmf,FV3_HAFS_v1_gfdlmp_tedmf_nonsst,FV3_HAFS_v1_thompson_tedmf_gfdlsf,FV3_global_nest_v1 -D32BIT=ON | | fv3 |
248+
249+
# The --invalid-argument option ensures the compile job will fail. The workflow should not submit the test jobs for this compile job.
250+
COMPILE | hafsw | intel | -DAPP=HAFSW --invalid-argument -DMOVING_NEST=ON -DCCPP_SUITES=FV3_HAFS_v1_gfdlmp_tedmf,FV3_HAFS_v1_gfdlmp_tedmf_nonsst,FV3_HAFS_v1_thompson_tedmf_gfdlsf,FV3_global_nest_v1 -D32BIT=ON | | fv3 |
239251
RUN | hafs_regional_atm | | baseline |
240252
RUN | hafs_regional_atm_thompson_gfdlsf | | baseline |
241253
RUN | hafs_regional_atm_ocn | | baseline |

tests/rt_utils.sh

+2-6
Original file line numberDiff line numberDiff line change
@@ -373,10 +373,6 @@ check_results() {
373373

374374
if [[ ${test_status} = 'FAIL' ]]; then
375375
echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}"
376-
377-
if [[ ${ROCOTO} = true || ${ECFLOW} == true ]]; then
378-
exit 1
379-
fi
380376
fi
381377
}
382378

@@ -427,7 +423,7 @@ rocoto_create_compile_task() {
427423

428424
cat << EOF >> "${ROCOTO_XML}"
429425
<task name="compile_${COMPILE_ID}" maxtries="${ROCOTO_COMPILE_MAXTRIES:-3}">
430-
<command>&PATHRT;/run_compile.sh &PATHRT; &RUNDIR_ROOT; "${MAKE_OPT}" ${COMPILE_ID} 2>&amp;1 | tee &LOG;/compile_${COMPILE_ID}.log</command>
426+
<command>bash -c 'set -xe -o pipefail ; &PATHRT;/run_compile.sh &PATHRT; &RUNDIR_ROOT; "${MAKE_OPT}" ${COMPILE_ID} 2>&amp;1 | tee &LOG;/compile_${COMPILE_ID}.log'</command>
431427
<jobname>compile_${COMPILE_ID}</jobname>
432428
<account>${ACCNR}</account>
433429
<queue>${COMPILE_QUEUE}</queue>
@@ -471,7 +467,7 @@ rocoto_create_run_task() {
471467
cat << EOF >> "${ROCOTO_XML}"
472468
<task name="${TEST_ID}${RT_SUFFIX}" maxtries="${ROCOTO_TEST_MAXTRIES:-3}">
473469
<dependency> ${DEP_STRING} </dependency>
474-
<command>&PATHRT;/run_test.sh &PATHRT; &RUNDIR_ROOT; ${TEST_NAME} ${TEST_ID} ${COMPILE_ID} 2>&amp;1 | tee &LOG;/run_${TEST_ID}${RT_SUFFIX}.log </command>
470+
<command>bash -c 'set -xe -o pipefail ; &PATHRT;/run_test.sh &PATHRT; &RUNDIR_ROOT; ${TEST_NAME} ${TEST_ID} ${COMPILE_ID} 2>&amp;1 | tee &LOG;/run_${TEST_ID}${RT_SUFFIX}.log' </command>
475471
<jobname>${TEST_ID}${RT_SUFFIX}</jobname>
476472
<account>${ACCNR}</account>
477473
${ROCOTO_NODESIZE:+<nodesize>${ROCOTO_NODESIZE}</nodesize>}

tests/run_compile.sh

+8-1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@ elif [[ ${SCHEDULER} = 'slurm' ]]; then
7373
fi
7474
fi
7575

76+
# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP
77+
if [[ "${JOB_SHOULD_FAIL:-NO}" == YES ]] ; then
78+
echo "The job should abort now, with exit status 1." 1>&2
79+
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
80+
false
81+
fi
82+
7683
################################################################################
7784
# Submit compile job
7885
################################################################################
@@ -81,7 +88,7 @@ if [[ ${ROCOTO} = 'false' ]]; then
8188
submit_and_wait job_card
8289
else
8390
chmod u+x job_card
84-
( ./job_card 2>&1 1>&3 3>&- | tee err || true ) 3>&1 1>&2 | tee out
91+
( ./job_card 2>&1 1>&3 3>&- | tee err ) 3>&1 1>&2 | tee out
8592
# The above shell redirection copies stdout to "out" and stderr to "err"
8693
# while still sending them to stdout and stderr. It does this without
8794
# relying on bash-specific extensions or non-standard OS features.

tests/run_test.sh

+4-6
Original file line numberDiff line numberDiff line change
@@ -372,9 +372,9 @@ if [[ ${SCHEDULER} = 'none' ]]; then
372372

373373
ulimit -s unlimited
374374
if [[ ${CI_TEST} = 'true' ]]; then
375-
eval "${OMP_ENV}" mpiexec -n "${TASKS}" ./fv3.exe >out 2> >(tee err >&3 || true)
375+
eval "${OMP_ENV}" mpiexec -n "${TASKS}" ./fv3.exe >out 2> >(tee err >&3)
376376
else
377-
mpiexec -n "${TASKS}" ./fv3.exe >out 2> >(tee err >&3 || true)
377+
mpiexec -n "${TASKS}" ./fv3.exe >out 2> >(tee err >&3)
378378
fi
379379

380380
else
@@ -383,7 +383,7 @@ else
383383
submit_and_wait job_card
384384
else
385385
chmod u+x job_card
386-
( ./job_card 2>&1 1>&3 3>&- | tee err || true ) 3>&1 1>&2 | tee out
386+
( ./job_card 2>&1 1>&3 3>&- | tee err ) 3>&1 1>&2 | tee out
387387
# The above shell redirection copies stdout to "out" and stderr to "err"
388388
# while still sending them to stdout and stderr. It does this without
389389
# relying on bash-specific extensions or non-standard OS features.
@@ -392,9 +392,7 @@ else
392392
fi
393393
skip_check_results=${skip_check_results:-false}
394394
if [[ ${skip_check_results} = false ]]; then
395-
check_results || true
396-
# The above call will exit with an error on its own and does
397-
# not need to cause run_test to TRAP the failure and error out itself.
395+
check_results
398396
else
399397
{
400398
echo

tests/tests/rrfs_v1beta_failing

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
2+
3+
# FIXME: THIS FILE SHOULD NOT BE MERGED TO DEVELOP
4+
5+
6+
###############################################################################
7+
#
8+
# RRFS v1beta variant that always fails at runtime
9+
#
10+
###############################################################################
11+
12+
export TEST_DESCR="Variant of RRFS_v1beta that always fails at runtime"
13+
14+
export CNTL_DIR=rrfs_v1beta_failing
15+
16+
export LIST_FILES="sfcf000.nc \
17+
sfcf009.nc \
18+
sfcf012.nc \
19+
atmf000.nc \
20+
atmf009.nc \
21+
atmf012.nc \
22+
GFSFLX.GrbF00 \
23+
GFSFLX.GrbF09 \
24+
GFSFLX.GrbF12 \
25+
GFSPRS.GrbF00 \
26+
GFSPRS.GrbF09 \
27+
GFSPRS.GrbF12 \
28+
RESTART/20210323.060000.coupler.res \
29+
RESTART/20210323.060000.fv_core.res.nc \
30+
RESTART/20210323.060000.fv_core.res.tile1.nc \
31+
RESTART/20210323.060000.fv_core.res.tile2.nc \
32+
RESTART/20210323.060000.fv_core.res.tile3.nc \
33+
RESTART/20210323.060000.fv_core.res.tile4.nc \
34+
RESTART/20210323.060000.fv_core.res.tile5.nc \
35+
RESTART/20210323.060000.fv_core.res.tile6.nc \
36+
RESTART/20210323.060000.fv_srf_wnd.res.tile1.nc \
37+
RESTART/20210323.060000.fv_srf_wnd.res.tile2.nc \
38+
RESTART/20210323.060000.fv_srf_wnd.res.tile3.nc \
39+
RESTART/20210323.060000.fv_srf_wnd.res.tile4.nc \
40+
RESTART/20210323.060000.fv_srf_wnd.res.tile5.nc \
41+
RESTART/20210323.060000.fv_srf_wnd.res.tile6.nc \
42+
RESTART/20210323.060000.fv_tracer.res.tile1.nc \
43+
RESTART/20210323.060000.fv_tracer.res.tile2.nc \
44+
RESTART/20210323.060000.fv_tracer.res.tile3.nc \
45+
RESTART/20210323.060000.fv_tracer.res.tile4.nc \
46+
RESTART/20210323.060000.fv_tracer.res.tile5.nc \
47+
RESTART/20210323.060000.fv_tracer.res.tile6.nc \
48+
RESTART/20210323.060000.phy_data.tile1.nc \
49+
RESTART/20210323.060000.phy_data.tile2.nc \
50+
RESTART/20210323.060000.phy_data.tile3.nc \
51+
RESTART/20210323.060000.phy_data.tile4.nc \
52+
RESTART/20210323.060000.phy_data.tile5.nc \
53+
RESTART/20210323.060000.phy_data.tile6.nc \
54+
RESTART/20210323.060000.sfc_data.tile1.nc \
55+
RESTART/20210323.060000.sfc_data.tile2.nc \
56+
RESTART/20210323.060000.sfc_data.tile3.nc \
57+
RESTART/20210323.060000.sfc_data.tile4.nc \
58+
RESTART/20210323.060000.sfc_data.tile5.nc \
59+
RESTART/20210323.060000.sfc_data.tile6.nc"
60+
61+
export_rrfs_v1
62+
export RESTART_INTERVAL="6 -1"
63+
export OUTPUT_FH='0 09 12'
64+
65+
# A special flag that tells the job to fail at runtime.
66+
export JOB_SHOULD_FAIL=YES
67+
68+
if [[ " hera orion hercules jet " =~ " ${MACHINE_ID} " ]] ; then
69+
ZSTANDARD_LEVEL=5
70+
fi

0 commit comments

Comments
 (0)