-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathqscript.template
195 lines (193 loc) · 9.76 KB
/
qscript.template
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/bin/bash
#----------------------------------------------------------------------------
# Q U E U E S Y S T E M D I R E C T I V E S
#----------------------------------------------------------------------------
#PBS -N %jobtype%.%scenario%
#PBS -l walltime=%walltime%
#PBS -l nodes=%nnodes%:ppn=%ppn%
#PBS -q %queuename%
#PBS -A %account%
#PBS -o %advisdir%/%scenario%/%jobtype%.out
#PBS -V
#PBS -j oe
#PBS -m a
#PBS -M %notifyuser%
#SBATCH --job-name="%jobtype%.%scenario%"
#SBATCH --time=%walltime%
#SBATCH --ntasks-per-node=%ppn%
#SBATCH --ntasks=%totalcpu%
#SBATCH --nodes=%nnodes%
#SBATCH --partition=%queuename%
#SBATCH --reservation=%reservation%
#SBATCH --constraint=%constraint%
#SBATCH --account=%account%
#SBATCH --qos=%qos%
#SBATCH --output=%advisdir%/%scenario%/%jobtype%.out
#SBATCH --mail-type=FAIL,TIME_LIMIT
#SBATCH --mail-user=%notifyuser%
echo "------------------------------------------------------------------------"
#
#----------------------------------------------------------------------------
#  I N I T I A L I Z E   D I R E C T O R Y   A N D   F I L E   N A M E S
#----------------------------------------------------------------------------
# Names and paths used by the rest of this script. The tokens delimited by
# percent signs are template placeholders.
# NOTE(review): presumably they are substituted before the script is
# submitted -- confirm against the code that fills in this template.
THIS=%jobtype%.%queuesyslc%   # name of this script for use in log messages
SCRIPTDIR=%scriptdir%
SYSLOG=%syslog%
CYCLEDIR=%advisdir%
CYCLE=$(basename %advisdir%)
CYCLELOG=$CYCLEDIR/cycle.log
SCENARIO=%scenario%
SCENARIODIR=$CYCLEDIR/$SCENARIO
SCENARIOLOG=$SCENARIODIR/scenario.log
#
# Change to the scenario directory. Any error text from cd is passed through
# monitoring/timestamp.awk (level=ERROR) and appended to the system, cycle,
# and scenario logs.
if ! cd "$SCENARIODIR" 2> >(awk -v this="$THIS" -v level=ERROR -f "$SCRIPTDIR/monitoring/timestamp.awk" | tee -a "$SYSLOG" | tee -a "$CYCLELOG" | tee -a "$SCENARIOLOG" ); then
   # Everything below creates, links, and removes files relative to the
   # current directory, so continuing from the wrong directory is unsafe.
   # This message uses an ordinary pipeline (not a process substitution) so
   # that it has been written before the script exits. The scenario log is
   # left out because it lives in the directory that could not be entered.
   echo "Failed to change directory to '$SCENARIODIR'; exiting." | awk -v this="$THIS" -v level=ERROR -f "$SCRIPTDIR/monitoring/timestamp.awk" | tee -a "$SYSLOG" | tee -a "$CYCLELOG"
   exit 1
fi
#
#----------------------------------------------------------------------------
# L O G M E S S A G E S T O S T A R T T H E J O B
#----------------------------------------------------------------------------
# Announce the job start (working directory, job ID, submit directory, and
# submit host). The message is piped through monitoring/timestamp.awk with
# level=INFO and appended to the scenario, cycle, and system logs.
echo "Starting in $SCENARIODIR with %queuesys% Job ID ${%JOBID%}; %queuesys% submit directory ${%JOBDIR%}; and %queuesys% submit host ${%JOBHOST%}." 2>&1 | awk -v this=$THIS -v level=INFO -f $SCRIPTDIR/monitoring/timestamp.awk | tee --append $SCENARIOLOG | tee --append $CYCLELOG | tee --append $SYSLOG
# The remaining messages are not redirected, so they are written to this
# script's standard output rather than directly to scenario.log.
# NOTE(review): the trailing QUEUEONLY tag on the next line is presumably
# consumed by whatever fills in this template -- keep it on that same line.
echo "INFO: $THIS: %JOBNODES%: $%JOBNODES%" #QUEUEONLY
# Record the execution host and the search paths in effect for this job.
echo "INFO: $THIS: hostname: "$(hostname)
echo "INFO: $THIS: PATH : $PATH"
echo "INFO: $THIS: LD_LIBRARY_PATH : $LD_LIBRARY_PATH"
#
#----------------------------------------------------------------------------
# W R I T E J O B P R O P E R T I E S
#----------------------------------------------------------------------------
#
# If a build-info JSON file exists one level above the ADCIRC directory and
# has not already been copied here, copy it into the current directory and
# record its file name as a property. Errors from cp go to the system log.
# NOTE(review): this property is appended to run.properties, whereas every
# other property below is appended to $SCENARIO.run.properties -- confirm
# that the different file name is intended.
if [[ -e "%adcircdir%/../adcirc.bin.buildinfo.json" && ! -e "adcirc.bin.buildinfo.json" ]]; then
cp "%adcircdir%/../adcirc.bin.buildinfo.json" . 2>> $SYSLOG
echo "adcirc.file.metadata.build : adcirc.bin.buildinfo.json" >> run.properties
fi
# Append the job start time, the job ID, and queue-related settings as
# "key : value" lines to $SCENARIO.run.properties.
echo "time.hpc.job.%jobtype%.start : $(date +'%Y-%h-%d-T%H:%M:%S%z')" >> $SCENARIO.run.properties
echo "hpc.job.%jobtype%.jobid : ${%JOBID%}" >> $SCENARIO.run.properties
# NOTE(review): unlike the neighbouring lines, the node list entry below has
# no "$" in front of its placeholder, so it may record the substituted text
# itself rather than a variable's value -- verify in a generated script.
echo "hpc.job.%jobtype%.nodelist : ( %JOBNODES% )" >> $SCENARIO.run.properties #QUEUEONLY
echo "hpc.job.%jobtype%.hostname : $HOSTNAME" >> $SCENARIO.run.properties #QUEUEONLY
echo "hpc.job.%jobtype%.qnnodes : $%JOBNNODES%" >> $SCENARIO.run.properties #QUEUEONLY
echo "hpc.job.%jobtype%.qntasks-per-node : $%JOBNTASKSPERNODE%" >> $SCENARIO.run.properties #QUEUEONLY
echo "hpc.job.%jobtype%.qntasks : $%JOBNTASKS%" >> $SCENARIO.run.properties #QUEUEONLY
# Path of the job log; it matches the output file named in the queue
# directives at the top of this script.
echo "hpc.job.%jobtype%.joblog : %advisdir%/%scenario%/%jobtype%.out" >> $SCENARIO.run.properties #QUEUEONLY
#
#----------------------------------------------------------------------------
#                 L A Y E R   S P E C I F I C A T I O N
#----------------------------------------------------------------------------
#
# Build the ordered list of layers to process. The scenario itself is always
# a layer; a "wind10m" layer is placed in front of it when it was requested
# and the job type is not partmesh, prep20, or prep13.
jobtype="%jobtype%"
createWind10mLayer="%wind10mlayer%"
declare -a layers=( $SCENARIO )
case "$jobtype" in
   partmesh|prep20|prep13)
      # these job types never get the extra layer
      ;;
   *)
      if [[ $createWind10mLayer == "yes" ]]; then
         layers=( wind10m ${layers[@]} )
      fi
      ;;
esac
#
#----------------------------------------------------------------------------
#                    L O O P   O V E R   L A Y E R S
#----------------------------------------------------------------------------
#
# For every layer: link fort.15 to the layer's control file, (for non-prep
# jobs) unpack the subdomain fort.15 files and link the meteorological output
# files, run the command, check its exit status and the quality of its
# results, record the outcome in status files, and (for prepall/prep15 jobs)
# archive the subdomain fort.15 files under the layer's name.
for layer in "${layers[@]}"; do
   # -L is tested as well as -e so that a dangling symbolic link is also
   # removed; otherwise the ln below would fail because the name exists
   if [[ -e "fort.15" || -L "fort.15" ]]; then
      rm fort.15 2>> "$SCENARIOLOG"
   fi
   ln -s "${layer}.fort.15" fort.15 2>> "$SCENARIOLOG"
   # if this is not a prep job, untar the corresponding fort.15 files
   # and link to the proper meteorological output files
   if [[ ${jobtype:0:4} != "prep" && $jobtype != "partmesh" ]]; then
      # unpack subdomain fort.15 files for use in generating the layer;
      # only stderr is sent to the scenario log, the verbose file listing
      # stays on stdout
      tar xvf "${layer}.fort.15.tar" 2>> "$SCENARIOLOG"
      # link to empty meteorological output files created in prep phase
      for layerFile in "${layer}".fort.7* "${layer}".maxwvel* "${layer}".minpr*; do
         # a pattern without matches is left as literal text by the shell
         [[ -e $layerFile || -L $layerFile ]] || continue
         linkName=${layerFile#"$layer".}   # file name without the layer prefix
         if [[ -e $linkName || -L $linkName ]]; then
            rm "$linkName" 2>> "$SCENARIOLOG"
         fi
         ln -s "$layerFile" "$linkName" 2>> "$SCENARIOLOG"
      done
   fi
   #
   #  E X E C U T E   T H E   J O B
   #
   CMD="%cmd%"
   if [[ $layer == "wind10m" || $SCENARIO == *"Wind10m" ]]; then
      # the met only mode should not be run with the SWAN wave coupled executable
      CMD=$(echo "$CMD" | sed 's?adcswan ?adcirc ?')
   fi
   echo "cycle $CYCLE: $SCENARIO: $jobtype.$layer job ${%JOBID%} starting in $SCENARIODIR with the following command: $CMD" 2>&1 | awk -v level=INFO -v this="$THIS" -f "$SCRIPTDIR/monitoring/timestamp.awk" | tee --append "$SCENARIOLOG" | tee --append "$CYCLELOG" | tee --append "$SYSLOG"
   echo "\"start\" : \"$(date +'%Y-%h-%d-T%H:%M:%S%z')\", \"jobid\" : \"${%JOBID%}\"" > "$jobtype.${layer}.run.start" # <-OVERWRITE
   echo "\"jobtype\" : \"$jobtype.$layer\", \"submit\" : null, \"jobid\" : \"${%JOBID%}\", \"start\" : \"$(date +'%Y-%h-%d-T%H:%M:%S%z')\", \"finish\" : null, \"error\" : null" >> jobs.status
   # intentionally unquoted: the command string is split into words here
   $CMD
   ERROVALUE=$?  # capture exit status
   #
   #  C H E C K   S T A T U S   A N D   Q U A L I T Y   O F   R E S U L T S
   #
   ERROMSG=""
   RUNSUFFIX="finish"
   qualityCheck=0
   if [[ $ERROVALUE -eq 0 ]]; then
      if [[ ${jobtype:0:4} != "prep" && $jobtype != "partmesh" ]]; then
         # look for numerical instability errors in the stdout/stderr files
         for file in adcirc.log "$SCENARIOLOG"; do
            if [[ -e $file ]]; then
               numMsg=$(grep -c WarnElev "$file")   # number of matching lines
               if [[ $numMsg -eq 0 ]]; then
                  echo "cycle $CYCLE: $SCENARIO: No numerical instability detected in '$file' after completion of '$jobtype.$layer' job '${%JOBID%}'." 2>&1 | awk -v level=INFO -v this="$THIS" -f "$SCRIPTDIR/monitoring/timestamp.awk" | tee --append "$SCENARIOLOG"
               else
                  ERROVALUE=1
                  ERROMSG="$ERROMSG Detected '$numMsg' numerical instability messages in '$file' after completion of '$jobtype.$layer' job '${%JOBID%}'. "
                  RUNSUFFIX="error"
               fi
            fi
         done
         # check quality of results; example arguments:
         #    padcirc nowcast 555 2024072406 nowcast $SCRIPTDIR
         # (the job ID is left unquoted, as before, so the argument list is
         # built exactly the same way)
         "$SCRIPTDIR/util/output/quality.sh" "$jobtype" "$layer" ${%JOBID%} "$CYCLE" "$SCENARIO" "$SCRIPTDIR" >> "$SCENARIOLOG"
         qualityCheck=$?
         if [[ $qualityCheck -ne 0 ]]; then
            # the text of the file named jobFailed becomes part of the message
            # NOTE(review): RUNSUFFIX is not changed here, so a failure found
            # only by the quality check is reported below under the "finish"
            # suffix -- confirm that this is intended.
            ERROMSG+=$(<jobFailed)
         fi
      fi
   else
      ERROMSG="$ERROMSG The $jobtype.$layer job ended with an exit status that indicates an error occurred. "
      RUNSUFFIX="error"
   fi
   #
   echo "cycle $CYCLE: $SCENARIO: $jobtype.$layer job ID '${%JOBID%}' finished in $SCENARIODIR with return value = $ERROVALUE" 2>&1 | awk -v level=INFO -v this="$THIS" -f "$SCRIPTDIR/monitoring/timestamp.awk" | tee --append "$SCENARIOLOG" | tee --append "$CYCLELOG" | tee --append "$SYSLOG"
   #
   #  R E P O R T   J O B   S T A T U S
   #
   if [[ $ERROVALUE -ne 0 || $qualityCheck -ne 0 ]]; then
      echo "cycle $CYCLE: $SCENARIO: $THIS: $ERROMSG" 2>&1 | awk -v this="$THIS" -v level=ERROR -f "$SCRIPTDIR/monitoring/timestamp.awk" | tee --append "$SCENARIOLOG" | tee --append "$CYCLELOG" | tee --append "$SYSLOG"
      echo "\"$RUNSUFFIX\" : \"$(date +'%Y-%h-%d-T%H:%M:%S%z')\", \"message\" : \"$ERROMSG\"" > "$jobtype.${layer}.run.${RUNSUFFIX}" # <-OVERWRITE
      echo "\"jobtype\" : \"$jobtype.$layer\", \"submit\" : null, \"jobid\" : \"${%JOBID%}\", \"start\" : null, \"finish\" : null, \"error\" : \"$(date +'%Y-%h-%d-T%H:%M:%S%z')\"" >> jobs.status
   else
      echo "\"jobtype\" : \"$jobtype.$layer\", \"submit\" : null, \"jobid\" : \"${%JOBID%}\", \"start\" : null, \"finish\" : \"$(date +'%Y-%h-%d-T%H:%M:%S%z')\", \"error\" : null" >> jobs.status
      echo "\"$RUNSUFFIX\" : \"$(date +'%Y-%h-%d-T%H:%M:%S%z')\"" > "$jobtype.${layer}.run.$RUNSUFFIX"
   fi
   echo "time.hpc.job.$jobtype.$layer.${RUNSUFFIX} : $(date +'%Y-%h-%d-T%H:%M:%S%z')" >> "$SCENARIO.run.properties"
   echo "-----------------------------------------------------------------------"
   #
   # if this is a prepall or prep15 job, tar up the subdomain fort.15 files
   # and copy the meteorological output files to their own filenames
   #
   if [[ $jobtype == "prepall" || $jobtype == "prep15" ]]; then
      # only stderr is sent to the system log, the verbose file listing
      # stays on stdout
      tar cvf "${layer}.fort.15.tar" PE*/fort.15 2>> "$SYSLOG"
      rm PE*/fort.15 2>> "$SYSLOG"
      for file in fort.7* maxwvel* minpr*; do
         # a pattern without matches is left as literal text by the shell
         [[ -e $file || -L $file ]] || continue
         mv "$file" "${layer}.$file"
      done
   fi
done
#
# finalize by setting the symbolic links for the fort.15 file and
# the meteorology output files to the full output
# (i.e., those that include wind reduction)
#
# -L is tested as well as -e so that a dangling symbolic link is also
# removed; otherwise the ln below would fail because the name exists
if [[ -e "fort.15" || -L "fort.15" ]]; then
   rm fort.15 2>> "$SCENARIOLOG"
fi
ln -s "$SCENARIO.fort.15" fort.15 2>> "$SCENARIOLOG"
#
# remove the unprefixed meteorological output names left by the last layer;
# a pattern without matches is left as literal text by the shell and is
# skipped by the existence test
for file in fort.7* maxwvel* minpr*; do
   if [[ -e $file || -L $file ]]; then
      rm "$file" 2>> "$SCENARIOLOG"
   fi
done
# link each scenario-prefixed file to the same name without the prefix
for file in "${SCENARIO}".fort.7* "${SCENARIO}".maxwvel* "${SCENARIO}".minpr*; do
   [[ -e $file || -L $file ]] || continue
   ln -s "$file" "${file#"$SCENARIO".}" 2>> "$SCENARIOLOG"
done