Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SlurmScheduler: Parse the NODE_FAIL state #5866

Merged
merged 1 commit into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions aiida/engine/processes/calcjobs/calcjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,9 @@ def define(cls, spec: CalcJobProcessSpec) -> None: # type: ignore[override]
spec.exit_code(
131, 'ERROR_SCHEDULER_INVALID_ACCOUNT', invalidates_cache=True, message='The specified account is invalid.'
)
spec.exit_code(
140, 'ERROR_SCHEDULER_NODE_FAILURE', invalidates_cache=True, message='The node running the job failed.'
)
spec.exit_code(150, 'STOPPED_BY_MONITOR', invalidates_cache=True, message='{message}')

@classproperty
Expand Down
3 changes: 3 additions & 0 deletions aiida/schedulers/plugins/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,9 @@ def parse_output(self, detailed_job_info=None, stdout=None, stderr=None):
if data['State'] == 'TIMEOUT':
return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME

if data['State'] == 'NODE_FAIL':
return CalcJob.exit_codes.ERROR_SCHEDULER_NODE_FAILURE

# Alternatively, if the ``detailed_job_info`` is not defined or hasn't already determined an error, try to match
# known error messages from the output written to the ``stderr`` descriptor.
if stderr is not None:
Expand Down
14 changes: 14 additions & 0 deletions tests/schedulers/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,20 @@ def test_parse_out_of_memory():
assert exit_code == CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY # pylint: disable=no-member


def test_parse_node_failure():
    """Check that a ``NODE_FAIL`` state in the detailed job info yields ``ERROR_SCHEDULER_NODE_FAILURE``."""
    # Two pipe-delimited rows: the first has every field empty, the second carries only the ``NODE_FAIL`` value.
    raw_stdout = """||||||||||||||||||||||||||||||||||||||||||||||||||
|||||||||||||||||||||||||||||||||||||||||NODE_FAIL|||||||||"""  # yapf: disable
    job_info = {'retval': 0, 'stderr': '', 'stdout': raw_stdout}

    # The scheduler's own stdout and stderr are passed as empty strings, so only ``job_info`` carries any content.
    parsed = SlurmScheduler().parse_output(job_info, '', '')
    expected = CalcJob.exit_codes.ERROR_SCHEDULER_NODE_FAILURE  # pylint: disable=no-member
    assert parsed == expected


@pytest.mark.parametrize('detailed_job_info, expected', [
('string', TypeError), # Not a dictionary
({'stderr': ''}, ValueError), # Key `stdout` missing
Expand Down