From a6d6d67efd46efda6160ed0c0e678b1549709e6c Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 1 Aug 2023 10:23:17 -0500 Subject: [PATCH 1/2] log sacct failures --- doc/changelog.rst | 3 +++ smartsim/_core/launcher/slurm/slurmLauncher.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index a6c720c0e..7a3aaae8c 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -19,14 +19,17 @@ To be released at some future point in time Description +- Log ignored error messages from `sacct` - Fix malformed logging format strings - Update linting support and apply to existing errors Detailed Notes +- Log errors reported from slurm WLM when attempts to retrieve status fail (PR331_) - Fix incorrectly formatted positional arguments in log format strings (PR330_) - Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks (PR311_) +.. _PR331: https://github.com/CrayLabs/SmartSim/pull/331 .. _PR330: https://github.com/CrayLabs/SmartSim/pull/330 .. _PR311: https://github.com/CrayLabs/SmartSim/pull/311 diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index dc1184dd4..d1f293155 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -224,7 +224,10 @@ def _get_slurm_step_id(step: Step, interval: int = 2) -> str: step_id: t.Optional[str] = None trials = CONFIG.wlm_trials while trials > 0: - output, _ = sacct(["--noheader", "-p", "--format=jobname,jobid"]) + output, err = sacct(["--noheader", "-p", "--format=jobname,jobid"]) + if err: + logger.warning(f"An error occurred while calling sacct: {err}") + step_id = parse_step_id_from_sacct(output, step.name) if step_id: break From 8fb60b888900ea8338d153db1ad4bbba651877ff Mon Sep 17 00:00:00 2001 From: Christopher McBride Date: Tue, 1 Aug 2023 15:31:13 -0400 Subject: [PATCH 2/2] add missing PR to desc in changelog --- doc/changelog.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 7a3aaae8c..cf43c1fae 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -25,10 +25,11 @@ Description Detailed Notes -- Log errors reported from slurm WLM when attempts to retrieve status fail (PR331_) +- Log errors reported from slurm WLM when attempts to retrieve status fail (PR331_, PR332_) - Fix incorrectly formatted positional arguments in log format strings (PR330_) - Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks (PR311_) +.. _PR332: https://github.com/CrayLabs/SmartSim/pull/332 .. _PR331: https://github.com/CrayLabs/SmartSim/pull/331 .. _PR330: https://github.com/CrayLabs/SmartSim/pull/330 .. _PR311: https://github.com/CrayLabs/SmartSim/pull/311