Skip to content

Commit

Permalink
Simplify the logic in QsubBatchSettings
Browse files Browse the repository at this point in the history
Much of the need to check the type and value of the the nodes
property in QsubBatchSettings is because there are two technically
valid, but not quite equivalent ways of setting the number of
nodes. Now, we check at various points that the both 'select' and
'nodes' is not set. Additionally, both routes can be used to set
the internal _nodes property if it needs to be accessed within
Python
  • Loading branch information
ashao committed Dec 7, 2023
1 parent 31520a9 commit 973af2c
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 64 deletions.
21 changes: 1 addition & 20 deletions smartsim/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
class SettingsBase:
...


# pylint: disable=too-many-public-methods
class RunSettings(SettingsBase):
# pylint: disable=unused-argument
Expand Down Expand Up @@ -592,26 +593,6 @@ def __init__(
self.set_queue(kwargs.get("queue", None))
self.set_account(kwargs.get("account", None))

@property
def nodes(self) -> t.Optional[int]:
return self._nodes


@nodes.setter
def nodes(self, num_nodes: t.Optional[t.Union[int, str]]) -> None:
if num_nodes:
if isinstance(num_nodes, int):
self._nodes = num_nodes
elif isinstance(num_nodes, str):
self._nodes = int(num_nodes)
else:
raise TypeError(
"Nodes must be an int or a string interpretable as an int"
)
else:
self._nodes = None


@property
def batch_cmd(self) -> str:
"""Return the batch command
Expand Down
102 changes: 58 additions & 44 deletions smartsim/settings/pbsSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
import typing as t

from .._core.utils import init_default
from ..error import SmartSimError
from .base import BatchSettings
from ..error import SSConfigError
from ..log import get_logger
from .base import BatchSettings

logger = get_logger(__name__)

Expand Down Expand Up @@ -72,13 +72,7 @@ def __init__(
self._time: t.Optional[str] = None
self._nodes: t.Optional[int] = None
self._ncpus = ncpus

if resources and "nodes" in resources and nodes is not None:
if nodes != resources["nodes"]:
raise ValueError(
"nodes was specified as a kwarg and also in the resources "
f"but are not the same value: {nodes=} {resources['nodes']=}"
)
self.resources = init_default({}, resources, dict)

# time, queue, nodes, and account set in parent class init
super().__init__(
Expand All @@ -90,21 +84,33 @@ def __init__(
time=time,
**kwargs,
)
self.resources = init_default({}, resources, dict)
self._hosts: t.List[str] = []

self._sanity_check_resources()
# Set the number of nodes if it was specified, note this needs
# to be done after the super init because nodes might also be set
self._nodes = self.resources.get("nodes", None) or self.resources.get(
"select", None
)

self._hosts: t.List[str] = []

def set_nodes(self, num_nodes: int) -> None:
"""Set the number of nodes for this batch job
If a select argument is provided in ``QsubBatchSettings.resources``
this value will be overridden
In PBS, 'select' is the more primitive way of describing how
many nodes to allocate for the job. 'nodes' is equivalent to
'select' with a 'place' statement. Assuming that only advanced
users would use 'set_resource' instead, defining the number of
nodes here is sets the 'nodes' resource.
:param num_nodes: number of nodes
:type num_nodes: int
"""

if num_nodes:
self._nodes = num_nodes
self.set_resource("nodes", self._nodes)
self._sanity_check_resources()

def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None:
"""Specify the hostlist for this job
Expand Down Expand Up @@ -180,6 +186,11 @@ def set_resource(self, resource_name: str, value: str) -> None:
# TODO add error checking here
# TODO include option to overwrite place (warning for orchestrator?)
self.resources[resource_name] = value
self._sanity_check_resources()
# Capture the case where someone is setting the number of nodes
# through 'select' or 'nodes'
if resource_name in ["select", "nodes"] and value:
self._nodes = int(value)

def format_batch_args(self) -> t.List[str]:
"""Get the formatted batch arguments for a preview
Expand All @@ -196,40 +207,43 @@ def format_batch_args(self) -> t.List[str]:
opts += [" ".join((prefix + opt, str(value)))]
return opts

def _sanity_check_resources(self) -> None:
"""Check that only select or nodes was specified in resources
Note: For PBS Pro, nodes is equivalent to 'select' and 'place' so
they are not quite synonyms. Here we assume that
"""

has_select = self.resources.get("select", None)
has_nodes = self.resources.get("nodes", None)

if has_select and has_nodes:
raise SSConfigError(
"'select' and 'nodes' cannot both be specified. This can happen "
"if nodes were specified using the 'set_nodes' method and"
"'select' was set using 'set_resource'. Please only specify one."
)

def _create_resource_list(self) -> t.List[str]:

self._sanity_check_resources()
res = []

# get select statement from resources or kwargs
if ("select" in self.resources) and "nodes" not in self.resources:
res += [f"-l select={str(self.resources['select'])}"]
elif ("select" in self.resources) and ("nodes" in self.resources):
nselect = self.resources["select"]
if nselect == self._nodes:
logger.warning("select and nodes were both specified, specifying nodes")
res += [f"-l nodes={self._nodes}"]
else:
raise SmartSimError(
(
"select and nodes were both specified, but do not have "
f"the same value. select={nselect} nodes={self._nodes}"
)
)
elif "nodes" in self.resources:
res += [f"-l nodes={self._nodes}"]
# Construct the basic select/nodes statement
if self.resources.get("select", None):
select_command = f"-l select={self.resources['select']}"
elif self.resources.get("nodes", None):
select_command = f"-l nodes={self.resources['nodes']}"
else:
select = "-l select="
if self._nodes:
select += str(self._nodes)
else:
raise SmartSimError(
"Insufficient resource specification: no nodes or select statement"
)
if self._ncpus:
select += f":ncpus={self._ncpus}"
if self._hosts:
hosts = ["=".join(("host", str(host))) for host in self._hosts]
select += f":{'+'.join(hosts)}"
res += [select]
raise SSConfigError(
"Insufficient resource specification: no nodes or select statement"
)
if self._ncpus:
select_command += f":ncpus={self._ncpus}"
if self._hosts:
hosts = ["=".join(("host", str(host))) for host in self._hosts]
select_command += f":{'+'.join(hosts)}"
res += [select_command]

if "place" in self.resources:
res += [f"-l place={str(self.resources['place'])}"]
Expand All @@ -242,6 +256,6 @@ def _create_resource_list(self) -> t.List[str]:
res += [f"-l walltime={self._time}"]

for resource, value in self.resources.items():
if resource not in ["select", "walltime", "place"]:
if resource not in ["nodes", "select", "walltime", "place"]:
res += [f"-l {resource}={str(value)}"]
return res
91 changes: 91 additions & 0 deletions tests/test_pbs_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# BSD 2-Clause License
#
# Copyright (c) 2021-2023, Hewlett Packard Enterprise
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import pytest

from smartsim.error import SSConfigError
from smartsim.settings import QsubBatchSettings

# The tests in this file belong to the group_b group
pytestmark = pytest.mark.group_b


def test_node_formatting():
def validate_settings(settings, spec, num_nodes, num_cpus):
assert settings._create_resource_list() == [
f"-l {spec}={num_nodes}:ncpus={num_cpus}"
]
assert settings._ncpus == num_cpus
assert settings._nodes == num_nodes

num_nodes = 10
num_cpus = 36

# Test by specifying the number of nodes via setting a resource
for spec in ["nodes", "select"]:
# Test by setting nodes
settings = QsubBatchSettings()
settings.set_resource(spec, num_nodes)
settings.set_ncpus(36)
validate_settings(settings, spec, num_nodes, num_cpus)

# Test when setting nodes through the constructor
settings = QsubBatchSettings(ncpus=num_cpus, nodes=num_nodes)
validate_settings(settings, "nodes", num_nodes, num_cpus)

# Test when setting nodes through the constructor via resource
settings = QsubBatchSettings(ncpus=num_cpus, resources={"nodes": num_nodes})
validate_settings(settings, "nodes", num_nodes, num_cpus)

# Test when setting select through the constructor via resource
settings = QsubBatchSettings(ncpus=num_cpus, resources={"select": num_nodes})
validate_settings(settings, "select", num_nodes, num_cpus)


def test_select_nodes_error():

# # Test failure on initialization
with pytest.raises(SSConfigError):
QsubBatchSettings(nodes=10, resources={"select": 10})

# Test setting via nodes and then select
settings = QsubBatchSettings()
settings.set_nodes(10)
with pytest.raises(SSConfigError):
settings.set_resource("select", 10)

# Manually put "select" in the resource dictionary and
# make sure the resource formatter catches the error
settings = QsubBatchSettings()
settings.resources = {"nodes": 10, "select": 20}
with pytest.raises(SSConfigError):
settings._create_resource_list()

# # Test setting via select and then nodes
settings = QsubBatchSettings()
settings.set_resource("select", 10)
with pytest.raises(SSConfigError):
settings.set_nodes(10)

0 comments on commit 973af2c

Please sign in to comment.