-
Notifications
You must be signed in to change notification settings - Fork 36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ray Cluster features #50
Changes from 124 commits
52c3ddc
34ca15c
4fbd5a1
fa36f4e
168b9fe
7d73ad5
f0aafa3
a65b710
0fb644f
ab2c6eb
a79d143
4c56512
c47c5ed
bead0b2
14f0197
c4723dc
6abb6fa
dc1f8f5
f55aae5
f0d4192
bd248b6
465995a
376e289
705436e
bbeb03a
f7fb211
397d125
a105693
a3bccfe
c4e0c80
732868d
6575a47
810f0a3
d46f73f
d0faeae
0d30c31
2a8088f
81f9a99
44a147f
4a7dd91
7d605e4
ed192d9
f2bfb82
dcf4827
3b57a62
932a563
5b4981b
87d6e29
d997cac
7731867
e22610a
22f7e39
f706f27
0aaa3b3
8a5f9c0
860f96f
3097a26
2b4ea74
4ecffc8
59fc8fa
b29db44
2275fce
47a8869
ada1709
4629f3d
ff6a95b
5bc9794
a5b44c9
85ba1a5
15e29fb
e25ada4
0cbe771
b1434d5
194c54b
c28320f
9260103
7fe95ef
b94be58
2120a84
a7334f0
6ee94f4
dfd0a85
2186b37
2546172
b56c669
677fab6
d3290ea
f460fa9
efd7747
af85bf0
93fcdcf
a958d22
b8d17ba
4fbe440
70a492c
5fd61e8
11934a7
0b4321f
b857157
2570cc3
733deb2
217c28f
4cf8163
19c1a2c
d8d7a37
e3db4db
09ee09e
aa8ad2e
8d7fe10
0ca9c0c
b1628ba
3e884f3
6fc8281
968a95a
2993b7d
191288a
7330d3c
a239a5a
7674d01
086cb91
84f4069
a6fb7f2
b694c25
be4f7e2
fb3d899
2b06c1c
4b325b7
0aec403
ce03a9e
5bd5b91
149edaf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ __pycache__ | |
.pytest_cache/ | ||
.coverage* | ||
htmlcov | ||
smartsim.egg-info | ||
|
||
# Dependencies | ||
third-party | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,7 +41,8 @@ | |
'sphinxfortran.fortran_domain', | ||
'sphinxfortran.fortran_autodoc', | ||
'breathe', | ||
'nbsphinx' | ||
'nbsphinx', | ||
'myst_parser' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. dont need this |
||
] | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,6 +66,8 @@ doc= | |
sphinx-fortran==1.1.1 | ||
nbsphinx>=0.8.2 | ||
|
||
ray= | ||
ray==1.6 | ||
|
||
[options.packages.find] | ||
exclude = | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -247,9 +247,11 @@ def init_launcher(self, launcher): | |
elif launcher == "pbs": | ||
self._launcher = PBSLauncher() | ||
self._jobs.set_launcher(self._launcher) | ||
# Init Cobalt launcher | ||
elif launcher == "cobalt": | ||
self._launcher = CobaltLauncher() | ||
self._jobs.set_launcher(self._launcher) | ||
# Init LSF launcher | ||
elif launcher == "lsf": | ||
self._launcher = LSFLauncher() | ||
self._jobs.set_launcher(self._launcher) | ||
|
@@ -275,6 +277,9 @@ def _launch(self, manifest): | |
raise SmartSimError(msg) | ||
self._launch_orchestrator(orchestrator) | ||
|
||
for rc in manifest.ray_clusters: | ||
self._launch_ray_cluster(rc) | ||
|
||
# create all steps prior to launch | ||
steps = [] | ||
|
||
|
@@ -345,6 +350,31 @@ def _launch_orchestrator(self, orchestrator): | |
self._save_orchestrator(orchestrator) | ||
logger.debug(f"Orchestrator launched on nodes: {orchestrator.hosts}") | ||
|
||
def _launch_ray_cluster(self, ray_cluster): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could this be the normal launching function? |
||
"""Launch a Ray Cluster instance | ||
This function will launch the Ray Cluster instance and | ||
if on WLM, find the nodes where it was launched and | ||
set them in the JobManager | ||
|
||
:param orchestrator: ray cluster to launch | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
:type orchestrator: RayCluster | ||
""" | ||
# if the Ray cluster was launched as a batch workload | ||
ray_cluster._update_workers() | ||
|
||
if ray_cluster.batch: | ||
ray_batch_step = self._create_batch_job_step(ray_cluster) | ||
self._launch_step(ray_batch_step, ray_cluster) | ||
|
||
else: | ||
ray_steps = [ | ||
(self._create_job_step(ray_node), ray_node) for ray_node in ray_cluster | ||
] | ||
for ray_step in ray_steps: | ||
self._launch_step(*ray_step) | ||
|
||
logger.info("Ray cluster launched.") | ||
|
||
def _launch_step(self, job_step, entity): | ||
"""Use the launcher to launch a job stop | ||
|
||
|
@@ -498,7 +528,7 @@ def _orchestrator_launch_wait(self, orchestrator): | |
if not self._jobs.actively_monitoring: | ||
self._jobs.check_jobs() | ||
|
||
# _jobs.get_status aquires JM lock for main thread, no need for locking | ||
# _jobs.get_status acquires JM lock for main thread, no need for locking | ||
statuses = self.get_entity_list_status(orchestrator) | ||
if all([stat == STATUS_RUNNING for stat in statuses]): | ||
ready = True | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .raycluster import RayCluster, parse_ray_head_node_address |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
put ray with
full_wlm