From 86dbd19f69ff9e0065eca74e4e34920361724d8a Mon Sep 17 00:00:00 2001
From: Richard Liaw
Date: Tue, 19 Jan 2021 09:18:36 -0800
Subject: [PATCH] [ray] fix local_rank issue (#2596)

---
 CHANGELOG.md            |  8 +++++---
 horovod/ray/runner.py   |  8 ++++----
 test/single/test_ray.py | 17 ++++++++++-------
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 937732a9ed..788549ac1b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
-### Changed
+### Changed 
 
 ### Deprecated
 
@@ -16,6 +16,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed `local_rank` support for Ray. ([#2596](https://github.com/horovod/horovod/pull/2596))
+
 ## [v0.21.1] - 2021-01-06
 
 ### Added
@@ -32,9 +34,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 - Added knob to set cache hint for oneCCL allreduce. ([#2560](https://github.com/horovod/horovod/pull/2560))
 
-- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562))
+- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562)) 
 
-- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572))
+- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572)) 
 
 ### Fixed
 
diff --git a/horovod/ray/runner.py b/horovod/ray/runner.py
index 04642f3a4b..cddefb5919 100644
--- a/horovod/ray/runner.py
+++ b/horovod/ray/runner.py
@@ -206,10 +206,10 @@ def finalize_registration(self) -> dict:
                 self.hostnames_by_rank.items()):
             for local_rank, world_rank in enumerate(ranks):
                 rank_to_info[world_rank] = dict(
-                    NODE_WORLD_RANK=node_world_rank,
-                    NODE_WORLD_SIZE=len(self.hostnames_by_rank),
-                    LOCAL_RANK=local_rank,
-                    LOCAL_SIZE=len(ranks))
+                    HOROVOD_CROSS_RANK=node_world_rank,
+                    HOROVOD_CROSS_SIZE=len(self.hostnames_by_rank),
+                    HOROVOD_LOCAL_RANK=local_rank,
+                    HOROVOD_LOCAL_SIZE=len(ranks))
         return rank_to_info
 
     def establish_rendezvous(self) -> Dict[str, str]:
diff --git a/test/single/test_ray.py b/test/single/test_ray.py
index 314f413b5e..9b1dc3e386 100644
--- a/test/single/test_ray.py
+++ b/test/single/test_ray.py
@@ -77,11 +77,13 @@ def test_coordinator_registration():
     rank_to_info = coord.finalize_registration()
 
     assert len(rank_to_info) == len(ranks)
-    assert all(info["NODE_WORLD_SIZE"] == 3 for info in rank_to_info.values())
-    assert {info["NODE_WORLD_RANK"]
+    assert all(
+        info["HOROVOD_CROSS_SIZE"] == 3 for info in rank_to_info.values())
+    assert {info["HOROVOD_CROSS_RANK"]
             for info in rank_to_info.values()} == {0, 1, 2}
-    assert all(info["LOCAL_SIZE"] == 4 for info in rank_to_info.values())
-    assert {info["LOCAL_RANK"]
+    assert all(
+        info["HOROVOD_LOCAL_SIZE"] == 4 for info in rank_to_info.values())
+    assert {info["HOROVOD_LOCAL_RANK"]
             for info in rank_to_info.values()} == {0, 1, 2, 3}
 
 
@@ -243,21 +245,22 @@ def benchmark_step():
         optimizer.step()
 
     time = timeit.timeit(benchmark_step, number=batch_per_iter)
+    return hvd.local_rank()
 
 
 @pytest.mark.skipif(
     not gloo_built(), reason='Gloo is required for Ray integration')
 def test_horovod_train(ray_start_4_cpus):
     def simple_fn(worker):
-        _train()
-        return True
+        local_rank = _train()
+        return local_rank
 
     setting = RayExecutor.create_settings(timeout_s=30)
     hjob = RayExecutor(
         setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
     hjob.start()
     result = hjob.execute(simple_fn)
-    assert all(result)
+    assert set(result) == {0, 1, 2, 3}
     hjob.shutdown()
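
Usage note (illustrative, not part of the patch): the sketch below shows how the renamed `HOROVOD_CROSS_RANK`/`HOROVOD_LOCAL_RANK` variables surface to user code after this fix, mirroring the updated `test_horovod_train`. It assumes a single 4-slot CPU host and Horovod built with Gloo; the `report_local_rank` function name and the `ray.init()`/`hvd.init()` calls are assumptions for the example, not taken from the patch.

```python
import ray
import horovod.torch as hvd
from horovod.ray import RayExecutor


def report_local_rank(worker):
    # Each Ray worker initializes Horovod and reports its local rank.
    # With this fix, the four workers on one host see local ranks
    # 0, 1, 2, 3 instead of all reporting local_rank() == 0.
    hvd.init()
    return hvd.local_rank()


ray.init()  # start or connect to a Ray cluster
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_hosts=1, num_slots=4, use_gpu=False)
executor.start()
assert set(executor.execute(report_local_rank)) == {0, 1, 2, 3}
executor.shutdown()
```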