[ray] fix local_rank issue #2596

Merged 3 commits on Jan 19, 2021
CHANGELOG.md — 8 changes: 5 additions & 3 deletions

@@ -8,14 +8,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 ### Added

-### Changed
+### Changed

 ### Deprecated

 ### Removed

 ### Fixed

+- Fixed `local_rank` support for Ray. ([#2596](https://github.com/horovod/horovod/pull/2596))
+
 ## [v0.21.1] - 2021-01-06

 ### Added

@@ -32,9 +34,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 - Added knob to set cache hint for oneCCL allreduce. ([#2560](https://github.com/horovod/horovod/pull/2560))

-- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562))
+- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562))

-- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572))
+- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572))

 ### Fixed
horovod/ray/runner.py — 8 changes: 4 additions & 4 deletions

@@ -206,10 +206,10 @@ def finalize_registration(self) -> dict:
                 self.hostnames_by_rank.items()):
             for local_rank, world_rank in enumerate(ranks):
                 rank_to_info[world_rank] = dict(
-                    NODE_WORLD_RANK=node_world_rank,
-                    NODE_WORLD_SIZE=len(self.hostnames_by_rank),
-                    LOCAL_RANK=local_rank,
-                    LOCAL_SIZE=len(ranks))
+                    HOROVOD_CROSS_RANK=node_world_rank,
+                    HOROVOD_CROSS_SIZE=len(self.hostnames_by_rank),
+                    HOROVOD_LOCAL_RANK=local_rank,
+                    HOROVOD_LOCAL_SIZE=len(ranks))
         return rank_to_info

     def establish_rendezvous(self) -> Dict[str, str]:
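The rename switches the coordinator to the `HOROVOD_*` environment variable names that Horovod's initialization reads directly. The mapping logic above can be sketched as a standalone function (a simplified sketch: the real code is a method on the coordinator class and reads `self.hostnames_by_rank`):

```python
from typing import Dict, List

def build_rank_to_info(
        hostnames_by_rank: Dict[str, List[int]]) -> Dict[int, dict]:
    """Map each world rank to the Horovod env vars set on its worker.

    Mirrors the loop in finalize_registration: cross rank/size describe
    the node's position in the cluster, local rank/size describe the
    worker's slot within its node.
    """
    rank_to_info = {}
    for node_world_rank, (hostname, ranks) in enumerate(
            hostnames_by_rank.items()):
        for local_rank, world_rank in enumerate(ranks):
            rank_to_info[world_rank] = dict(
                HOROVOD_CROSS_RANK=node_world_rank,
                HOROVOD_CROSS_SIZE=len(hostnames_by_rank),
                HOROVOD_LOCAL_RANK=local_rank,
                HOROVOD_LOCAL_SIZE=len(ranks))
    return rank_to_info

# Two hosts with two slots each: world ranks 0-1 on host1, 2-3 on host2.
info = build_rank_to_info({"host1": [0, 1], "host2": [2, 3]})
assert info[2] == dict(HOROVOD_CROSS_RANK=1, HOROVOD_CROSS_SIZE=2,
                       HOROVOD_LOCAL_RANK=0, HOROVOD_LOCAL_SIZE=2)
```

The hostnames dict is iterated in insertion order, so `node_world_rank` is deterministic for a given registration order.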
test/single/test_ray.py — 17 changes: 10 additions & 7 deletions

@@ -77,11 +77,13 @@ def test_coordinator_registration():

     rank_to_info = coord.finalize_registration()
     assert len(rank_to_info) == len(ranks)
-    assert all(info["NODE_WORLD_SIZE"] == 3 for info in rank_to_info.values())
-    assert {info["NODE_WORLD_RANK"]
+    assert all(
+        info["HOROVOD_CROSS_SIZE"] == 3 for info in rank_to_info.values())
+    assert {info["HOROVOD_CROSS_RANK"]
             for info in rank_to_info.values()} == {0, 1, 2}
-    assert all(info["LOCAL_SIZE"] == 4 for info in rank_to_info.values())
-    assert {info["LOCAL_RANK"]
+    assert all(
+        info["HOROVOD_LOCAL_SIZE"] == 4 for info in rank_to_info.values())
+    assert {info["HOROVOD_LOCAL_RANK"]
             for info in rank_to_info.values()} == {0, 1, 2, 3}

@@ -243,21 +245,22 @@ def benchmark_step():
         optimizer.step()

     time = timeit.timeit(benchmark_step, number=batch_per_iter)
+    return hvd.local_rank()


 @pytest.mark.skipif(
     not gloo_built(), reason='Gloo is required for Ray integration')
 def test_horovod_train(ray_start_4_cpus):
     def simple_fn(worker):
-        _train()
-        return True
+        local_rank = _train()
+        return local_rank

     setting = RayExecutor.create_settings(timeout_s=30)
     hjob = RayExecutor(
         setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
     hjob.start()
     result = hjob.execute(simple_fn)
-    assert all(result)
+    assert set(result) == {0, 1, 2, 3}
     hjob.shutdown()
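The switch from `assert all(result)` to `assert set(result) == {0, 1, 2, 3}` matters once workers return `hvd.local_rank()` instead of `True`: a truthiness check would wrongly fail on a correct result, because local rank 0 is falsy. A minimal illustration, with a hypothetical `result` list standing in for the values `hjob.execute` would collect from one host with four slots:

```python
# Hypothetical per-worker return values: each of four workers on one
# host returns its local rank.
result = [0, 1, 2, 3]

# The new assertion checks that every local rank appeared exactly once.
assert set(result) == {0, 1, 2, 3}

# The old-style truthiness check would reject this correct result,
# because local rank 0 is falsy and makes all() return False.
assert not all(result)
```

Checking set equality also catches the original bug this PR fixes: if every worker were handed the same local rank, the set would collapse to a single element and the assertion would fail.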