From 86dbd19f69ff9e0065eca74e4e34920361724d8a Mon Sep 17 00:00:00 2001
From: Richard Liaw
Date: Tue, 19 Jan 2021 09:18:36 -0800
Subject: [PATCH] [ray] fix local_rank issue (#2596)

---
 CHANGELOG.md            |  8 +++++---
 horovod/ray/runner.py   |  8 ++++----
 test/single/test_ray.py | 17 ++++++++++-------
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 937732a9ed..788549ac1b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
-### Changed
+### Changed 
 
 ### Deprecated
 
@@ -16,6 +16,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed `local_rank` support for Ray. ([#2596](https://github.com/horovod/horovod/pull/2596))
+
 ## [v0.21.1] - 2021-01-06
 
 ### Added
@@ -32,9 +34,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 - Added knob to set cache hint for oneCCL allreduce. ([#2560](https://github.com/horovod/horovod/pull/2560))
 
-- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562))
+- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562)) 
 
-- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572))
+- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572)) 
 
 ### Fixed
 
diff --git a/horovod/ray/runner.py b/horovod/ray/runner.py
index 04642f3a4b..cddefb5919 100644
--- a/horovod/ray/runner.py
+++ b/horovod/ray/runner.py
@@ -206,10 +206,10 @@ def finalize_registration(self) -> dict:
                 self.hostnames_by_rank.items()):
             for local_rank, world_rank in enumerate(ranks):
                 rank_to_info[world_rank] = dict(
-                    NODE_WORLD_RANK=node_world_rank,
-                    NODE_WORLD_SIZE=len(self.hostnames_by_rank),
-                    LOCAL_RANK=local_rank,
-                    LOCAL_SIZE=len(ranks))
+                    HOROVOD_CROSS_RANK=node_world_rank,
+                    HOROVOD_CROSS_SIZE=len(self.hostnames_by_rank),
+                    HOROVOD_LOCAL_RANK=local_rank,
+                    HOROVOD_LOCAL_SIZE=len(ranks))
         return rank_to_info
 
     def establish_rendezvous(self) -> Dict[str, str]:
diff --git a/test/single/test_ray.py b/test/single/test_ray.py
index 314f413b5e..9b1dc3e386 100644
--- a/test/single/test_ray.py
+++ b/test/single/test_ray.py
@@ -77,11 +77,13 @@ def test_coordinator_registration():
     rank_to_info = coord.finalize_registration()
 
     assert len(rank_to_info) == len(ranks)
-    assert all(info["NODE_WORLD_SIZE"] == 3 for info in rank_to_info.values())
-    assert {info["NODE_WORLD_RANK"]
+    assert all(
+        info["HOROVOD_CROSS_SIZE"] == 3 for info in rank_to_info.values())
+    assert {info["HOROVOD_CROSS_RANK"]
             for info in rank_to_info.values()} == {0, 1, 2}
-    assert all(info["LOCAL_SIZE"] == 4 for info in rank_to_info.values())
-    assert {info["LOCAL_RANK"]
+    assert all(
+        info["HOROVOD_LOCAL_SIZE"] == 4 for info in rank_to_info.values())
+    assert {info["HOROVOD_LOCAL_RANK"]
             for info in rank_to_info.values()} == {0, 1, 2, 3}
 
 
@@ -243,21 +245,22 @@ def benchmark_step():
         optimizer.step()
 
     time = timeit.timeit(benchmark_step, number=batch_per_iter)
+    return hvd.local_rank()
 
 
 @pytest.mark.skipif(
     not gloo_built(), reason='Gloo is required for Ray integration')
 def test_horovod_train(ray_start_4_cpus):
     def simple_fn(worker):
-        _train()
-        return True
+        local_rank = _train()
+        return local_rank
 
     setting = RayExecutor.create_settings(timeout_s=30)
     hjob = RayExecutor(
         setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
     hjob.start()
     result = hjob.execute(simple_fn)
-    assert all(result)
+    assert set(result) == {0, 1, 2, 3}
     hjob.shutdown()
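
Usage note (illustrative, not part of the patch): the sketch below shows how the renamed `HOROVOD_CROSS_RANK`/`HOROVOD_LOCAL_RANK` variables surface to user code after this fix, mirroring the updated `test_horovod_train`. It assumes a single 4-slot CPU host and Horovod built with Gloo; the `report_local_rank` function name and the `ray.init()`/`hvd.init()` calls are assumptions for the example, not taken from the patch.

```python
import ray
import horovod.torch as hvd
from horovod.ray import RayExecutor


def report_local_rank(worker):
    # Each Ray worker initializes Horovod and reports its local rank.
    # With this fix, the four workers on one host see local ranks
    # 0, 1, 2, 3 instead of all reporting local_rank() == 0.
    hvd.init()
    return hvd.local_rank()


ray.init()  # start or connect to a Ray cluster
settings = RayExecutor.create_settings(timeout_s=30)
executor = RayExecutor(settings, num_hosts=1, num_slots=4, use_gpu=False)
executor.start()
assert set(executor.execute(report_local_rank)) == {0, 1, 2, 3}
executor.shutdown()
```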