[ray] fix local_rank issue (horovod#2596)
richardliaw authored and irasit committed Jan 29, 2021
1 parent aec733e commit 86dbd19
Showing 3 changed files with 19 additions and 14 deletions.
8 changes: 5 additions & 3 deletions CHANGELOG.md
@@ -8,14 +8,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Added

-### Changed
+### Changed

### Deprecated

### Removed

### Fixed

+- Fixed `local_rank` support for Ray. ([#2596](https://github.com/horovod/horovod/pull/2596))
+
## [v0.21.1] - 2021-01-06

### Added
@@ -32,9 +34,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

- Added knob to set cache hint for oneCCL allreduce. ([#2560](https://github.com/horovod/horovod/pull/2560))

-- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562))
+- Renamed `horovodrun` arg `--ccl-bgt-affinity` to `--thread-affinity`. ([#2562](https://github.com/horovod/horovod/pull/2562))

-- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572))
+- Changed default build parallelism from `-j8` to `-j1` to address potential race condition. ([#2572](https://github.com/horovod/horovod/pull/2572))

### Fixed

8 changes: 4 additions & 4 deletions horovod/ray/runner.py
@@ -206,10 +206,10 @@ def finalize_registration(self) -> dict:
                self.hostnames_by_rank.items()):
            for local_rank, world_rank in enumerate(ranks):
                rank_to_info[world_rank] = dict(
-                    NODE_WORLD_RANK=node_world_rank,
-                    NODE_WORLD_SIZE=len(self.hostnames_by_rank),
-                    LOCAL_RANK=local_rank,
-                    LOCAL_SIZE=len(ranks))
+                    HOROVOD_CROSS_RANK=node_world_rank,
+                    HOROVOD_CROSS_SIZE=len(self.hostnames_by_rank),
+                    HOROVOD_LOCAL_RANK=local_rank,
+                    HOROVOD_LOCAL_SIZE=len(ranks))
        return rank_to_info

    def establish_rendezvous(self) -> Dict[str, str]:
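For context: the change renames the keys in the per-worker environment map from ad-hoc names (`NODE_WORLD_RANK`, `LOCAL_RANK`, ...) to the `HOROVOD_*`-prefixed variables that Horovod's Gloo rendezvous reads at startup, which is what lets `hvd.local_rank()` report the correct value on each Ray worker. A minimal sketch of the mapping `finalize_registration()` now emits, using a hypothetical two-node, two-slots-per-node layout (the hostnames and rank layout below are illustrative, not from the commit):

```python
# Sketch of the env map finalize_registration() builds, for a hypothetical
# layout of 2 nodes x 2 slots per node. Only the key names come from the
# commit; the hostnames and rank layout here are made up.
hostnames_by_rank = {"node-a": [0, 1], "node-b": [2, 3]}

rank_to_info = {}
for node_world_rank, (hostname, ranks) in enumerate(
        hostnames_by_rank.items()):
    for local_rank, world_rank in enumerate(ranks):
        rank_to_info[world_rank] = dict(
            HOROVOD_CROSS_RANK=node_world_rank,         # which node
            HOROVOD_CROSS_SIZE=len(hostnames_by_rank),  # number of nodes
            HOROVOD_LOCAL_RANK=local_rank,              # slot on that node
            HOROVOD_LOCAL_SIZE=len(ranks))              # slots per node

# World rank 3 should land on the second node as its second local slot:
assert rank_to_info[3] == dict(
    HOROVOD_CROSS_RANK=1, HOROVOD_CROSS_SIZE=2,
    HOROVOD_LOCAL_RANK=1, HOROVOD_LOCAL_SIZE=2)
```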
17 changes: 10 additions & 7 deletions test/single/test_ray.py
@@ -77,11 +77,13 @@ def test_coordinator_registration():

    rank_to_info = coord.finalize_registration()
    assert len(rank_to_info) == len(ranks)
-    assert all(info["NODE_WORLD_SIZE"] == 3 for info in rank_to_info.values())
-    assert {info["NODE_WORLD_RANK"]
+    assert all(
+        info["HOROVOD_CROSS_SIZE"] == 3 for info in rank_to_info.values())
+    assert {info["HOROVOD_CROSS_RANK"]
            for info in rank_to_info.values()} == {0, 1, 2}
-    assert all(info["LOCAL_SIZE"] == 4 for info in rank_to_info.values())
-    assert {info["LOCAL_RANK"]
+    assert all(
+        info["HOROVOD_LOCAL_SIZE"] == 4 for info in rank_to_info.values())
+    assert {info["HOROVOD_LOCAL_RANK"]
            for info in rank_to_info.values()} == {0, 1, 2, 3}


@@ -243,21 +245,22 @@ def benchmark_step():

            optimizer.step()

    time = timeit.timeit(benchmark_step, number=batch_per_iter)
+    return hvd.local_rank()


@pytest.mark.skipif(
    not gloo_built(), reason='Gloo is required for Ray integration')
def test_horovod_train(ray_start_4_cpus):
    def simple_fn(worker):
-        _train()
-        return True
+        local_rank = _train()
+        return local_rank

    setting = RayExecutor.create_settings(timeout_s=30)
    hjob = RayExecutor(
        setting, num_hosts=1, num_slots=4, use_gpu=torch.cuda.is_available())
    hjob.start()
    result = hjob.execute(simple_fn)
-    assert all(result)
+    assert set(result) == {0, 1, 2, 3}
    hjob.shutdown()
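With the environment map fixed, the test now asserts that the four workers report the distinct local ranks 0-3 rather than just returning True. In user code, that same value is what you would use to pin each worker to its own device. A rough sketch mirroring the test's setup (the function body and the commented usage are illustrative, not part of the commit):

```python
# Sketch: consuming local_rank() inside a RayExecutor job after the fix.
# The training-function body is illustrative; only the executor call
# pattern mirrors the test above.
import horovod.torch as hvd
import torch


def device_fn(worker):
    hvd.init()
    # HOROVOD_LOCAL_RANK is now set per worker, so local_rank() gives the
    # worker's slot index on its node and can select a distinct GPU.
    if torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())
    return hvd.local_rank()

# Usage, following the test:
#   settings = RayExecutor.create_settings(timeout_s=30)
#   executor = RayExecutor(settings, num_hosts=1, num_slots=4, use_gpu=True)
#   executor.start()
#   assert set(executor.execute(device_fn)) == {0, 1, 2, 3}
#   executor.shutdown()
```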
