Skip to content

Commit

Permalink
[aDAG] Fix ranks ordering for custom NCCL group (ray-project#47594)
Browse files Browse the repository at this point in the history
The ranks should be in the order of the actors.

Signed-off-by: ujjawal-khare <[email protected]>
  • Loading branch information
ruisearch42 authored and ujjawal-khare committed Oct 15, 2024
1 parent 6a7c9f3 commit 5b2567b
Showing 1 changed file with 6 additions and 5 deletions.
11 changes: 6 additions & 5 deletions python/ray/experimental/channel/torch_tensor_nccl_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,8 +480,9 @@ def _get_ranks(
actors: List[ray.actor.ActorHandle], custom_nccl_group: Optional[GPUCommunicator]
) -> List[int]:
"""
Get sorted ranks for the NCCL group to use. If custom_nccl_group is specified,
return all ranks from it, otherwise, return list(range(len(actors))).
Get ranks for the NCCL group to use. If custom_nccl_group is specified,
return the ranks of the actors in the custom NCCL group, in the same
order as the actors; otherwise, return list(range(len(actors))).
Args:
actors: A list of actors that participate in the NCCL group.
Expand All @@ -494,18 +495,18 @@ def _get_ranks(
"The world size of the custom NCCL group does not match the number "
"of actors."
)
ranks = set()
ranks = []
for actor in actors:
rank = custom_nccl_group.get_rank(actor)
assert rank not in ranks, "Duplicate rank in custom NCCL group"
ranks.add(rank)
ranks.append(rank)
assert custom_nccl_group.get_world_size() == len(actors), (
"The world size of the custom NCCL group "
f"({custom_nccl_group.get_world_size()}) "
"does not match the number of actors "
f"({len(actors)})."
)
return sorted(ranks)
return ranks


def _init_nccl_group(
Expand Down

0 comments on commit 5b2567b

Please sign in to comment.