
Fix mishandling All-to-All communication
JoongunPark committed Oct 14, 2024
1 parent f256593 commit 97ef455
Showing 1 changed file with 9 additions and 0 deletions.
src/converter/pytorch_converter.py: 9 additions, 0 deletions
@@ -350,6 +350,11 @@ def get_protobuf_node_type_from_json_node(
                     return COMM_SEND_NODE
                 if "recv" in keyword:
                     return COMM_RECV_NODE
+                # In NCCL, all-to-all communication is implemented using point-to-point
+                # communications. More details can be found here:
+                # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
+                if "nccl:all_to_all" in keyword:
+                    return COMM_COLL_NODE
             if "ncclKernel" in json_node.name or "ncclDevKernel" in json_node.name:
                 return COMM_COLL_NODE
         return COMP_NODE
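
For illustration only, here is a minimal, self-contained sketch of the classification order this hunk establishes. It is not the converter's actual code: the node-type constants are plain-string stand-ins for Chakra's protobuf enum values, and classify() is a hypothetical helper that receives an already-extracted keyword and kernel name.

    # Hypothetical stand-ins for the converter's protobuf node-type constants.
    COMM_SEND_NODE = "COMM_SEND_NODE"
    COMM_RECV_NODE = "COMM_RECV_NODE"
    COMM_COLL_NODE = "COMM_COLL_NODE"
    COMP_NODE = "COMP_NODE"

    def classify(keyword: str, kernel_name: str) -> str:
        # Mirrors the check order in the hunk above: send/recv first, then the
        # new all-to-all keyword, then the generic NCCL kernel-name check.
        if "send" in keyword:
            return COMM_SEND_NODE
        if "recv" in keyword:
            return COMM_RECV_NODE
        # NCCL realizes all-to-all with point-to-point transfers, so the CPU-side
        # operator keyword ("nccl:all_to_all") is what identifies the collective.
        if "nccl:all_to_all" in keyword:
            return COMM_COLL_NODE
        if "ncclKernel" in kernel_name or "ncclDevKernel" in kernel_name:
            return COMM_COLL_NODE
        return COMP_NODE

    # An all-to-all launched via point-to-point kernels is classified as a collective.
    print(classify("nccl:all_to_all", "ncclDevKernel_SendRecv"))  # COMM_COLL_NODE
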
@@ -379,6 +384,10 @@ def get_collective_comm_type(self, name: str) -> int:
         for key in comm_type_mapping:
             if key in normalized_name:
                 return comm_type_mapping[key]
+        # If both COMM_COLL_NAME and ncclDevKernel_SendRecv are present, this is nccl:all_to_all.
+        if "ncclDevKernel_SendRecv" in name:
+            return comm_type_mapping["alltoall"]
+
         raise ValueError(
             f"The name '{name}' does not correspond to a recognized collective communication type. "
             "The converter determines collective communication types based on the node name of a GPU operator. "
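
The fallback in the hunk above can likewise be sketched as follows. This is an illustration, not the converter's actual implementation: the contents of comm_type_mapping, the name normalization, and the returned constants are assumptions made for the example; only the ncclDevKernel_SendRecv fallback mirrors the committed change.

    # Illustrative mapping; the real converter's keys and enum values may differ.
    comm_type_mapping = {
        "allreduce": "ALL_REDUCE",
        "allgather": "ALL_GATHER",
        "reducescatter": "REDUCE_SCATTER",
        "broadcast": "BROADCAST",
        "alltoall": "ALL_TO_ALL",
    }

    def get_collective_comm_type(name: str) -> str:
        # Assumed normalization: lowercase and drop separators before key lookup.
        normalized_name = name.lower().replace("_", "").replace("-", "")
        for key in comm_type_mapping:
            if key in normalized_name:
                return comm_type_mapping[key]
        # New fallback: NCCL lowers all-to-all to SendRecv kernels, so a collective
        # node named "ncclDevKernel_SendRecv..." maps to the all-to-all type.
        if "ncclDevKernel_SendRecv" in name:
            return comm_type_mapping["alltoall"]
        raise ValueError(f"The name '{name}' does not correspond to a recognized collective communication type.")

    # Before this commit, such a SendRecv-named collective fell through to the ValueError.
    print(get_collective_comm_type("ncclDevKernel_SendRecv(ncclDevComm*, ncclWorkElem&)"))  # ALL_TO_ALL
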
