Skip to content

Commit

Permalink
fix rank assignment for single-process/many-device profiles
Browse files Browse the repository at this point in the history
  • Loading branch information
olupton committed Jun 26, 2024
1 parent fbe36d8 commit cb3c008
Showing 1 changed file with 10 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -226,14 +226,18 @@ def _load_nvtx_gpu_proj_trace_single(
# Assume that the thunks are launched from one thread per device; this is probably
# safe. Also, until https://github.com/openxla/xla/pull/14092 is plumbed through,
# assume that thread ID order is local rank order (FIXME!)
tid_to_ordinal = {}
for _, module_df in df[all_thunks].groupby("ProgramId"):
# A given module should have N threads submitting work to N devices, but the
# thread ID submitting work to device 0 is different for N=1 (main thread) and
# N>1 (a worker thread)
for ordinal, tid in enumerate(sorted(module_df["TID"].unique())):
assert tid_to_ordinal.get(tid, ordinal) == ordinal
tid_to_ordinal[tid] = ordinal
# This profile contains ranks [process_index*num_devices, (process_index+1)*num_devices)
unique_tids = df.loc[all_thunks, "TID"].unique()
num_devices = len(unique_tids)
num_devices = len(set(tid_to_ordinal.values()))
df["Rank"] = df["TID"].map(
{
tid: process_index * num_devices + n_tid
for n_tid, tid in enumerate(unique_tids)
}
{k: process_index * num_devices + v for k, v in tid_to_ordinal.items()}
)

if warmup_removal_heuristics:
Expand Down

0 comments on commit cb3c008

Please sign in to comment.