Skip to content

Commit

Permalink
Check dyno and dcgm existence before disabling them (#2496)
Browse files Browse the repository at this point in the history
Summary:
On systems that do not have dyno or dcgm installed and where the benchmark runs without sudo, the `ncu_rep` metric gets stuck waiting for a sudo password.

This PR checks that the command or service exists before trying to disable it, so the run no longer gets stuck.

Pull Request resolved: #2496

Reviewed By: xuzhao9

Differential Revision: D64141793

Pulled By: FindHao

fbshipit-source-id: 8d52468f04e7e5a0e8d23f3562a14c83d4a5934c
  • Loading branch information
FindHao authored and facebook-github-bot committed Oct 10, 2024
1 parent 7742ef2 commit dcd3d31
Showing 1 changed file with 23 additions and 7 deletions.
30 changes: 23 additions & 7 deletions torchbenchmark/util/triton_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import tabulate
import torch
import triton

from torchbenchmark.util.env_check import fresh_triton_cache, set_random_seed
from torchbenchmark.util.experiment.metrics import get_peak_memory
from torchbenchmark.util.extra_args import apply_decoration_args, parse_decoration_args
Expand Down Expand Up @@ -1008,6 +1009,7 @@ def nsys_rep(self, input_id: int, fn_name: str) -> str:
def ncu_trace(
self, input_id: int, fn_name: str, replay: bool = False, profile_ir=False
) -> str:
import shutil
import subprocess

# collect the ncu trace
Expand All @@ -1031,6 +1033,7 @@ def ncu_trace(
"_ncu_trace_in_task",
]
)

# Disable DCGM
disable_dyno_dcgm = [
"sudo",
Expand All @@ -1045,13 +1048,26 @@ def ncu_trace(
"stop",
"nvidia-dcgm",
]
if (
subprocess.run(disable_dyno_dcgm).returncode != 0
and subprocess.run(disable_dcgm_service).returncode != 0
):
warnings.warn(
"DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..."
)

def service_exists(service_name):
    """Report whether ``systemctl status <service_name>`` succeeds.

    Args:
        service_name: Name of the systemd unit to probe, e.g. ``"nvidia-dcgm"``.

    Returns:
        True only when ``systemctl status`` exits with return code 0.
        False when it exits non-zero, or when ``systemctl`` itself cannot
        be launched (not installed, not executable).
    """
    try:
        result = subprocess.run(
            ["systemctl", "status", service_name],
            # Capture the output so the probe does not spam the console.
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError:
        # systemctl is missing or not executable (containers, non-systemd
        # hosts). Treat that as "service not present" instead of crashing
        # the caller with FileNotFoundError/PermissionError.
        return False
    # NOTE(review): systemctl is documented to exit non-zero for units that
    # are known but not running, so this is effectively an "is it active"
    # check rather than an "is it installed" check -- confirm that is the
    # intended meaning for the caller.
    return result.returncode == 0

if shutil.which("dyno") or service_exists("nvidia-dcgm"):
dyno_result = subprocess.run(disable_dyno_dcgm).returncode
systemctl_result = subprocess.run(disable_dcgm_service).returncode
if dyno_result != 0 and systemctl_result != 0:
warnings.warn(
"DCGM may not have been successfully disabled. Proceeding to collect NCU trace anyway..."
)
ncu_output_dir = self.get_temp_path(f"ncu_traces/{fn_name}_{input_id}")
ncu_output_dir.mkdir(parents=True, exist_ok=True)
ext = ".csv" if not replay else ".ncu-rep"
Expand Down

0 comments on commit dcd3d31

Please sign in to comment.