-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] Add Kernel Resource Usage to nvptx-loader #97503
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-libc Author: None (jameshu15869) ChangesThis PR allows Example output:
Full diff: https://github.com/llvm/llvm-project/pull/97503.diff 5 Files Affected:
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index d167abcaf2db1..4790e55bec478 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -19,9 +19,12 @@ function(add_benchmark benchmark_name)
LINK_LIBRARIES
LibcGpuBenchmark.hermetic
${BENCHMARK_LINK_LIBRARIES}
+ LOADER_ARGS
+ "--print-resource-usage"
${BENCHMARK_UNPARSED_ARGUMENTS}
)
get_fq_target_name(${benchmark_name} fq_target_name)
+
add_dependencies(gpu-benchmark ${fq_target_name})
endfunction(add_benchmark)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index fbeec32883b63..52bc2ad03a30d 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -709,12 +709,22 @@ function(add_libc_hermetic test_name)
$<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
add_custom_target(
${fq_target_name}
+ DEPENDS ${fq_target_name}-cmd
+ )
+
+ add_custom_command(
+ OUTPUT ${fq_target_name}-cmd
COMMAND ${test_cmd}
COMMAND_EXPAND_LISTS
COMMENT "Running hermetic test ${fq_target_name}"
${LIBC_HERMETIC_TEST_JOB_POOL}
)
+ set_source_files_properties(${fq_target_name}-cmd
+ PROPERTIES
+ SYMBOLIC "TRUE"
+ )
+
add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
# If it is a benchmark, it will already have been added to the
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index eae2776b2773f..f576c58d902a1 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,6 +28,7 @@ struct LaunchParameters {
uint32_t num_blocks_x;
uint32_t num_blocks_y;
uint32_t num_blocks_z;
+ bool print_resource_usage;
};
/// The arguments to the '_begin' kernel.
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index b711ec91c9f30..dfaee4d857826 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -20,7 +20,8 @@
int main(int argc, char **argv, char **envp) {
if (argc < 2) {
- printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
+ printf("USAGE: ./loader [--threads <n>, --blocks <n>, "
+ "--print-resource-usage] <device_image> "
"<args>, ...\n");
return EXIT_SUCCESS;
}
@@ -62,6 +63,9 @@ int main(int argc, char **argv, char **envp) {
offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
offset++;
continue;
+ } else if (argv[offset] == std::string("--print-resource-usage")) {
+ params.print_resource_usage = true;
+ continue;
} else {
file = fopen(argv[offset], "r");
if (!file) {
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 012cb778ecf15..90e52ddb008da 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -229,6 +229,17 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
return CUDA_SUCCESS;
}
+void print_resource_usage(CUmodule binary, const char *kernel_name) {
+ CUfunction function;
+ if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+ handle_error(err);
+ int num_regs;
+ if (CUresult err =
+ cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
+ handle_error(err);
+ fprintf(stderr, "%6s registers: %d\n", kernel_name, num_regs);
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters ¶ms) {
if (CUresult err = cuInit(0))
@@ -341,6 +352,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
+ // Print resource usage if requested.
+ if (params.print_resource_usage) {
+ print_resource_usage(binary, "_begin");
+ print_resource_usage(binary, "_start");
+ print_resource_usage(binary, "_end");
+ }
+
end_args_t fini_args = {host_ret};
if (CUresult err = launch_kernel(binary, stream, rpc_device,
single_threaded_params, "_end", fini_args))
|
libc/utils/gpu/loader/Loader.h
Outdated
@@ -28,6 +28,7 @@ struct LaunchParameters { | |||
uint32_t num_blocks_x; | |||
uint32_t num_blocks_y; | |||
uint32_t num_blocks_z; | |||
bool print_resource_usage; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't a launch parameter, it should just be a separate option. Honestly I'm wondering if I should just port this to use the LLVM commandline handling now that it's always built w/ LLVM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, so you mean that checking resource usage shouldn't require launching a kernel, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, launch_parameters
are specifically arguments to the kernel launch. The API expects stuff like the number of threads, blocks, dynamic memory, the stream, etc. This isn't related to that since the API doesn't use it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, that makes sense. I can move resource usage logic out of the kernel launch logic
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this to a separate global and pass it to the function. Honestly I'm thinking I should just rewrite this to be a pure LLVM utility and just use their https://clang.llvm.org/docs/ClangCommandLineReference.html interface.
@@ -341,6 +352,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, | |||
if (CUresult err = cuStreamSynchronize(stream)) | |||
handle_error(err); | |||
|
|||
// Print resource usage if requested. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just make it do this when it's called.
libc/benchmarks/gpu/CMakeLists.txt
Outdated
# We want to dump kernel resource usage for GPU benchmarks | ||
get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE") | ||
set(res_usage_cmd $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> | ||
${CMAKE_CROSSCOMPILING_EMULATOR} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's this do here? I was actually looking into working with this at some point, since it will allow me to test the other targets in the future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I misinterpreted what we talked about before and made nvptx-loader --print-resource-usage
only print and disable running the kernel - This was like the 2nd cmd that you run for nvptx-loader
after launching the kernel just to see the register usage
libc/utils/gpu/loader/Loader.h
Outdated
@@ -28,6 +28,7 @@ struct LaunchParameters { | |||
uint32_t num_blocks_x; | |||
uint32_t num_blocks_y; | |||
uint32_t num_blocks_z; | |||
bool print_resource_usage; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this to a separate global and pass it to the function. Honestly I'm thinking I should just rewrite this to be a pure LLVM utility and just use their https://clang.llvm.org/docs/ClangCommandLineReference.html interface.
@@ -326,6 +326,11 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, | |||
return HSA_STATUS_SUCCESS; | |||
} | |||
|
|||
void print_resources(void *image) { | |||
fprintf(stderr, "Printing resource usage on AMDGPU is not supported yet.\n"); | |||
exit(EXIT_FAILURE); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't make it fail, it'll just make it more annoying when we try to run benchmarks on both.
printf("%6s registers: %d\n", kernel_name, num_regs); | ||
} | ||
|
||
void print_resources(void *image) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this function should just print something like
Executing kernel <name>:
blah blah blah
And that logic is guarded in the launch_kernel
function. So, more of a diagnostic thing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, so you still want kernels to launch normally, right? I think I misinterpreted one of your other comments and made this a separate thing (e.g. nvptx-loader --print-resource-usage
just prints the registers without running the kernel
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I figured it would just be an option that the benchmarks use to print verbose information on kernel launch so the user can see it. However, it's definitely true that it's not objectively useful if it includes all the benchmarking overhead. So we may need some name arguments if someone wants to make a dummy kernel or something.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similar to that, do we want benchmarks to always print resource usage? Or should we expect users to manually call nvptx-loader
to see the kernel register usage?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can just make them always print the information.
33d986a
to
34ae76a
Compare
@@ -553,7 +553,7 @@ function(add_libc_hermetic test_name) | |||
endif() | |||
cmake_parse_arguments( | |||
"HERMETIC_TEST" | |||
"IS_BENCHMARK" # Optional arguments | |||
"IS_GPU_BENCHMARK" # Optional arguments |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we handle this with LOADER_ARGS and LINK_LIBRARIES?
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/2053 Here is the relevant piece of the build log for the reference:
|
This PR allows `nvptx-loader` to read the resource usage of `_start`, `_begin`, and `_end` when executing CUDA binaries. Example output: ``` $ nvptx-loader --print-resource-usage libc/benchmarks/gpu/src/ctype/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__ [ RUN ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper [ OK ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper: 93 cycles, 76 min, 470 max, 23 iterations, 78000 ns, 80 stddev _begin registers: 25 _start registers: 80 _end registers: 62 ``` --------- Co-authored-by: Joseph Huber <[email protected]>
Summary: This PR allows `nvptx-loader` to read the resource usage of `_start`, `_begin`, and `_end` when executing CUDA binaries. Example output: ``` $ nvptx-loader --print-resource-usage libc/benchmarks/gpu/src/ctype/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__ [ RUN ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper [ OK ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper: 93 cycles, 76 min, 470 max, 23 iterations, 78000 ns, 80 stddev _begin registers: 25 _start registers: 80 _end registers: 62 ``` --------- Co-authored-by: Joseph Huber <[email protected]> Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251005
This PR allows
nvptx-loader
to read the resource usage of_start
,_begin
, and_end
when executing CUDA binaries.Example output: