[libc] Add Kernel Resource Usage to nvptx-loader #97503

Merged: 5 commits merged into llvm:main on Jul 17, 2024

Conversation

jameshu15869
Contributor

This PR allows nvptx-loader to read the resource usage of _start, _begin, and _end when executing CUDA binaries.

Example output:

$ nvptx-loader --print-resource-usage libc/benchmarks/gpu/src/ctype/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__
[ RUN      ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper
[       OK ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper: 93 cycles, 76 min, 470 max, 23 iterations, 78000 ns, 80 stddev
_begin registers: 25
_start registers: 80
_end registers: 62
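
Below is a minimal standalone sketch of the query the loader performs, assuming the CUDA driver API; the cubin path "image.cubin" and the error handling are illustrative, but the attribute query mirrors the nvptx loader change in the diff below.

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda.h>
#include <initializer_list>

// Abort with a message if a CUDA driver call fails (illustrative helper).
static void check(CUresult err, const char *what) {
  if (err != CUDA_SUCCESS) {
    fprintf(stderr, "error: %s failed with code %d\n", what, err);
    exit(EXIT_FAILURE);
  }
}

int main() {
  check(cuInit(0), "cuInit");
  CUdevice device;
  check(cuDeviceGet(&device, 0), "cuDeviceGet");
  CUcontext context;
  check(cuCtxCreate(&context, 0, device), "cuCtxCreate");

  CUmodule binary;
  check(cuModuleLoad(&binary, "image.cubin"), "cuModuleLoad");

  // Query the register count of each kernel the loader cares about.
  for (const char *name : {"_begin", "_start", "_end"}) {
    CUfunction function;
    check(cuModuleGetFunction(&function, binary, name), "cuModuleGetFunction");
    int num_regs;
    check(cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function),
          "cuFuncGetAttribute");
    fprintf(stderr, "%6s registers: %d\n", name, num_regs);
  }
  return 0;
}
```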

@llvmbot llvmbot added the libc label Jul 3, 2024
@llvmbot
Collaborator

llvmbot commented Jul 3, 2024

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-libc

Author: jameshu15869

Full diff: https://github.com/llvm/llvm-project/pull/97503.diff

5 Files Affected:

  • (modified) libc/benchmarks/gpu/CMakeLists.txt (+3)
  • (modified) libc/cmake/modules/LLVMLibCTestRules.cmake (+10)
  • (modified) libc/utils/gpu/loader/Loader.h (+1)
  • (modified) libc/utils/gpu/loader/Main.cpp (+5-1)
  • (modified) libc/utils/gpu/loader/nvptx/Loader.cpp (+18)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index d167abcaf2db1..4790e55bec478 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -19,9 +19,12 @@ function(add_benchmark benchmark_name)
     LINK_LIBRARIES
       LibcGpuBenchmark.hermetic
       ${BENCHMARK_LINK_LIBRARIES}
+    LOADER_ARGS
+      "--print-resource-usage"
     ${BENCHMARK_UNPARSED_ARGUMENTS}
   )
   get_fq_target_name(${benchmark_name} fq_target_name)
+
   add_dependencies(gpu-benchmark ${fq_target_name})
 endfunction(add_benchmark)
 
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index fbeec32883b63..52bc2ad03a30d 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -709,12 +709,22 @@ function(add_libc_hermetic test_name)
       $<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
   add_custom_target(
     ${fq_target_name}
+    DEPENDS ${fq_target_name}-cmd
+  )
+
+  add_custom_command(
+    OUTPUT ${fq_target_name}-cmd
     COMMAND ${test_cmd}
     COMMAND_EXPAND_LISTS
     COMMENT "Running hermetic test ${fq_target_name}"
     ${LIBC_HERMETIC_TEST_JOB_POOL}
   )
 
+  set_source_files_properties(${fq_target_name}-cmd
+    PROPERTIES
+      SYMBOLIC "TRUE"
+  )
+
   add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name})
   if(NOT ${HERMETIC_TEST_IS_BENCHMARK})
     # If it is a benchmark, it will already have been added to the
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index eae2776b2773f..f576c58d902a1 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,6 +28,7 @@ struct LaunchParameters {
   uint32_t num_blocks_x;
   uint32_t num_blocks_y;
   uint32_t num_blocks_z;
+  bool print_resource_usage;
 };
 
 /// The arguments to the '_begin' kernel.
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index b711ec91c9f30..dfaee4d857826 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -20,7 +20,8 @@
 
 int main(int argc, char **argv, char **envp) {
   if (argc < 2) {
-    printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
+    printf("USAGE: ./loader [--threads <n>, --blocks <n>, "
+           "--print-resource-usage] <device_image> "
            "<args>, ...\n");
     return EXIT_SUCCESS;
   }
@@ -62,6 +63,9 @@ int main(int argc, char **argv, char **envp) {
           offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
       offset++;
       continue;
+    } else if (argv[offset] == std::string("--print-resource-usage")) {
+      params.print_resource_usage = true;
+      continue;
     } else {
       file = fopen(argv[offset], "r");
       if (!file) {
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 012cb778ecf15..90e52ddb008da 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -229,6 +229,17 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   return CUDA_SUCCESS;
 }
 
+void print_resource_usage(CUmodule binary, const char *kernel_name) {
+  CUfunction function;
+  if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+    handle_error(err);
+  int num_regs;
+  if (CUresult err =
+          cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function))
+    handle_error(err);
+  fprintf(stderr, "%6s registers: %d\n", kernel_name, num_regs);
+}
+
 int load(int argc, char **argv, char **envp, void *image, size_t size,
          const LaunchParameters &params) {
   if (CUresult err = cuInit(0))
@@ -341,6 +352,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   if (CUresult err = cuStreamSynchronize(stream))
     handle_error(err);
 
+  // Print resource usage if requested.
+  if (params.print_resource_usage) {
+    print_resource_usage(binary, "_begin");
+    print_resource_usage(binary, "_start");
+    print_resource_usage(binary, "_end");
+  }
+
   end_args_t fini_args = {host_ret};
   if (CUresult err = launch_kernel(binary, stream, rpc_device,
                                    single_threaded_params, "_end", fini_args))

@@ -28,6 +28,7 @@ struct LaunchParameters {
uint32_t num_blocks_x;
uint32_t num_blocks_y;
uint32_t num_blocks_z;
bool print_resource_usage;
Contributor

This isn't a launch parameter, it should just be a separate option. Honestly I'm wondering if I should just port this to use the LLVM commandline handling now that it's always built w/ LLVM.
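
For reference, a rough sketch of what moving the flag to LLVM's command-line handling could look like; the option and variable names are made up for this sketch, and the PR itself keeps the hand-rolled argv loop in Main.cpp.

```cpp
#include "llvm/Support/CommandLine.h"

// Illustrative cl::opt flag parsed by LLVM instead of the manual argv loop.
static llvm::cl::opt<bool>
    PrintResourceUsage("print-resource-usage",
                       llvm::cl::desc("Print kernel resource usage"),
                       llvm::cl::init(false));

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv, "GPU loader\n");
  if (PrintResourceUsage) {
    // Query and print register counts here instead of threading a flag
    // through LaunchParameters.
  }
  return 0;
}
```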

Contributor Author

Ah, so you mean that checking resource usage shouldn't require launching a kernel, right?

Contributor

No, launch_parameters are specifically arguments to the kernel launch. The API expects stuff like the number of threads, blocks, dynamic memory, the stream, etc. This isn't related to that since the API doesn't use it.
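
As context, a minimal sketch (assuming the CUDA driver API) of what a launch call actually consumes: every LaunchParameters field maps onto an argument of cuLaunchKernel, which is why a host-side printing flag does not belong in the struct. The wrapper name and the 1-D defaults are illustrative.

```cpp
#include <cuda.h>

// Each LaunchParameters field corresponds to a cuLaunchKernel argument;
// a "print resource usage" flag has no slot here.
CUresult launch(CUfunction kernel, CUstream stream, void **kernel_args,
                unsigned threads_x, unsigned blocks_x) {
  return cuLaunchKernel(kernel,
                        /*gridDimX=*/blocks_x, /*gridDimY=*/1, /*gridDimZ=*/1,
                        /*blockDimX=*/threads_x, /*blockDimY=*/1,
                        /*blockDimZ=*/1, /*sharedMemBytes=*/0, stream,
                        kernel_args, /*extra=*/nullptr);
}
```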

Contributor Author

Ah, that makes sense. I can move the resource usage logic out of the kernel launch logic.

Contributor

Move this to a separate global and pass it to the function. Honestly I'm thinking I should just rewrite this to be a pure LLVM utility and just use their https://clang.llvm.org/docs/ClangCommandLineReference.html interface.

libc/utils/gpu/loader/nvptx/Loader.cpp (outdated, resolved)
@@ -341,6 +352,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);

// Print resource usage if requested.
Contributor

Just make it do this when it's called.

# We want to dump kernel resource usage for GPU benchmarks
get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
set(res_usage_cmd $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
${CMAKE_CROSSCOMPILING_EMULATOR}
Contributor

What does this do here? I was actually looking into working with this at some point, since it will allow me to test the other targets in the future.

Contributor Author

I think I misinterpreted what we talked about before and made nvptx-loader --print-resource-usage only print, without running the kernel. It ended up being a second command you run with nvptx-loader after launching the kernel, just to see the register usage.

@@ -326,6 +326,11 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent,
return HSA_STATUS_SUCCESS;
}

void print_resources(void *image) {
fprintf(stderr, "Printing resource usage on AMDGPU is not supported yet.\n");
exit(EXIT_FAILURE);
Contributor

Don't make it fail, it'll just make it more annoying when we try to run benchmarks on both.

printf("%6s registers: %d\n", kernel_name, num_regs);
}

void print_resources(void *image) {
Contributor

I think this function should just print something like

Executing kernel <name>:
  blah blah blah

And that logic is guarded in the launch_kernel function. So, more of a diagnostic thing.
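
A sketch of that guarded diagnostic, reusing the print_resource_usage helper from the nvptx diff above; the wrapper name and the boolean parameter are illustrative, not part of the PR.

```cpp
#include <cstdio>
#include <cuda.h>

// Helper added in the nvptx loader diff above.
void print_resource_usage(CUmodule binary, const char *kernel_name);

// Hypothetical diagnostic emitted from launch_kernel when requested.
void announce_launch(CUmodule binary, const char *kernel_name,
                     bool print_usage) {
  if (!print_usage)
    return;
  fprintf(stderr, "Executing kernel %s:\n", kernel_name);
  print_resource_usage(binary, kernel_name); // prints "<name> registers: N"
}
```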

Contributor Author

Ah, so you still want kernels to launch normally, right? I think I misinterpreted one of your other comments and made this a separate thing (e.g. nvptx-loader --print-resource-usage just prints the registers without running the kernel).

Contributor

Yeah, I figured it would just be an option that the benchmarks use to print verbose information on kernel launch so the user can see it. However, it's definitely true that it's not objectively useful if it includes all the benchmarking overhead. So we may need some name arguments if someone wants to make a dummy kernel or something.

Contributor Author

Similar to that, do we want benchmarks to always print resource usage? Or should we expect users to manually call nvptx-loader to see the kernel register usage?

Contributor

We can just make them always print the information.

@@ -553,7 +553,7 @@ function(add_libc_hermetic test_name)
endif()
cmake_parse_arguments(
"HERMETIC_TEST"
"IS_BENCHMARK" # Optional arguments
"IS_GPU_BENCHMARK" # Optional arguments
Contributor

Can we handle this with LOADER_ARGS and LINK_LIBRARIES?

@jhuber6 jhuber6 merged commit 1ecffda into llvm:main Jul 17, 2024
4 of 5 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Jul 17, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building libc at step 5 "compile-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/2053

Here is the relevant piece of the build log for reference:

Step 5 (compile-openmp) failure: build (failure)
...
0.532 [3626/14/592] Linking CXX static library lib/libLibcTableGenUtil.a
0.532 [3625/14/593] Linking CXX static library lib/libllvm_gtest_main.a
0.532 [3625/13/594] Linking CXX static library lib/libLLVMDebugInfoCodeView.a
0.608 [3625/12/595] Linking CXX executable bin/UnicodeNameMappingGenerator
0.618 [3625/11/596] Linking CXX executable bin/split-file
0.642 [3625/10/597] Linking CXX executable bin/not
0.644 [3625/9/598] Linking CXX executable bin/reduce-chunk-list
0.648 [3625/8/599] Linking CXX executable bin/yaml-bench
0.668 [3625/7/600] Linking CXX executable bin/llvm-undname
0.675 [3625/6/601] Building CXX object projects/libc/utils/gpu/loader/amdgpu/CMakeFiles/amdhsa-loader.dir/Loader.cpp.o
FAILED: projects/libc/utils/gpu/loader/amdgpu/CMakeFiles/amdhsa-loader.dir/Loader.cpp.o 
ccache /usr/bin/c++ -DLIBC_COPT_ARRAY_ARG_LIST -DLIBC_COPT_PRINTF_DISABLE_INDEX_MODE -DLIBC_COPT_PRINTF_DISABLE_WRITE_INT -DLIBC_COPT_USE_C_ASSERT -DLIBC_NAMESPACE=__llvm_libc_19_0_0_git -I/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/projects/libc/utils/gpu/loader/amdgpu -I/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu -I/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader -I/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/include -I/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc -I/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/server -isystem /opt/rocm/include -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -fno-lifetime-dse -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wimplicit-fallthrough -Wno-uninitialized -Wno-nonnull -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wno-misleading-indentation -fdiagnostics-color -ffunction-sections -fdata-sections -O3 -DNDEBUG -UNDEBUG -Wno-attributes -std=c++17 -MD -MT projects/libc/utils/gpu/loader/amdgpu/CMakeFiles/amdhsa-loader.dir/Loader.cpp.o -MF projects/libc/utils/gpu/loader/amdgpu/CMakeFiles/amdhsa-loader.dir/Loader.cpp.o.d -o projects/libc/utils/gpu/loader/amdgpu/CMakeFiles/amdhsa-loader.dir/Loader.cpp.o -c /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp: In function ‘void print_kernel_resources(char*)’:
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:129:11: error: cannot convert ‘const char*’ to ‘FILE*’ {aka ‘_IO_FILE*’}
  129 |   fprintf("Kernel resources on AMDGPU is not supported yet.\n");
      |           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |           |
      |           const char*
In file included from /usr/include/stdio.h:867,
                 from /usr/include/c++/9/cstdio:42,
                 from /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/Loader.h:19,
                 from /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:16:
/usr/include/x86_64-linux-gnu/bits/stdio2.h:98:27: note:   initializing argument 1 of ‘int fprintf(FILE*, const char*, ...)’
   98 | fprintf (FILE *__restrict __stream, const char *__restrict __fmt, ...)
      |          ~~~~~~~~~~~~~~~~~^~~~~~~~
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp: In instantiation of ‘hsa_status_t launch_kernel(hsa_agent_t, hsa_executable_t, hsa_amd_memory_pool_t, hsa_amd_memory_pool_t, hsa_queue_t*, rpc_device_t, const LaunchParameters&, const char*, args_t, bool) [with args_t = begin_args_t; hsa_agent_t = hsa_agent_s; hsa_executable_t = hsa_executable_s; hsa_amd_memory_pool_t = hsa_amd_memory_pool_s; hsa_queue_t = hsa_queue_s; rpc_device_t = rpc_device_s]’:
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:559:71:   required from here
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:279:28: error: invalid conversion from ‘const char*’ to ‘char*’ [-fpermissive]
  279 |     print_kernel_resources(kernel_name);
      |                            ^~~~~~~~~~~
      |                            |
      |                            const char*
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:128:35: note:   initializing argument 1 of ‘void print_kernel_resources(char*)’
  128 | void print_kernel_resources(char *kernel_name) {
      |                             ~~~~~~^~~~~~~~~~~
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp: In instantiation of ‘hsa_status_t launch_kernel(hsa_agent_t, hsa_executable_t, hsa_amd_memory_pool_t, hsa_amd_memory_pool_t, hsa_queue_t*, rpc_device_t, const LaunchParameters&, const char*, args_t, bool) [with args_t = start_args_t; hsa_agent_t = hsa_agent_s; hsa_executable_t = hsa_executable_s; hsa_amd_memory_pool_t = hsa_amd_memory_pool_s; hsa_queue_t = hsa_queue_s; rpc_device_t = rpc_device_s]’:
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:565:66:   required from here
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:279:28: error: invalid conversion from ‘const char*’ to ‘char*’ [-fpermissive]
  279 |     print_kernel_resources(kernel_name);
      |                            ^~~~~~~~~~~
      |                            |
      |                            const char*
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:128:35: note:   initializing argument 1 of ‘void print_kernel_resources(char*)’
  128 | void print_kernel_resources(char *kernel_name) {
      |                             ~~~~~~^~~~~~~~~~~
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp: In instantiation of ‘hsa_status_t launch_kernel(hsa_agent_t, hsa_executable_t, hsa_amd_memory_pool_t, hsa_amd_memory_pool_t, hsa_queue_t*, rpc_device_t, const LaunchParameters&, const char*, args_t, bool) [with args_t = end_args_t; hsa_agent_t = hsa_agent_s; hsa_executable_t = hsa_executable_s; hsa_amd_memory_pool_t = hsa_amd_memory_pool_s; hsa_queue_t = hsa_queue_s; rpc_device_t = rpc_device_s]’:
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:586:71:   required from here
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/libc/utils/gpu/loader/amdgpu/Loader.cpp:279:28: error: invalid conversion from ‘const char*’ to ‘char*’ [-fpermissive]
  279 |     print_kernel_resources(kernel_name);

sgundapa pushed a commit to sgundapa/upstream_effort that referenced this pull request Jul 23, 2024
This PR allows `nvptx-loader` to read the resource usage of `_start`,
`_begin`, and `_end` when executing CUDA binaries.

Example output:
```
$ nvptx-loader --print-resource-usage libc/benchmarks/gpu/src/ctype/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__
[ RUN      ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper
[       OK ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper: 93 cycles, 76 min, 470 max, 23 iterations, 78000 ns, 80 stddev
_begin registers: 25
_start registers: 80
_end registers: 62
```

---------

Co-authored-by: Joseph Huber <[email protected]>
yuxuanchen1997 pushed a commit that referenced this pull request Jul 25, 2024
Summary:
This PR allows `nvptx-loader` to read the resource usage of `_start`,
`_begin`, and `_end` when executing CUDA binaries.

Example output:
```
$ nvptx-loader --print-resource-usage libc/benchmarks/gpu/src/ctype/libc.benchmarks.gpu.src.ctype.isalnum_benchmark.__build__
[ RUN      ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper
[       OK ] LlvmLibcIsAlNumGpuBenchmark.IsAlnumWrapper: 93 cycles, 76 min, 470 max, 23 iterations, 78000 ns, 80 stddev
_begin registers: 25
_start registers: 80
_end registers: 62
```

---------

Co-authored-by: Joseph Huber <[email protected]>

Differential Revision: https://phabricator.intern.facebook.com/D60251005