From 365c962995801cf43893f05b8d9950906adf082b Mon Sep 17 00:00:00 2001 From: jameshu15869 <55058507+jameshu15869@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:18:23 -0500 Subject: [PATCH] [libc] Add Multithreaded GPU Benchmarks (#98964) Summary: This PR runs benchmarks on 32 threads (a single warp on NVPTX) by default and adds the option for single-threaded benchmarks. We can specify that a benchmark should be run on a single thread using the `SINGLE_THREADED_BENCHMARK()` macro. I chose to use a flag here so that other options could be added in the future. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60250873 --- libc/benchmarks/gpu/CMakeLists.txt | 1 + libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 9 +++++++-- libc/benchmarks/gpu/LibcGpuBenchmark.h | 19 ++++++++++++++++--- libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 2 ++ .../gpu/src/ctype/isalnum_benchmark.cpp | 4 ++++ 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 14ba9f3f64b481..29e27724e1ab39 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -10,6 +10,7 @@ function(add_benchmark benchmark_name) "LINK_LIBRARIES" # Multi-value arguments ${ARGN} ) + if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) message(FATAL_ERROR "target does not support clock") endif() diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 23fff3e8180f7d..c926d8efd7db2b 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -114,8 +114,13 @@ void Benchmark::run_benchmarks() { all_results.reset(); gpu::sync_threads(); - auto current_result = b->run(); - all_results.update(current_result); + if (!b->flags || + ((b->flags & BenchmarkFlags::SINGLE_THREADED) && id == 0) || + ((b->flags & BenchmarkFlags::SINGLE_WAVE) && + id < 
gpu::get_lane_size())) { + auto current_result = b->run(); + all_results.update(current_result); + } gpu::sync_threads(); if (id == 0) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 1f813f8655de6a..29d7ba8b0a1323 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -74,16 +74,19 @@ struct BenchmarkResult { clock_t total_time = 0; }; +enum BenchmarkFlags { SINGLE_THREADED = 0x1, SINGLE_WAVE = 0x2 }; + BenchmarkResult benchmark(const BenchmarkOptions &options, cpp::function wrapper_func); class Benchmark { const cpp::function func; const cpp::string_view name; + const uint8_t flags; public: - Benchmark(cpp::function func, char const *name) - : func(func), name(name) { + Benchmark(cpp::function func, char const *name, uint8_t flags) + : func(func), name(name), flags(flags) { add_benchmark(this); } @@ -104,6 +107,16 @@ class Benchmark { #define BENCHMARK(SuiteName, TestName, Func) \ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ - Func, #SuiteName "." #TestName) + Func, #SuiteName "." #TestName, 0) + +#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \ + LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ + Func, #SuiteName "." #TestName, \ + LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_THREADED) + +#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ + LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ + Func, #SuiteName "." 
#TestName, \ + LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_WAVE) #endif diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt index 79f01425770da9..f277624dbb9016 100644 --- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt @@ -8,6 +8,8 @@ add_benchmark( isalnum_benchmark.cpp DEPENDS libc.src.ctype.isalnum + LOADER_ARGS + --threads 64 ) add_benchmark( diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp index 6f8d247902f764..ffa5a99860bfc0 100644 --- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp +++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp @@ -7,6 +7,10 @@ uint64_t BM_IsAlnum() { return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x); } BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnum, BM_IsAlnum); +SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread, + BM_IsAlnum); +SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave, + BM_IsAlnum); uint64_t BM_IsAlnumCapital() { char x = 'A';