Improve AMD GPU support parity (asyncGpuComm, more math functions) (#…

…22791) This PR: * Updates things so the `asyncGpuComm` proc in the GPU module now works when using `CHPL_GPU=amd`. For some reason `hipify-perl` struggles to find the right API to translate the cuda code in our runtime lib for the underlying CUDA API call we make for this (hence why I didn't just get to work from the "get go". But replacing it with the "obvious" API (using `hipMemcpyAsync` in place of `cuMemcpyAsync`) seems to work. * Fixes a bug in how we filter out what extern functions we consider linking to when doing the GPU pass of code generation. We're supposed to limit ourselves to `__device__` functions but for some reason some non device math functions from the C++ standard lib were getting through and this was causing issues for us if we ever called the corresponding functions in GPU code. I've fixed things so these functions now filter out and we now support the same set of math functions as we do for NVIDIA. * Remove `test/gpu/native/studies/shoc/triadchpl.skipif`, I'm not sure why this was skipped before but it seems to work now. * Update our worksharing tests to avoid this bug (#22736) and remove the .skipif files so we now test them with `CHPL_GPU=amd` TODO: * [X] Paratest AMD * [x] Paratest NVIDIA * [X] Paratest NVIDIA (gasnet) * [x] Paratest AMD (AOD) [Reviewed by @DanilaFe]
chapel-lang · Jul 24, 2023 · 5044eca · 5044eca
2 parents 97bc17f + 98f7850
commit 5044eca
Show file tree

Hide file tree

Showing 9 changed files with 44 additions and 80 deletions.
diff --git a/compiler/llvm/clangUtil.cpp b/compiler/llvm/clangUtil.cpp
@@ -1315,8 +1315,12 @@ class CCodeGenConsumer final : public ASTConsumer {
 
     bool shouldHandleDecl(Decl* d) {
       if (gCodegenGPU) {
-        //this decl must have __device__
-        return d->hasAttr<CUDADeviceAttr>();
+        // this decl must have __device__ and it must be explicit (for some
+        // reason odd reason, with AMD code generation we see am implicit use of
+        // __device__ on some math functions in the C stdlib that really aren't
+        // supposed to have them and this causes linker issues later on).
+        return d->hasAttr<CUDADeviceAttr>() &&
+          !d->getAttr<CUDADeviceAttr>()->isImplicit();
       }
       else {
         // this decl either doesn't have __device__, or if it has, it also has a

diff --git a/runtime/src/gpu/amd/gpu-amd.c b/runtime/src/gpu/amd/gpu-amd.c
@@ -394,12 +394,10 @@ void chpl_gpu_impl_copy_device_to_device(void* dst, const void* src, size_t n) {
 
 
 void* chpl_gpu_impl_comm_async(void *dst, void *src, size_t n) {
-/*  hipStream_t stream;
+  hipStream_t stream;
   hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
-  cuMemcpyAsync((hipDeviceptr_t)dst, (hipDeviceptr_t)src, n, stream);
-  return stream;*/
-  assert(false);
-  return NULL;
+  hipMemcpyAsync((hipDeviceptr_t)dst, (hipDeviceptr_t)src, n, hipMemcpyDefault, stream);
+  return stream;
 }
 
 void chpl_gpu_impl_comm_wait(void *stream) {

diff --git a/test/gpu/native/array_on_device/studies/shoc/triadAodAsync.skipif b/test/gpu/native/array_on_device/studies/shoc/triadAodAsync.skipif
diff --git a/test/gpu/native/math.chpl b/test/gpu/native/math.chpl
@@ -82,40 +82,28 @@ foreach i in r do  R[0] = abs(c128);   R[1] = abs(c128);  check(R,R);
 */
 
 foreach i in r do  R[0] = acos(r32)  ;   R[1] = acos(r32);  check(R,"acos(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = acos(r64)  ;   R[1] = acos(r64);  check(R,"acos(r64)");
-}
+foreach i in r do  R[0] = acos(r64)  ;   R[1] = acos(r64);  check(R,"acos(r64)");
+
 foreach i in r do  R[0] = acosh(r32b);   R[1] = acosh(r32b);  check(R,"acosh(r32b)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = acosh(r64b);   R[1] = acosh(r64b);  check(R,"acosh(r64b)");
-}
+foreach i in r do  R[0] = acosh(r64b);   R[1] = acosh(r64b);  check(R,"acosh(r64b)");
 
 foreach i in r do  R[0] = asin(r32) ;   R[1] = asin(r32);  check(R,"asin(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = asin(r64) ;   R[1] = asin(r64);  check(R,"asin(r64)");
-}
+foreach i in r do  R[0] = asin(r64) ;   R[1] = asin(r64);  check(R,"asin(r64)");
+
 foreach i in r do  R[0] = asinh(r32);   R[1] = asinh(r32);  check(R,"asinh(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = asinh(r64);   R[1] = asinh(r64);  check(R,"asinh(r64)");
-}
+foreach i in r do  R[0] = asinh(r64);   R[1] = asinh(r64);  check(R,"asinh(r64)");
 
 foreach i in r do  R[0] = atan(r32)     ; R[1] = atan(r32);  check(R,"atan(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = atan(r64)     ; R[1] = atan(r64);  check(R,"atan(r64)");
-}
+foreach i in r do  R[0] = atan(r64)     ; R[1] = atan(r64);  check(R,"atan(r64)");
+
 foreach i in r do  R[0] = atan2(r32,r32); R[1] = atan2(r32,r32);  check(R,"atan2(r32,r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = atan2(r64,r64); R[1] = atan2(r64,r64);  check(R,"atan2(r64,r64)");
-}
+foreach i in r do  R[0] = atan2(r64,r64); R[1] = atan2(r64,r64);  check(R,"atan2(r64,r64)");
+
 foreach i in r do  R[0] = atanh(r32)    ; R[1] = atanh(r32);  check(R,"atanh(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = atanh(r64)    ; R[1] = atanh(r64);  check(R,"atanh(r64)");
-}
+foreach i in r do  R[0] = atanh(r64)    ; R[1] = atanh(r64);  check(R,"atanh(r64)");
 
 foreach i in r do  R[0] = cbrt(r32);   R[1] = cbrt(r32);  check(R,"cbrt(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = cbrt(r64);   R[1] = cbrt(r64);  check(R,"cbrt(r64)");
-}
+foreach i in r do  R[0] = cbrt(r64);   R[1] = cbrt(r64);  check(R,"cbrt(r64)");
 
 foreach i in r do  R[0] = ceil(r32);   R[1] = ceil(r32);  check(R,"ceil(r32)");
 foreach i in r do  R[0] = ceil(r64);   R[1] = ceil(r64);  check(R,"ceil(r64)");
@@ -136,9 +124,7 @@ foreach i in r do  R[0] = conjg(r64 ); R[1] = conjg(r64);  check(R,"conjg(r64)")
 foreach i in r do  R[0] =  cos(r32) ; R[1] =  cos(r32);  check(R,"cos(r32)");
 foreach i in r do  R[0] =  cos(r64) ; R[1] =  cos(r64);  check(R,"cos(r64)");
 foreach i in r do  R[0] =  cosh(r32); R[1] =  cosh(r32);  check(R,"cosh(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] =  cosh(r64); R[1] =  cosh(r64);  check(R,"cosh(r64)");
-}
+foreach i in r do  R[0] =  cosh(r64); R[1] =  cosh(r64);  check(R,"cosh(r64)");
 
 // the following are implemented in Chapel, but let's test them here for completeness
 foreach i in r do R[0] = divceil(i8 ,i8 );     R[1] = divceil(i8 ,i8 );     check(R,"divceil(i8,i8)");
@@ -167,13 +153,9 @@ foreach i in r do R[0] = divfloorpos(i32,i32); R[1] = divfloorpos(i32,i32); chec
 foreach i in r do R[0] = divfloorpos(i64,i64); R[1] = divfloorpos(i64,i64); check(R,"divfloorpos(i64,i64)");
 
 foreach i in r do  R[0] =  erf(r32) ;   R[1] =  erf(r32);  check(R,"erf(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] =  erf(r64) ;   R[1] =  erf(r64);  check(R,"erf(r64)");
-}
+foreach i in r do  R[0] =  erf(r64) ;   R[1] =  erf(r64);  check(R,"erf(r64)");
 foreach i in r do  R[0] =  erfc(r32);   R[1] =  erfc(r32);  check(R,"erfc(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] =  erfc(r64);   R[1] =  erfc(r64);  check(R,"erfc(r64)");
-}
+foreach i in r do  R[0] =  erfc(r64);   R[1] =  erfc(r64);  check(R,"erfc(r64)");
 foreach i in r do  R[0] =  exp(r32) ;   R[1] =  exp(r32);  check(R,"exp(r32)");
 foreach i in r do  R[0] =  exp(r64) ;   R[1] =  exp(r64);  check(R,"exp(r64)");
 foreach i in r do  R[0] =  exp2(r32);   R[1] =  exp2(r32);  check(R,"exp2(r32)");
@@ -197,14 +179,10 @@ foreach i in r do  B[0] =  isnan(r32);   B[1] =  isnan(r32);  check(B,"isnan(r32
 foreach i in r do  B[0] =  isnan(r64);   B[1] =  isnan(r64);  check(B,"isnan(r64)");
 
 foreach i in r do  R[0] = ldexp(r32,i32);   R[1] =  ldexp(r32,i32);  check(R,"ldexp(r32,i32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = ldexp(r64,i32);   R[1] =  ldexp(r64,i32);  check(R,"ldexp(r64,i32)");
-}
+foreach i in r do  R[0] = ldexp(r64,i32);   R[1] =  ldexp(r64,i32);  check(R,"ldexp(r64,i32)");
 
 foreach i in r do  R[0] = lgamma(r32);   R[1] =  lgamma(r32);  check(R,"lgamma(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = lgamma(r64);   R[1] =  lgamma(r64);  check(R,"lgamma(r64)");
-}
+foreach i in r do  R[0] = lgamma(r64);   R[1] =  lgamma(r64);  check(R,"lgamma(r64)");
 
 foreach i in r do  R[0] = log(r32);   R[1] =  log(r32);  check(R,"log(r32)");
 foreach i in r do  R[0] = log(r64);   R[1] =  log(r64);  check(R,"log(r64)");
@@ -213,9 +191,7 @@ foreach i in r do  R[0] = log10(r32);   R[1] =  log10(r32);  check(R,"log10(r32)
 foreach i in r do  R[0] = log10(r64);   R[1] =  log10(r64);  check(R,"log10(r64)");
 
 foreach i in r do  R[0] = log1p(r32);   R[1] =  log1p(r32);  check(R,"log1p(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] = log1p(r64);   R[1] =  log1p(r64);  check(R,"log1p(r64)");
-}
+foreach i in r do  R[0] = log1p(r64);   R[1] =  log1p(r64);  check(R,"log1p(r64)");
 
 /* TODO runtime support
 foreach i in r do  R[0] = logBasePow2(i8 ,2);   R[1] = logBasePow2(i8 ,2);  check(R,"logBasePow2(i8 ,2)");
@@ -295,26 +271,18 @@ foreach i in r do  B[0] =  signbit(r64);   B[1] =  signbit(r64);  check(B,"signb
 foreach i in r do  R[0] =  sin(r32);   R[1] =  sin(r32);  check(R,"sin(r32)");
 foreach i in r do  R[0] =  sin(r64);   R[1] =  sin(r64);  check(R,"sin(r64)");
 foreach i in r do  R[0] =  sinh(r32);   R[1] =  sinh(r32);  check(R,"sinh(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] =  sinh(r64);   R[1] =  sinh(r64);  check(R,"sinh(r64)");
-}
+foreach i in r do  R[0] =  sinh(r64);   R[1] =  sinh(r64);  check(R,"sinh(r64)");
 
 foreach i in r do  R[0] =  sqrt(r32);   R[1] =  sqrt(r32);  check(R,"sqrt(r32)");
 foreach i in r do  R[0] =  sqrt(r64);   R[1] =  sqrt(r64);  check(R,"sqrt(r64)");
 
 foreach i in r do  R[0] =  tan(r32);   R[1] =  tan(r32);  check(R,"tan(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] =  tan(r64);   R[1] =  tan(r64);  check(R,"tan(r64)");
-}
+foreach i in r do  R[0] =  tan(r64);   R[1] =  tan(r64);  check(R,"tan(r64)");
 foreach i in r do  R[0] =  tanh(r32);   R[1] =  tanh(r32);  check(R,"tanh(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] =  tanh(r64);   R[1] =  tanh(r64);  check(R,"tanh(r64)");
-}
+foreach i in r do  R[0] =  tanh(r64);   R[1] =  tanh(r64);  check(R,"tanh(r64)");
 
 foreach i in r do  R[0] =  tgamma(r32);   R[1] =  tgamma(r32);  check(R,"tgamma(r32)");
-if(!excludeForRocm) {
-  foreach i in r do  R[0] =  tgamma(r64);   R[1] =  tgamma(r64);  check(R,"tgamma(r64)");
-}
+foreach i in r do  R[0] =  tgamma(r64);   R[1] =  tgamma(r64);  check(R,"tgamma(r64)");
 
 foreach i in r do  R[0] =  trunc(r32);   R[1] =  trunc(r32);  check(R,"trunc(r32)");
 foreach i in r do  R[0] =  trunc(r64);   R[1] =  trunc(r64);  check(R,"trunc(r64)");

diff --git a/test/gpu/native/multiGPU/worksharing.chpl b/test/gpu/native/multiGPU/worksharing.chpl
@@ -66,8 +66,14 @@ assert(nLaunch == here.gpus.size*numIters);
 
 writeln(A);
 
-if validate then
-  assert(n*(1+alpha*2) == + reduce A);
+if validate {
+  // Reduction done "manually" due to this bug:
+  // https://github.com/chapel-lang/chapel/issues/22736
+  var AReduce = 0;
+  for a in A do AReduce += a;
+
+  assert(n*(1+alpha*2) == AReduce);
+}
 
 if printStats {
   writeln("Performance (GB/s) = ", 3* numBytes(int) * n * 1e-9 / minTime );

diff --git a/test/gpu/native/multiGPU/worksharing.skipif b/test/gpu/native/multiGPU/worksharing.skipif
diff --git a/test/gpu/native/multiGPU/worksharingBasic.chpl b/test/gpu/native/multiGPU/worksharingBasic.chpl
@@ -32,4 +32,8 @@ writeln(A);
 const nLaunch = getGpuDiagnostics().kernel_launch;
 
 assert(nLaunch == here.gpus.size);
-assert((+ reduce A) == n);
+// Reduction done "manually" due to this bug:
+// https://github.com/chapel-lang/chapel/issues/22736
+var AReduce = 0;
+for a in A do AReduce += a;
+assert(AReduce == n);
diff --git a/test/gpu/native/multiGPU/worksharingBasic.skipif b/test/gpu/native/multiGPU/worksharingBasic.skipif
diff --git a/test/gpu/native/studies/shoc/triadchpl.skipif b/test/gpu/native/studies/shoc/triadchpl.skipif