Skip to content

Commit

Permalink
Improve AMD GPU support parity (asyncGpuComm, more math functions) (#…
Browse files Browse the repository at this point in the history
…22791)

This PR:

* Updates things so the `asyncGpuComm` proc in the GPU module now works
when using `CHPL_GPU=amd`. For some reason `hipify-perl` struggles to
find the right API to translate the cuda code in our runtime lib for the
underlying CUDA API call we make for this (hence why I didn't just get
to work from the "get go". But replacing it with the "obvious" API
(using `hipMemcpyAsync` in place of `cuMemcpyAsync`) seems to work.

* Fixes a bug in how we filter out what extern functions we consider
linking to when doing the GPU pass of code generation. We're supposed to
limit ourselves to `__device__` functions but for some reason some non
device math functions from the C++ standard lib were getting through and
this was causing issues for us if we ever called the corresponding
functions in GPU code. I've fixed things so these functions now filter
out and we now support the same set of math functions as we do for
NVIDIA.

* Remove `test/gpu/native/studies/shoc/triadchpl.skipif`, I'm not sure
why this was skipped before but it seems to work now.
* Update our worksharing tests to avoid this bug
(#22736) and remove the
.skipif files so we now test them with `CHPL_GPU=amd`

TODO:
* [X] Paratest AMD
* [x] Paratest NVIDIA
* [X] Paratest NVIDIA (gasnet)
* [x] Paratest AMD (AOD)

[Reviewed by @DanilaFe]
  • Loading branch information
stonea authored Jul 24, 2023
2 parents 97bc17f + 98f7850 commit 5044eca
Show file tree
Hide file tree
Showing 9 changed files with 44 additions and 80 deletions.
8 changes: 6 additions & 2 deletions compiler/llvm/clangUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1315,8 +1315,12 @@ class CCodeGenConsumer final : public ASTConsumer {

bool shouldHandleDecl(Decl* d) {
if (gCodegenGPU) {
//this decl must have __device__
return d->hasAttr<CUDADeviceAttr>();
// this decl must have __device__ and it must be explicit (for some
// reason odd reason, with AMD code generation we see am implicit use of
// __device__ on some math functions in the C stdlib that really aren't
// supposed to have them and this causes linker issues later on).
return d->hasAttr<CUDADeviceAttr>() &&
!d->getAttr<CUDADeviceAttr>()->isImplicit();
}
else {
// this decl either doesn't have __device__, or if it has, it also has a
Expand Down
8 changes: 3 additions & 5 deletions runtime/src/gpu/amd/gpu-amd.c
Original file line number Diff line number Diff line change
Expand Up @@ -394,12 +394,10 @@ void chpl_gpu_impl_copy_device_to_device(void* dst, const void* src, size_t n) {


void* chpl_gpu_impl_comm_async(void *dst, void *src, size_t n) {
/* hipStream_t stream;
hipStream_t stream;
hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
cuMemcpyAsync((hipDeviceptr_t)dst, (hipDeviceptr_t)src, n, stream);
return stream;*/
assert(false);
return NULL;
hipMemcpyAsync((hipDeviceptr_t)dst, (hipDeviceptr_t)src, n, hipMemcpyDefault, stream);
return stream;
}

void chpl_gpu_impl_comm_wait(void *stream) {
Expand Down

This file was deleted.

76 changes: 22 additions & 54 deletions test/gpu/native/math.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -82,40 +82,28 @@ foreach i in r do R[0] = abs(c128); R[1] = abs(c128); check(R,R);
*/

foreach i in r do R[0] = acos(r32) ; R[1] = acos(r32); check(R,"acos(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = acos(r64) ; R[1] = acos(r64); check(R,"acos(r64)");
}
foreach i in r do R[0] = acos(r64) ; R[1] = acos(r64); check(R,"acos(r64)");

foreach i in r do R[0] = acosh(r32b); R[1] = acosh(r32b); check(R,"acosh(r32b)");
if(!excludeForRocm) {
foreach i in r do R[0] = acosh(r64b); R[1] = acosh(r64b); check(R,"acosh(r64b)");
}
foreach i in r do R[0] = acosh(r64b); R[1] = acosh(r64b); check(R,"acosh(r64b)");

foreach i in r do R[0] = asin(r32) ; R[1] = asin(r32); check(R,"asin(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = asin(r64) ; R[1] = asin(r64); check(R,"asin(r64)");
}
foreach i in r do R[0] = asin(r64) ; R[1] = asin(r64); check(R,"asin(r64)");

foreach i in r do R[0] = asinh(r32); R[1] = asinh(r32); check(R,"asinh(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = asinh(r64); R[1] = asinh(r64); check(R,"asinh(r64)");
}
foreach i in r do R[0] = asinh(r64); R[1] = asinh(r64); check(R,"asinh(r64)");

foreach i in r do R[0] = atan(r32) ; R[1] = atan(r32); check(R,"atan(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = atan(r64) ; R[1] = atan(r64); check(R,"atan(r64)");
}
foreach i in r do R[0] = atan(r64) ; R[1] = atan(r64); check(R,"atan(r64)");

foreach i in r do R[0] = atan2(r32,r32); R[1] = atan2(r32,r32); check(R,"atan2(r32,r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = atan2(r64,r64); R[1] = atan2(r64,r64); check(R,"atan2(r64,r64)");
}
foreach i in r do R[0] = atan2(r64,r64); R[1] = atan2(r64,r64); check(R,"atan2(r64,r64)");

foreach i in r do R[0] = atanh(r32) ; R[1] = atanh(r32); check(R,"atanh(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = atanh(r64) ; R[1] = atanh(r64); check(R,"atanh(r64)");
}
foreach i in r do R[0] = atanh(r64) ; R[1] = atanh(r64); check(R,"atanh(r64)");

foreach i in r do R[0] = cbrt(r32); R[1] = cbrt(r32); check(R,"cbrt(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = cbrt(r64); R[1] = cbrt(r64); check(R,"cbrt(r64)");
}
foreach i in r do R[0] = cbrt(r64); R[1] = cbrt(r64); check(R,"cbrt(r64)");

foreach i in r do R[0] = ceil(r32); R[1] = ceil(r32); check(R,"ceil(r32)");
foreach i in r do R[0] = ceil(r64); R[1] = ceil(r64); check(R,"ceil(r64)");
Expand All @@ -136,9 +124,7 @@ foreach i in r do R[0] = conjg(r64 ); R[1] = conjg(r64); check(R,"conjg(r64)")
foreach i in r do R[0] = cos(r32) ; R[1] = cos(r32); check(R,"cos(r32)");
foreach i in r do R[0] = cos(r64) ; R[1] = cos(r64); check(R,"cos(r64)");
foreach i in r do R[0] = cosh(r32); R[1] = cosh(r32); check(R,"cosh(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = cosh(r64); R[1] = cosh(r64); check(R,"cosh(r64)");
}
foreach i in r do R[0] = cosh(r64); R[1] = cosh(r64); check(R,"cosh(r64)");

// the following are implemented in Chapel, but let's test them here for completeness
foreach i in r do R[0] = divceil(i8 ,i8 ); R[1] = divceil(i8 ,i8 ); check(R,"divceil(i8,i8)");
Expand Down Expand Up @@ -167,13 +153,9 @@ foreach i in r do R[0] = divfloorpos(i32,i32); R[1] = divfloorpos(i32,i32); chec
foreach i in r do R[0] = divfloorpos(i64,i64); R[1] = divfloorpos(i64,i64); check(R,"divfloorpos(i64,i64)");

foreach i in r do R[0] = erf(r32) ; R[1] = erf(r32); check(R,"erf(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = erf(r64) ; R[1] = erf(r64); check(R,"erf(r64)");
}
foreach i in r do R[0] = erf(r64) ; R[1] = erf(r64); check(R,"erf(r64)");
foreach i in r do R[0] = erfc(r32); R[1] = erfc(r32); check(R,"erfc(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = erfc(r64); R[1] = erfc(r64); check(R,"erfc(r64)");
}
foreach i in r do R[0] = erfc(r64); R[1] = erfc(r64); check(R,"erfc(r64)");
foreach i in r do R[0] = exp(r32) ; R[1] = exp(r32); check(R,"exp(r32)");
foreach i in r do R[0] = exp(r64) ; R[1] = exp(r64); check(R,"exp(r64)");
foreach i in r do R[0] = exp2(r32); R[1] = exp2(r32); check(R,"exp2(r32)");
Expand All @@ -197,14 +179,10 @@ foreach i in r do B[0] = isnan(r32); B[1] = isnan(r32); check(B,"isnan(r32
foreach i in r do B[0] = isnan(r64); B[1] = isnan(r64); check(B,"isnan(r64)");

foreach i in r do R[0] = ldexp(r32,i32); R[1] = ldexp(r32,i32); check(R,"ldexp(r32,i32)");
if(!excludeForRocm) {
foreach i in r do R[0] = ldexp(r64,i32); R[1] = ldexp(r64,i32); check(R,"ldexp(r64,i32)");
}
foreach i in r do R[0] = ldexp(r64,i32); R[1] = ldexp(r64,i32); check(R,"ldexp(r64,i32)");

foreach i in r do R[0] = lgamma(r32); R[1] = lgamma(r32); check(R,"lgamma(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = lgamma(r64); R[1] = lgamma(r64); check(R,"lgamma(r64)");
}
foreach i in r do R[0] = lgamma(r64); R[1] = lgamma(r64); check(R,"lgamma(r64)");

foreach i in r do R[0] = log(r32); R[1] = log(r32); check(R,"log(r32)");
foreach i in r do R[0] = log(r64); R[1] = log(r64); check(R,"log(r64)");
Expand All @@ -213,9 +191,7 @@ foreach i in r do R[0] = log10(r32); R[1] = log10(r32); check(R,"log10(r32)
foreach i in r do R[0] = log10(r64); R[1] = log10(r64); check(R,"log10(r64)");

foreach i in r do R[0] = log1p(r32); R[1] = log1p(r32); check(R,"log1p(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = log1p(r64); R[1] = log1p(r64); check(R,"log1p(r64)");
}
foreach i in r do R[0] = log1p(r64); R[1] = log1p(r64); check(R,"log1p(r64)");

/* TODO runtime support
foreach i in r do R[0] = logBasePow2(i8 ,2); R[1] = logBasePow2(i8 ,2); check(R,"logBasePow2(i8 ,2)");
Expand Down Expand Up @@ -295,26 +271,18 @@ foreach i in r do B[0] = signbit(r64); B[1] = signbit(r64); check(B,"signb
foreach i in r do R[0] = sin(r32); R[1] = sin(r32); check(R,"sin(r32)");
foreach i in r do R[0] = sin(r64); R[1] = sin(r64); check(R,"sin(r64)");
foreach i in r do R[0] = sinh(r32); R[1] = sinh(r32); check(R,"sinh(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = sinh(r64); R[1] = sinh(r64); check(R,"sinh(r64)");
}
foreach i in r do R[0] = sinh(r64); R[1] = sinh(r64); check(R,"sinh(r64)");

foreach i in r do R[0] = sqrt(r32); R[1] = sqrt(r32); check(R,"sqrt(r32)");
foreach i in r do R[0] = sqrt(r64); R[1] = sqrt(r64); check(R,"sqrt(r64)");

foreach i in r do R[0] = tan(r32); R[1] = tan(r32); check(R,"tan(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = tan(r64); R[1] = tan(r64); check(R,"tan(r64)");
}
foreach i in r do R[0] = tan(r64); R[1] = tan(r64); check(R,"tan(r64)");
foreach i in r do R[0] = tanh(r32); R[1] = tanh(r32); check(R,"tanh(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = tanh(r64); R[1] = tanh(r64); check(R,"tanh(r64)");
}
foreach i in r do R[0] = tanh(r64); R[1] = tanh(r64); check(R,"tanh(r64)");

foreach i in r do R[0] = tgamma(r32); R[1] = tgamma(r32); check(R,"tgamma(r32)");
if(!excludeForRocm) {
foreach i in r do R[0] = tgamma(r64); R[1] = tgamma(r64); check(R,"tgamma(r64)");
}
foreach i in r do R[0] = tgamma(r64); R[1] = tgamma(r64); check(R,"tgamma(r64)");

foreach i in r do R[0] = trunc(r32); R[1] = trunc(r32); check(R,"trunc(r32)");
foreach i in r do R[0] = trunc(r64); R[1] = trunc(r64); check(R,"trunc(r64)");
Expand Down
10 changes: 8 additions & 2 deletions test/gpu/native/multiGPU/worksharing.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,14 @@ assert(nLaunch == here.gpus.size*numIters);

writeln(A);

if validate then
assert(n*(1+alpha*2) == + reduce A);
if validate {
// Reduction done "manually" due to this bug:
// https://github.com/chapel-lang/chapel/issues/22736
var AReduce = 0;
for a in A do AReduce += a;

assert(n*(1+alpha*2) == AReduce);
}

if printStats {
writeln("Performance (GB/s) = ", 3* numBytes(int) * n * 1e-9 / minTime );
Expand Down
5 changes: 0 additions & 5 deletions test/gpu/native/multiGPU/worksharing.skipif

This file was deleted.

6 changes: 5 additions & 1 deletion test/gpu/native/multiGPU/worksharingBasic.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,8 @@ writeln(A);
const nLaunch = getGpuDiagnostics().kernel_launch;

assert(nLaunch == here.gpus.size);
assert((+ reduce A) == n);
// Reduction done "manually" due to this bug:
// https://github.com/chapel-lang/chapel/issues/22736
var AReduce = 0;
for a in A do AReduce += a;
assert(AReduce == n);
5 changes: 0 additions & 5 deletions test/gpu/native/multiGPU/worksharingBasic.skipif

This file was deleted.

5 changes: 0 additions & 5 deletions test/gpu/native/studies/shoc/triadchpl.skipif

This file was deleted.

0 comments on commit 5044eca

Please sign in to comment.