diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index a434a0089d5f94..86df4584db0914 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3264,8 +3264,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy { } if (DeviceNode != Node) continue; - + void *DevicePtr = (void *)Event->memory_fault.virtual_address; + std::string S; + llvm::raw_string_ostream OS(S); + OS << llvm::format("Memory access fault by GPU %" PRIu32 + " (agent 0x%" PRIx64 + ") at virtual address %p. Reasons: %s", + Node, Event->memory_fault.agent.handle, + (void *)Event->memory_fault.virtual_address, + llvm::join(Reasons, ", ").c_str()); ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord); + ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S, + /*Abort*/ true); } // Abort the execution since we do not recover from this error. diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h index e557b32c2c24f8..8478977a8f86af 100644 --- a/offload/plugins-nextgen/common/include/ErrorReporting.h +++ b/offload/plugins-nextgen/common/include/ErrorReporting.h @@ -157,10 +157,13 @@ class ErrorReporter { if (ATI->HostPtr) print(BoldLightPurple, - "Last allocation of size %lu for host pointer %p:\n", ATI->Size, - ATI->HostPtr); + "Last allocation of size %lu for host pointer %p -> device pointer " + "%p:\n", + ATI->Size, ATI->HostPtr, ATI->DevicePtr); else - print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size); + print(BoldLightPurple, + "Last allocation of size %lu -> device pointer %p:\n", ATI->Size, + ATI->DevicePtr); reportStackTrace(ATI->AllocationTrace); if (!ATI->LastAllocationInfo) return; @@ -174,10 +177,13 @@ class ErrorReporter { ATI->Size); reportStackTrace(ATI->DeallocationTrace); if (ATI->HostPtr) - print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n", - I, ATI->HostPtr); + print( + BoldLightPurple, + " #%u Prior allocation for host pointer %p -> device pointer %p:\n", + I, ATI->HostPtr, ATI->DevicePtr); else - print(BoldLightPurple, " #%u Prior allocation:\n", I); + print(BoldLightPurple, " #%u Prior allocation -> device pointer %p:\n", + I, ATI->DevicePtr); reportStackTrace(ATI->AllocationTrace); ++I; } @@ -219,6 +225,55 @@ class ErrorReporter { #undef DEALLOCATION_ERROR } + static void reportMemoryAccessError(GenericDeviceTy &Device, void *DevicePtr, + std::string &ErrorStr, bool Abort) { + reportError(ErrorStr.c_str()); + + if (!Device.OMPX_TrackAllocationTraces) { + print(Yellow, "Use '%s=true' to track device allocations\n", + Device.OMPX_TrackAllocationTraces.getName().data()); + if (Abort) + abortExecution(); + return; + } + uintptr_t Distance = false; + auto *ATI = + Device.getClosestAllocationTraceInfoForAddr(DevicePtr, Distance); + if (!ATI) { + print(Cyan, + "No host-issued allocations; device pointer %p might be " + "a global, stack, or shared location\n", + DevicePtr); + if (Abort) + abortExecution(); + return; + } + if (!Distance) { + print(Cyan, "Device pointer %p points into%s host-issued allocation:\n", + DevicePtr, ATI->DeallocationTrace.empty() ? "" : " prior"); + reportAllocationInfo(ATI); + if (Abort) + abortExecution(); + return; + } + + bool IsClose = Distance < (1L << 29L /*512MB=*/); + print(Cyan, + "Device pointer %p does not point into any (current or prior) " + "host-issued allocation%s.\n", + DevicePtr, + IsClose ? "" : " (might be a global, stack, or shared location)"); + if (IsClose) { + print(Cyan, + "Closest host-issued allocation (distance %" PRIuPTR + " byte%s; might be by page):\n", + Distance, Distance > 1 ? "s" : ""); + reportAllocationInfo(ATI); + } + if (Abort) + abortExecution(); + } + /// Report that a kernel encountered a trap instruction. static void reportTrapInKernel( GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR, diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 81823338fe2112..7e3e788fa52dc9 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -938,6 +938,42 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// been deallocated, both for error reporting purposes. ProtectedObj> AllocationTraces; + /// Return the allocation trace info for a device pointer, that is the + /// allocation into which this device pointer points to (or pointed into). + AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) { + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + for (auto &It : *AllocationTraceMap) { + if (It.first <= DevicePtr && + advanceVoidPtr(It.first, It.second->Size) > DevicePtr) + return It.second; + } + return nullptr; + } + + /// Return the allocation trace info for a device pointer, that is the + /// allocation into which this device pointer points to (or pointed into). + AllocationTraceInfoTy * + getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) { + Distance = 0; + if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) { + return ATI; + } + + AllocationTraceInfoTy *ATI = nullptr; + uintptr_t DevicePtrI = uintptr_t(DevicePtr); + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + for (auto &It : *AllocationTraceMap) { + uintptr_t Begin = uintptr_t(It.second->DevicePtr); + uintptr_t End = Begin + It.second->Size - 1; + uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End); + if (ATI && ItDistance > Distance) + continue; + ATI = It.second; + Distance = ItDistance; + } + return ATI; + } + /// Map to record kernel have been launchedl, for error reporting purposes. ProtectedObj KernelLaunchTraces; @@ -946,6 +982,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_TrackNumKernelLaunches = UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0); + /// Environment variable to determine if stack traces for allocations and + /// deallocations are tracked. + BoolEnvar OMPX_TrackAllocationTraces = + BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false); + private: /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output @@ -996,11 +1037,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_InitialNumStreams; UInt32Envar OMPX_InitialNumEvents; - /// Environment variable to determine if stack traces for allocations and - /// deallocations are tracked. - BoolEnvar OMPX_TrackAllocationTraces = - BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false); - /// Array of images loaded into the device. Images are automatically /// deallocated by the allocator. llvm::SmallVector LoadedImages; diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c index ca7310e34fc9d0..a3d8b06f1c7381 100644 --- a/offload/test/sanitizer/double_free.c +++ b/offload/test/sanitizer/double_free.c @@ -36,7 +36,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:24 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main @@ -49,7 +49,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:22 // -// CHECK: #0 Prior allocation: +// CHECK: #0 Prior allocation -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main @@ -61,7 +61,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:20 // -// CHECK: #1 Prior allocation: +// CHECK: #1 Prior allocation -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main diff --git a/offload/test/sanitizer/double_free_racy.c b/offload/test/sanitizer/double_free_racy.c index 3b4f2d5c51571c..4ebd8f36efa10c 100644 --- a/offload/test/sanitizer/double_free_racy.c +++ b/offload/test/sanitizer/double_free_racy.c @@ -28,6 +28,6 @@ int main(void) { // CHECK: dataDelete // CHECK: omp_target_free -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.c b/offload/test/sanitizer/free_wrong_ptr_kind.c index 0c178541db1170..7c5a4ff7085024 100644 --- a/offload/test/sanitizer/free_wrong_ptr_kind.c +++ b/offload/test/sanitizer/free_wrong_ptr_kind.c @@ -28,7 +28,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}free_wrong_ptr_kind.c:22 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: llvm_omp_target_alloc_host // NDEBG: main diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.cpp b/offload/test/sanitizer/free_wrong_ptr_kind.cpp index 87a52c5d4baf23..7ebb8c438433a9 100644 --- a/offload/test/sanitizer/free_wrong_ptr_kind.cpp +++ b/offload/test/sanitizer/free_wrong_ptr_kind.cpp @@ -31,7 +31,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:25 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: llvm_omp_target_alloc_shared // NDEBG: main diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c new file mode 100644 index 00000000000000..38742b783e8e9b --- /dev/null +++ b/offload/test/sanitizer/ptr_outside_alloc_1.c @@ -0,0 +1,40 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); +void llvm_omp_target_free_host(void *Ptr, int DeviceNum); + +int main() { + int N = (1 << 30); + char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device()); + char *P; +#pragma omp target map(from : P) + { + P = &A[0]; + *P = 3; + } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations +// TRACE: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation. +// TRACE: Closest host-issued allocation (distance 4096 bytes; might be by page): +// TRACE: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { P[-4] = 5; } + + llvm_omp_target_free_host(A, omp_get_default_device()); +} diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c new file mode 100644 index 00000000000000..ac47c8922f09ef --- /dev/null +++ b/offload/test/sanitizer/ptr_outside_alloc_2.c @@ -0,0 +1,26 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +int main() { + int N = (1 << 30); + char *A = (char *)malloc(N); +#pragma omp target map(A[ : N]) + { A[N] = 3; } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// CHECK: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation. +// CHECK: Closest host-issued allocation (distance 1 byte; might be by page): +// CHECK: Last allocation of size 1073741824 +// clang-format on +} diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c new file mode 100644 index 00000000000000..cebcdee1803475 --- /dev/null +++ b/offload/test/sanitizer/use_after_free_1.c @@ -0,0 +1,39 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); +void llvm_omp_target_free_host(void *Ptr, int DeviceNum); + +int main() { + int N = (1 << 30); + char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device()); + char *P; +#pragma omp target map(from : P) + { + P = &A[N / 2]; + *P = 3; + } + llvm_omp_target_free_host(A, omp_get_default_device()); + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations +// TRACE: Device pointer [[PTR]] points into prior host-issued allocation: +// TRACE: Last deallocation: +// TRACE: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { *P = 5; } +} diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c new file mode 100644 index 00000000000000..587d04a6ff3528 --- /dev/null +++ b/offload/test/sanitizer/use_after_free_2.c @@ -0,0 +1,32 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +int main() { + int N = (1 << 30); + char *A = (char *)malloc(N); + char *P; +#pragma omp target map(A[ : N]) map(from : P) + { + P = &A[N / 2]; + *P = 3; + } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// CHECK: Device pointer [[PTR]] points into prior host-issued allocation: +// CHECK: Last deallocation: +// CHECK: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { *P = 5; } +}