From 445e9dbdeea3e7de38e4933e21f87d01c87c03e3 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Wed, 21 Aug 2024 10:01:35 -0700 Subject: [PATCH] [Offload] Improve error reporting on memory faults (#104254) Since we can already track allocations, we can diagnose memory faults to some degree. If the fault happens in a prior allocation (use after free) or "close but outside" one, we can provide that information to the user. Note that the fault address might be page aligned, and not all accesses trigger a fault, especially for allocations that are backed by a MemoryManager. Still, if people disable the MemoryManager or the allocation is big enough, we can sometimes provide valueable feedback. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 12 +++- .../common/include/ErrorReporting.h | 67 +++++++++++++++++-- .../common/include/PluginInterface.h | 46 +++++++++++-- offload/test/sanitizer/double_free.c | 6 +- offload/test/sanitizer/double_free_racy.c | 2 +- offload/test/sanitizer/free_wrong_ptr_kind.c | 2 +- .../test/sanitizer/free_wrong_ptr_kind.cpp | 2 +- offload/test/sanitizer/ptr_outside_alloc_1.c | 40 +++++++++++ offload/test/sanitizer/ptr_outside_alloc_2.c | 26 +++++++ offload/test/sanitizer/use_after_free_1.c | 39 +++++++++++ offload/test/sanitizer/use_after_free_2.c | 32 +++++++++ 11 files changed, 256 insertions(+), 18 deletions(-) create mode 100644 offload/test/sanitizer/ptr_outside_alloc_1.c create mode 100644 offload/test/sanitizer/ptr_outside_alloc_2.c create mode 100644 offload/test/sanitizer/use_after_free_1.c create mode 100644 offload/test/sanitizer/use_after_free_2.c diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index a434a0089d5f94..86df4584db0914 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3264,8 +3264,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy { } if (DeviceNode != Node) continue; - + void *DevicePtr = (void *)Event->memory_fault.virtual_address; + std::string S; + llvm::raw_string_ostream OS(S); + OS << llvm::format("Memory access fault by GPU %" PRIu32 + " (agent 0x%" PRIx64 + ") at virtual address %p. Reasons: %s", + Node, Event->memory_fault.agent.handle, + (void *)Event->memory_fault.virtual_address, + llvm::join(Reasons, ", ").c_str()); ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord); + ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S, + /*Abort*/ true); } // Abort the execution since we do not recover from this error. diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h index e557b32c2c24f8..8478977a8f86af 100644 --- a/offload/plugins-nextgen/common/include/ErrorReporting.h +++ b/offload/plugins-nextgen/common/include/ErrorReporting.h @@ -157,10 +157,13 @@ class ErrorReporter { if (ATI->HostPtr) print(BoldLightPurple, - "Last allocation of size %lu for host pointer %p:\n", ATI->Size, - ATI->HostPtr); + "Last allocation of size %lu for host pointer %p -> device pointer " + "%p:\n", + ATI->Size, ATI->HostPtr, ATI->DevicePtr); else - print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size); + print(BoldLightPurple, + "Last allocation of size %lu -> device pointer %p:\n", ATI->Size, + ATI->DevicePtr); reportStackTrace(ATI->AllocationTrace); if (!ATI->LastAllocationInfo) return; @@ -174,10 +177,13 @@ class ErrorReporter { ATI->Size); reportStackTrace(ATI->DeallocationTrace); if (ATI->HostPtr) - print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n", - I, ATI->HostPtr); + print( + BoldLightPurple, + " #%u Prior allocation for host pointer %p -> device pointer %p:\n", + I, ATI->HostPtr, ATI->DevicePtr); else - print(BoldLightPurple, " #%u Prior allocation:\n", I); + print(BoldLightPurple, " #%u Prior allocation -> device pointer %p:\n", + I, ATI->DevicePtr); reportStackTrace(ATI->AllocationTrace); ++I; } @@ -219,6 +225,55 @@ class ErrorReporter { #undef DEALLOCATION_ERROR } + static void reportMemoryAccessError(GenericDeviceTy &Device, void *DevicePtr, + std::string &ErrorStr, bool Abort) { + reportError(ErrorStr.c_str()); + + if (!Device.OMPX_TrackAllocationTraces) { + print(Yellow, "Use '%s=true' to track device allocations\n", + Device.OMPX_TrackAllocationTraces.getName().data()); + if (Abort) + abortExecution(); + return; + } + uintptr_t Distance = false; + auto *ATI = + Device.getClosestAllocationTraceInfoForAddr(DevicePtr, Distance); + if (!ATI) { + print(Cyan, + "No host-issued allocations; device pointer %p might be " + "a global, stack, or shared location\n", + DevicePtr); + if (Abort) + abortExecution(); + return; + } + if (!Distance) { + print(Cyan, "Device pointer %p points into%s host-issued allocation:\n", + DevicePtr, ATI->DeallocationTrace.empty() ? "" : " prior"); + reportAllocationInfo(ATI); + if (Abort) + abortExecution(); + return; + } + + bool IsClose = Distance < (1L << 29L /*512MB=*/); + print(Cyan, + "Device pointer %p does not point into any (current or prior) " + "host-issued allocation%s.\n", + DevicePtr, + IsClose ? "" : " (might be a global, stack, or shared location)"); + if (IsClose) { + print(Cyan, + "Closest host-issued allocation (distance %" PRIuPTR + " byte%s; might be by page):\n", + Distance, Distance > 1 ? "s" : ""); + reportAllocationInfo(ATI); + } + if (Abort) + abortExecution(); + } + /// Report that a kernel encountered a trap instruction. static void reportTrapInKernel( GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR, diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 81823338fe2112..7e3e788fa52dc9 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -938,6 +938,42 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// been deallocated, both for error reporting purposes. ProtectedObj> AllocationTraces; + /// Return the allocation trace info for a device pointer, that is the + /// allocation into which this device pointer points to (or pointed into). + AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) { + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + for (auto &It : *AllocationTraceMap) { + if (It.first <= DevicePtr && + advanceVoidPtr(It.first, It.second->Size) > DevicePtr) + return It.second; + } + return nullptr; + } + + /// Return the allocation trace info for a device pointer, that is the + /// allocation into which this device pointer points to (or pointed into). + AllocationTraceInfoTy * + getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) { + Distance = 0; + if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) { + return ATI; + } + + AllocationTraceInfoTy *ATI = nullptr; + uintptr_t DevicePtrI = uintptr_t(DevicePtr); + auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor(); + for (auto &It : *AllocationTraceMap) { + uintptr_t Begin = uintptr_t(It.second->DevicePtr); + uintptr_t End = Begin + It.second->Size - 1; + uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End); + if (ATI && ItDistance > Distance) + continue; + ATI = It.second; + Distance = ItDistance; + } + return ATI; + } + /// Map to record kernel have been launchedl, for error reporting purposes. ProtectedObj KernelLaunchTraces; @@ -946,6 +982,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_TrackNumKernelLaunches = UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0); + /// Environment variable to determine if stack traces for allocations and + /// deallocations are tracked. + BoolEnvar OMPX_TrackAllocationTraces = + BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false); + private: /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output @@ -996,11 +1037,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_InitialNumStreams; UInt32Envar OMPX_InitialNumEvents; - /// Environment variable to determine if stack traces for allocations and - /// deallocations are tracked. - BoolEnvar OMPX_TrackAllocationTraces = - BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false); - /// Array of images loaded into the device. Images are automatically /// deallocated by the allocator. llvm::SmallVector LoadedImages; diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c index ca7310e34fc9d0..a3d8b06f1c7381 100644 --- a/offload/test/sanitizer/double_free.c +++ b/offload/test/sanitizer/double_free.c @@ -36,7 +36,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:24 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main @@ -49,7 +49,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:22 // -// CHECK: #0 Prior allocation: +// CHECK: #0 Prior allocation -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main @@ -61,7 +61,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}double_free.c:20 // -// CHECK: #1 Prior allocation: +// CHECK: #1 Prior allocation -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc // NDEBG: main diff --git a/offload/test/sanitizer/double_free_racy.c b/offload/test/sanitizer/double_free_racy.c index 3b4f2d5c51571c..4ebd8f36efa10c 100644 --- a/offload/test/sanitizer/double_free_racy.c +++ b/offload/test/sanitizer/double_free_racy.c @@ -28,6 +28,6 @@ int main(void) { // CHECK: dataDelete // CHECK: omp_target_free -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: omp_target_alloc diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.c b/offload/test/sanitizer/free_wrong_ptr_kind.c index 0c178541db1170..7c5a4ff7085024 100644 --- a/offload/test/sanitizer/free_wrong_ptr_kind.c +++ b/offload/test/sanitizer/free_wrong_ptr_kind.c @@ -28,7 +28,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}free_wrong_ptr_kind.c:22 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: llvm_omp_target_alloc_host // NDEBG: main diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.cpp b/offload/test/sanitizer/free_wrong_ptr_kind.cpp index 87a52c5d4baf23..7ebb8c438433a9 100644 --- a/offload/test/sanitizer/free_wrong_ptr_kind.cpp +++ b/offload/test/sanitizer/free_wrong_ptr_kind.cpp @@ -31,7 +31,7 @@ int main(void) { // NDEBG: main // DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:25 // -// CHECK: Last allocation of size 8: +// CHECK: Last allocation of size 8 -> device pointer // CHECK: dataAlloc // CHECK: llvm_omp_target_alloc_shared // NDEBG: main diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c new file mode 100644 index 00000000000000..38742b783e8e9b --- /dev/null +++ b/offload/test/sanitizer/ptr_outside_alloc_1.c @@ -0,0 +1,40 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); +void llvm_omp_target_free_host(void *Ptr, int DeviceNum); + +int main() { + int N = (1 << 30); + char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device()); + char *P; +#pragma omp target map(from : P) + { + P = &A[0]; + *P = 3; + } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations +// TRACE: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation. +// TRACE: Closest host-issued allocation (distance 4096 bytes; might be by page): +// TRACE: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { P[-4] = 5; } + + llvm_omp_target_free_host(A, omp_get_default_device()); +} diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c new file mode 100644 index 00000000000000..ac47c8922f09ef --- /dev/null +++ b/offload/test/sanitizer/ptr_outside_alloc_2.c @@ -0,0 +1,26 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +int main() { + int N = (1 << 30); + char *A = (char *)malloc(N); +#pragma omp target map(A[ : N]) + { A[N] = 3; } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// CHECK: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation. +// CHECK: Closest host-issued allocation (distance 1 byte; might be by page): +// CHECK: Last allocation of size 1073741824 +// clang-format on +} diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c new file mode 100644 index 00000000000000..cebcdee1803475 --- /dev/null +++ b/offload/test/sanitizer/use_after_free_1.c @@ -0,0 +1,39 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum); +void llvm_omp_target_free_host(void *Ptr, int DeviceNum); + +int main() { + int N = (1 << 30); + char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device()); + char *P; +#pragma omp target map(from : P) + { + P = &A[N / 2]; + *P = 3; + } + llvm_omp_target_free_host(A, omp_get_default_device()); + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations +// TRACE: Device pointer [[PTR]] points into prior host-issued allocation: +// TRACE: Last deallocation: +// TRACE: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { *P = 5; } +} diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c new file mode 100644 index 00000000000000..587d04a6ff3528 --- /dev/null +++ b/offload/test/sanitizer/use_after_free_2.c @@ -0,0 +1,32 @@ +// clang-format off +// RUN: %libomptarget-compileopt-generic +// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK +// clang-format on + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: s390x-ibm-linux-gnu +// UNSUPPORTED: s390x-ibm-linux-gnu-LTO + +#include + +int main() { + int N = (1 << 30); + char *A = (char *)malloc(N); + char *P; +#pragma omp target map(A[ : N]) map(from : P) + { + P = &A[N / 2]; + *P = 3; + } + // clang-format off +// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}} +// CHECK: Device pointer [[PTR]] points into prior host-issued allocation: +// CHECK: Last deallocation: +// CHECK: Last allocation of size 1073741824 +// clang-format on +#pragma omp target + { *P = 5; } +}