Skip to content

Commit

Permalink
[Offload] Improve error reporting on memory faults (llvm#104254)
Browse files Browse the repository at this point in the history
Since we can already track allocations, we can diagnose memory faults to
some degree. If the fault happens in a prior allocation (use after free)
or "close but outside" one, we can provide that information to the user.
Note that the fault address might be page aligned, and not all accesses
trigger a fault, especially for allocations that are backed by a
MemoryManager. Still, if people disable the MemoryManager or the
allocation is big enough, we can sometimes provide valueable feedback.
  • Loading branch information
jdoerfert authored and cjdb committed Aug 23, 2024
1 parent c921444 commit 445e9db
Show file tree
Hide file tree
Showing 11 changed files with 256 additions and 18 deletions.
12 changes: 11 additions & 1 deletion offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3264,8 +3264,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
}
if (DeviceNode != Node)
continue;

void *DevicePtr = (void *)Event->memory_fault.virtual_address;
std::string S;
llvm::raw_string_ostream OS(S);
OS << llvm::format("Memory access fault by GPU %" PRIu32
" (agent 0x%" PRIx64
") at virtual address %p. Reasons: %s",
Node, Event->memory_fault.agent.handle,
(void *)Event->memory_fault.virtual_address,
llvm::join(Reasons, ", ").c_str());
ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
ErrorReporter::reportMemoryAccessError(AMDGPUDevice, DevicePtr, S,
/*Abort*/ true);
}

// Abort the execution since we do not recover from this error.
Expand Down
67 changes: 61 additions & 6 deletions offload/plugins-nextgen/common/include/ErrorReporting.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,13 @@ class ErrorReporter {

if (ATI->HostPtr)
print(BoldLightPurple,
"Last allocation of size %lu for host pointer %p:\n", ATI->Size,
ATI->HostPtr);
"Last allocation of size %lu for host pointer %p -> device pointer "
"%p:\n",
ATI->Size, ATI->HostPtr, ATI->DevicePtr);
else
print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
print(BoldLightPurple,
"Last allocation of size %lu -> device pointer %p:\n", ATI->Size,
ATI->DevicePtr);
reportStackTrace(ATI->AllocationTrace);
if (!ATI->LastAllocationInfo)
return;
Expand All @@ -174,10 +177,13 @@ class ErrorReporter {
ATI->Size);
reportStackTrace(ATI->DeallocationTrace);
if (ATI->HostPtr)
print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
I, ATI->HostPtr);
print(
BoldLightPurple,
" #%u Prior allocation for host pointer %p -> device pointer %p:\n",
I, ATI->HostPtr, ATI->DevicePtr);
else
print(BoldLightPurple, " #%u Prior allocation:\n", I);
print(BoldLightPurple, " #%u Prior allocation -> device pointer %p:\n",
I, ATI->DevicePtr);
reportStackTrace(ATI->AllocationTrace);
++I;
}
Expand Down Expand Up @@ -219,6 +225,55 @@ class ErrorReporter {
#undef DEALLOCATION_ERROR
}

static void reportMemoryAccessError(GenericDeviceTy &Device, void *DevicePtr,
std::string &ErrorStr, bool Abort) {
reportError(ErrorStr.c_str());

if (!Device.OMPX_TrackAllocationTraces) {
print(Yellow, "Use '%s=true' to track device allocations\n",
Device.OMPX_TrackAllocationTraces.getName().data());
if (Abort)
abortExecution();
return;
}
uintptr_t Distance = false;
auto *ATI =
Device.getClosestAllocationTraceInfoForAddr(DevicePtr, Distance);
if (!ATI) {
print(Cyan,
"No host-issued allocations; device pointer %p might be "
"a global, stack, or shared location\n",
DevicePtr);
if (Abort)
abortExecution();
return;
}
if (!Distance) {
print(Cyan, "Device pointer %p points into%s host-issued allocation:\n",
DevicePtr, ATI->DeallocationTrace.empty() ? "" : " prior");
reportAllocationInfo(ATI);
if (Abort)
abortExecution();
return;
}

bool IsClose = Distance < (1L << 29L /*512MB=*/);
print(Cyan,
"Device pointer %p does not point into any (current or prior) "
"host-issued allocation%s.\n",
DevicePtr,
IsClose ? "" : " (might be a global, stack, or shared location)");
if (IsClose) {
print(Cyan,
"Closest host-issued allocation (distance %" PRIuPTR
" byte%s; might be by page):\n",
Distance, Distance > 1 ? "s" : "");
reportAllocationInfo(ATI);
}
if (Abort)
abortExecution();
}

/// Report that a kernel encountered a trap instruction.
static void reportTrapInKernel(
GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
Expand Down
46 changes: 41 additions & 5 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,42 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// been deallocated, both for error reporting purposes.
ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;

/// Return the allocation trace info for a device pointer, that is the
/// allocation into which this device pointer points to (or pointed into).
AllocationTraceInfoTy *getAllocationTraceInfoForAddr(void *DevicePtr) {
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
for (auto &It : *AllocationTraceMap) {
if (It.first <= DevicePtr &&
advanceVoidPtr(It.first, It.second->Size) > DevicePtr)
return It.second;
}
return nullptr;
}

/// Return the allocation trace info for a device pointer, that is the
/// allocation into which this device pointer points to (or pointed into).
AllocationTraceInfoTy *
getClosestAllocationTraceInfoForAddr(void *DevicePtr, uintptr_t &Distance) {
Distance = 0;
if (auto *ATI = getAllocationTraceInfoForAddr(DevicePtr)) {
return ATI;
}

AllocationTraceInfoTy *ATI = nullptr;
uintptr_t DevicePtrI = uintptr_t(DevicePtr);
auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
for (auto &It : *AllocationTraceMap) {
uintptr_t Begin = uintptr_t(It.second->DevicePtr);
uintptr_t End = Begin + It.second->Size - 1;
uintptr_t ItDistance = std::min(Begin - DevicePtrI, DevicePtrI - End);
if (ATI && ItDistance > Distance)
continue;
ATI = It.second;
Distance = ItDistance;
}
return ATI;
}

/// Map to record kernel have been launchedl, for error reporting purposes.
ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;

Expand All @@ -946,6 +982,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar OMPX_TrackNumKernelLaunches =
UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);

/// Environment variable to determine if stack traces for allocations and
/// deallocations are tracked.
BoolEnvar OMPX_TrackAllocationTraces =
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);

private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
Expand Down Expand Up @@ -996,11 +1037,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar OMPX_InitialNumStreams;
UInt32Envar OMPX_InitialNumEvents;

/// Environment variable to determine if stack traces for allocations and
/// deallocations are tracked.
BoolEnvar OMPX_TrackAllocationTraces =
BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);

/// Array of images loaded into the device. Images are automatically
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;
Expand Down
6 changes: 3 additions & 3 deletions offload/test/sanitizer/double_free.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}double_free.c:24
//
// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
// NDEBG: main
Expand All @@ -49,7 +49,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}double_free.c:22
//
// CHECK: #0 Prior allocation:
// CHECK: #0 Prior allocation -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
// NDEBG: main
Expand All @@ -61,7 +61,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}double_free.c:20
//
// CHECK: #1 Prior allocation:
// CHECK: #1 Prior allocation -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
// NDEBG: main
Expand Down
2 changes: 1 addition & 1 deletion offload/test/sanitizer/double_free_racy.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ int main(void) {
// CHECK: dataDelete
// CHECK: omp_target_free

// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: omp_target_alloc
2 changes: 1 addition & 1 deletion offload/test/sanitizer/free_wrong_ptr_kind.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}free_wrong_ptr_kind.c:22
//
// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: llvm_omp_target_alloc_host
// NDEBG: main
Expand Down
2 changes: 1 addition & 1 deletion offload/test/sanitizer/free_wrong_ptr_kind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ int main(void) {
// NDEBG: main
// DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:25
//
// CHECK: Last allocation of size 8:
// CHECK: Last allocation of size 8 -> device pointer
// CHECK: dataAlloc
// CHECK: llvm_omp_target_alloc_shared
// NDEBG: main
Expand Down
40 changes: 40 additions & 0 deletions offload/test/sanitizer/ptr_outside_alloc_1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void llvm_omp_target_free_host(void *Ptr, int DeviceNum);

int main() {
int N = (1 << 30);
char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
char *P;
#pragma omp target map(from : P)
{
P = &A[0];
*P = 3;
}
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
// TRACE: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
// TRACE: Closest host-issued allocation (distance 4096 bytes; might be by page):
// TRACE: Last allocation of size 1073741824
// clang-format on
#pragma omp target
{ P[-4] = 5; }

llvm_omp_target_free_host(A, omp_get_default_device());
}
26 changes: 26 additions & 0 deletions offload/test/sanitizer/ptr_outside_alloc_2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

int main() {
int N = (1 << 30);
char *A = (char *)malloc(N);
#pragma omp target map(A[ : N])
{ A[N] = 3; }
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// CHECK: Device pointer [[PTR]] does not point into any (current or prior) host-issued allocation.
// CHECK: Closest host-issued allocation (distance 1 byte; might be by page):
// CHECK: Last allocation of size 1073741824
// clang-format on
}
39 changes: 39 additions & 0 deletions offload/test/sanitizer/use_after_free_1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NTRCE
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void llvm_omp_target_free_host(void *Ptr, int DeviceNum);

int main() {
int N = (1 << 30);
char *A = (char *)llvm_omp_target_alloc_host(N, omp_get_default_device());
char *P;
#pragma omp target map(from : P)
{
P = &A[N / 2];
*P = 3;
}
llvm_omp_target_free_host(A, omp_get_default_device());
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// NTRCE: Use 'OFFLOAD_TRACK_ALLOCATION_TRACES=true' to track device allocations
// TRACE: Device pointer [[PTR]] points into prior host-issued allocation:
// TRACE: Last deallocation:
// TRACE: Last allocation of size 1073741824
// clang-format on
#pragma omp target
{ *P = 5; }
}
32 changes: 32 additions & 0 deletions offload/test/sanitizer/use_after_free_2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// clang-format off
// RUN: %libomptarget-compileopt-generic
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
// clang-format on

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
// UNSUPPORTED: s390x-ibm-linux-gnu
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO

#include <omp.h>

int main() {
int N = (1 << 30);
char *A = (char *)malloc(N);
char *P;
#pragma omp target map(A[ : N]) map(from : P)
{
P = &A[N / 2];
*P = 3;
}
// clang-format off
// CHECK: OFFLOAD ERROR: Memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
// CHECK: Last deallocation:
// CHECK: Last allocation of size 1073741824
// clang-format on
#pragma omp target
{ *P = 5; }
}

0 comments on commit 445e9db

Please sign in to comment.