[CUDA/ROCm/Migraphx] consolidate gpu data transfer #22609

Open · wants to merge 3 commits into base: main
1 change: 0 additions & 1 deletion onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -50,7 +50,6 @@ class Memcpy final : public OpKernel {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The cudaMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
20 changes: 12 additions & 8 deletions onnxruntime/core/providers/cuda/gpu_data_transfer.cc
@@ -7,10 +7,6 @@
#include "cuda_common.h"

namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer() {}

GPUDataTransfer::~GPUDataTransfer() {}

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
@@ -30,19 +26,25 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
// Copy only if the two addresses are different.
if (dst_data != src_data) {
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice));
// For device memory to device memory copy, no host-side synchronization is performed by cudaMemcpy.
// see https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
}
} else {
// copy from other CPU memory to GPU, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
if (src_device.MemType() != OrtDevice::MemType::CUDA_PINNED) {
// For cudaMemcpy from pageable host memory to device memory, DMA to final destination may not have completed.
// see https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
}
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -59,7 +61,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU) {
// copy from pinned memory to GPU, this is non-blocking
// copy from pinned or non-pinned CPU memory to GPU
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, static_cast<cudaStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
@@ -69,14 +71,16 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU) {
// copying from GPU to pinned memory, this is non-blocking
// copy from GPU to pinned or non-pinned CPU memory.
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, static_cast<cudaStream_t>(stream.GetHandle())));
}
} else {
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(stream.GetHandle())));
}

ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

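Note: the synchronization comments added above follow the CUDA API sync-behavior rules linked in the diff. As a standalone illustration only (not part of this PR; the CUDA_CHECK macro, buffer names, and sizes are invented for the example), the sketch below shows why a device-to-device cudaMemcpy and a pageable host-to-device cudaMemcpy still need a default-stream synchronize, while a pinned host source does not:

// Standalone illustration of the cudaMemcpy synchronization rules; buffer contents are irrelevant.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(expr)                                                         \
  do {                                                                           \
    cudaError_t err = (expr);                                                    \
    if (err != cudaSuccess) {                                                    \
      std::fprintf(stderr, "%s failed: %s\n", #expr, cudaGetErrorString(err));   \
      std::exit(1);                                                              \
    }                                                                            \
  } while (0)

int main() {
  const size_t bytes = 1 << 20;
  float* d_src = nullptr;
  float* d_dst = nullptr;
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_src), bytes));
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_dst), bytes));

  // Device-to-device: cudaMemcpy performs no host-side synchronization and may return
  // before the copy has finished, so the default stream is synchronized explicitly.
  CUDA_CHECK(cudaMemcpy(d_dst, d_src, bytes, cudaMemcpyDeviceToDevice));
  CUDA_CHECK(cudaStreamSynchronize(nullptr));

  // Pageable host-to-device: cudaMemcpy returns once the data has been staged, but the
  // DMA into d_dst may still be in flight, hence the extra synchronize in CopyTensor.
  float* pageable = static_cast<float*>(std::malloc(bytes));
  CUDA_CHECK(cudaMemcpy(d_dst, pageable, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaStreamSynchronize(nullptr));

  // Pinned host-to-device: cudaMemcpy is synchronous with respect to the host, so no
  // extra synchronize is needed; this is the case CopyTensor now skips.
  float* pinned = nullptr;
  CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&pinned), bytes));
  CUDA_CHECK(cudaMemcpy(d_dst, pinned, bytes, cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaFreeHost(pinned));
  std::free(pageable);
  CUDA_CHECK(cudaFree(d_dst));
  CUDA_CHECK(cudaFree(d_src));
  return 0;
}

This mirrors the branches in CopyTensor above: the extra cudaStreamSynchronize(nullptr) is issued except when the host buffer is CUDA_PINNED.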
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/gpu_data_transfer.h
@@ -10,8 +10,8 @@ namespace onnxruntime

class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer();
~GPUDataTransfer();
GPUDataTransfer() = default;
~GPUDataTransfer() = default;

bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

44 changes: 30 additions & 14 deletions onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
@@ -2,12 +2,16 @@
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "gpu_data_transfer.h"
#include "migraphx_call.h"
#include "core/providers/migraphx/gpu_data_transfer.h"
#include "core/providers/migraphx/migraphx_call.h"

// If you make changes below, please also update onnxruntime/core/providers/rocm/gpu_data_transfer.cc

namespace onnxruntime {

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
}

common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
@@ -23,17 +27,24 @@
if (src_device.Type() == OrtDevice::GPU) {
// Copy only if the two addresses are different.
if (dst_data != src_data) {
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
// Follow core/providers/cuda/gpu_data_transfer.cc to synchronize the default stream here.
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
if (src_device.MemType() != OrtDevice::MemType::HIP_PINNED) {
// Follow core/providers/cuda/gpu_data_transfer.cc to synchronize the default stream here.
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -49,23 +60,28 @@
auto& dst_device = dst.Location().device;

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
// copy from pinned memory to GPU, this is non-blocking
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
if (src_device.Type() == OrtDevice::CPU) {
// If the source is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));

[cpplint warning at line 66 of onnxruntime/core/providers/migraphx/gpu_data_transfer.cc] Lines should be <= 120 characters long [whitespace/line_length] [2]
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_CALL_THROW(hipMemcpyWithStream(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
}
} else if (src_device.Type() == OrtDevice::GPU) {
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
// If the destination is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));

[cpplint warning at line 74 of onnxruntime/core/providers/migraphx/gpu_data_transfer.cc] Lines should be <= 120 characters long [whitespace/line_length] [2]
} else {
// copying between cpu memory
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
}
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

return Status::OK();
}

} // namespace onnxruntime
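
Note: the comments added in CopyTensorAsync recommend hipHostMalloc so host buffers can be transferred truly asynchronously. A minimal standalone sketch of that pattern (illustrative only, not part of this PR; HIP_CHECK and the buffer names are invented for the example):

// Pinned host staging buffer allocated with hipHostMalloc so hipMemcpyAsync can overlap with other work.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

#define HIP_CHECK(expr)                                                        \
  do {                                                                         \
    hipError_t err = (expr);                                                   \
    if (err != hipSuccess) {                                                   \
      std::fprintf(stderr, "%s failed: %s\n", #expr, hipGetErrorString(err));  \
      std::exit(1);                                                            \
    }                                                                          \
  } while (0)

int main() {
  const size_t bytes = 1 << 20;

  float* pinned_host = nullptr;
  HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&pinned_host), bytes, hipHostMallocDefault));  // page-locked host memory

  float* device_buf = nullptr;
  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&device_buf), bytes));

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  // With a pinned source the copy is queued on the stream and returns immediately.
  HIP_CHECK(hipMemcpyAsync(device_buf, pinned_host, bytes, hipMemcpyHostToDevice, stream));

  // ... enqueue kernels on `stream` here ...

  HIP_CHECK(hipStreamSynchronize(stream));  // wait only when the data is actually needed

  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipFree(device_buf));
  HIP_CHECK(hipHostFree(pinned_host));
  return 0;
}

With a pageable (malloc'ed) source the same hipMemcpyAsync call is still legal, but the runtime may fall back to a largely synchronous staged copy, which is the case the new comments warn about.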
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/migraphx/migraphx_call.h
@@ -3,6 +3,7 @@

#pragma once
#include "migraphx_inc.h"
#include "core/common/common.h"

namespace onnxruntime {

@@ -16,5 +17,6 @@ std::conditional_t<THRW, void, Status> RocmCall(

#define HIP_CALL(expr) (RocmCall<hipError_t, false>((expr), #expr, "HIP", hipSuccess, "", __FILE__, __LINE__))
#define HIP_CALL_THROW(expr) (RocmCall<hipError_t, true>((expr), #expr, "HIP", hipSuccess, "", __FILE__, __LINE__))
#define HIP_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(HIP_CALL(expr))

} // namespace onnxruntime
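
Note: with HIP_RETURN_IF_ERROR now defined next to HIP_CALL and HIP_CALL_THROW, Status-returning MIGraphX code can propagate HIP failures instead of throwing. A hypothetical sketch of the two styles (the functions below are not part of this PR):

// Hypothetical usage sketch contrasting the two macros defined above.
#include "core/common/common.h"
#include "core/providers/migraphx/migraphx_call.h"

namespace onnxruntime {

// Status-returning style: a failed HIP call becomes a returned Status the caller can handle.
common::Status CopyDeviceBuffer(void* dst, const void* src, size_t bytes, hipStream_t stream) {
  HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToDevice, stream));
  HIP_RETURN_IF_ERROR(hipStreamSynchronize(stream));
  return Status::OK();
}

// Throwing style: the same calls, but any hipError_t other than hipSuccess throws.
void CopyDeviceBufferOrThrow(void* dst, const void* src, size_t bytes, hipStream_t stream) {
  HIP_CALL_THROW(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToDevice, stream));
  HIP_CALL_THROW(hipStreamSynchronize(stream));
}

}  // namespace onnxruntime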
onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -49,6 +49,8 @@ class Memcpy final : public OpKernel {
const IDataTransfer* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
if (!gpu_data_transfer)
return Status(common::ONNXRUNTIME, common::EP_FAIL, "gpu data transfer is missing in Migraphx EP.");
// CopyTensorAsync could handle both pinned memory and non-pinned CPU memory.
// For non-pinned CPU memory, the copy is synchronous.
return gpu_data_transfer->CopyTensorAsync(*X, *Y, *(ctx->GetComputeStream()));
}
};
2 changes: 0 additions & 2 deletions onnxruntime/core/providers/migraphx/migraphx_stream_handle.h
@@ -6,8 +6,6 @@
#include "migraphx_inc.h"
#include "migraphx_call.h"

#define HIP_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(HIP_CALL(expr))

namespace onnxruntime {
void WaitMIGraphXNotificationOnDevice(Stream& stream, synchronize::Notification& notification);

23 changes: 13 additions & 10 deletions onnxruntime/core/providers/rocm/gpu_data_transfer.cc
@@ -6,10 +6,8 @@
#include "core/providers/rocm/gpu_data_transfer.h"
#include "core/providers/rocm/rocm_common.h"

// If you make changes below, please also update onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer() {}

GPUDataTransfer::~GPUDataTransfer() {}

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
@@ -30,19 +28,23 @@
// Copy only if the two addresses are different.
if (dst_data != src_data) {
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
// Follow core/providers/cuda/gpu_data_transfer.cc to synchronize the default stream here.
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
if (src_device.MemType() != OrtDevice::MemType::HIP_PINNED) {
// Follow core/providers/cuda/gpu_data_transfer.cc to synchronize the default stream here.
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -59,7 +61,8 @@

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU) {
// copy from pinned memory to GPU, this is non-blocking
// If the source is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
@@ -68,15 +71,15 @@
}
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU) {
// copying from GPU to pinned memory, this is non-blocking
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
}
// If the destination is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));

[cpplint warning at line 76 of onnxruntime/core/providers/rocm/gpu_data_transfer.cc] Lines should be <= 120 characters long [whitespace/line_length] [2]
} else {
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
}
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

4 changes: 2 additions & 2 deletions onnxruntime/core/providers/rocm/gpu_data_transfer.h
@@ -10,8 +10,8 @@ namespace onnxruntime

class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer();
~GPUDataTransfer();
GPUDataTransfer() = default;
~GPUDataTransfer() = default;

bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

5 changes: 2 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -41,10 +41,9 @@ class Memcpy final : public OpKernel {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The rocmMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
// CopyTensorAsync could handle both pinned memory and non-pinned CPU memory.
// For non-pinned CPU memory, the copy is synchronous.
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()));
return Status::OK();
} else {