[GPU] Add NV12 -> Grayscale mode support (#12988)
* [GPU] Add NV12 -> Grayscale mode support

* Fix uv plane shape
sshlyapn committed Sep 9, 2022
1 parent 0f5a45c commit af29d22
Showing 10 changed files with 131 additions and 7 deletions.
10 changes: 8 additions & 2 deletions src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
@@ -115,12 +115,14 @@ struct reorder : public primitive_base<reorder> {
             const layout& output_layout,
             const std::vector<float>& values_to_subtract = {},
             const reorder_mean_mode mode = reorder_mean_mode::subtract,
+            const bool nv12_to_grayscale = false,
             const primitive_id& ext_prim_id = "")
         : primitive_base(id, { input, input2 }, ext_prim_id, output_layout.data_padding, optional_data_type { output_layout.data_type }),
           output_format(output_layout.format),
           mean(""),
           subtract_per_feature(values_to_subtract),
-          mean_mode(mode) {}
+          mean_mode(mode),
+          nv12_to_grayscale(nv12_to_grayscale) {}
 
     /// @brief Constructs reorder primitive with two inputs, which takes mean subtract values from another primitive.
     /// @param id This primitive id.
@@ -134,11 +136,13 @@ struct reorder : public primitive_base<reorder> {
             const layout& output_layout,
             primitive_id const& mean,
             const reorder_mean_mode mode = reorder_mean_mode::subtract,
+            const bool nv12_to_grayscale = false,
             const primitive_id& ext_prim_id = "")
         : primitive_base(id, { input, input2 }, ext_prim_id, output_layout.data_padding, optional_data_type{ output_layout.data_type }),
           output_format(output_layout.format),
           mean(mean),
-          mean_mode(mode) {}
+          mean_mode(mode),
+          nv12_to_grayscale(nv12_to_grayscale) {}
 
     /// @brief Requested memory format.
     format output_format;
@@ -148,6 +152,8 @@ struct reorder : public primitive_base<reorder> {
     std::vector<float> subtract_per_feature;
     /// @brief Mode of mean execution
     reorder_mean_mode mean_mode;
+    /// @brief Mode of nv12 format reordering
+    bool nv12_to_grayscale = false;
 
 protected:
     std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
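For reference, the new flag slots in immediately before ext_prim_id in both constructors above. A minimal usage sketch (the primitive ids "input_y"/"input_uv" and out_layout are hypothetical, not part of this change):

    // Request NV12 -> grayscale conversion from the two-plane reorder.
    // out_layout would be a single-channel layout, e.g. {1, 1, H, W}.
    cldnn::reorder gray_reorder("nv12_reorder",
                                "input_y",             // Y plane input
                                "input_uv",            // UV plane input
                                out_layout,
                                std::vector<float>{},  // no per-feature mean subtract
                                cldnn::reorder_mean_mode::subtract,
                                true);                 // nv12_to_grayscale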
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp
@@ -100,6 +100,8 @@ struct reorder_impl : typed_primitive_impl_ocl<reorder> {
 
         reorder_params.winograd = input_layout.format.is_winograd() || output_layout.format.is_winograd();
 
+        reorder_params.nv12_to_grayscale = arg.get_primitive()->nv12_to_grayscale;
+
         auto& kernel_selector = kernel_selector::reorder_kernel_selector::Instance();
         auto best_kernels = kernel_selector.GetBestKernels(reorder_params, reorder_optional_params);
 
6 changes: 5 additions & 1 deletion src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1798,7 +1798,11 @@ format layout_optimizer::get_preferred_format(program_node& node) {
             expected = format::get_default_format(layout.get_rank(), false, false);
         }
     } else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
-        expected = node.get_output_layout().format;
+        // Mark as nv12 reorder as this is the only case where two inputs exist
+        if (node.get_primitive()->input.size() == 2)
+            expected = format::nv12;
+        else
+            expected = node.get_output_layout().format;
     } else if (node.is_type<reshape>()) {
         if (node.get_output_layout().format.dimension() == 6) {
             expected = format::bfwzyx;
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/src/graph/reorder.cpp
@@ -33,7 +33,8 @@ layout reorder_inst::calc_output_layout(reorder_node const& node) {
     }
 
     if (ifmt.is_nv12()) {
-        auto data_size = tensor{ input_layout.batch(), input_layout.feature() * 3,
+        auto output_feature = node.get_primitive()->nv12_to_grayscale ? 1 : input_layout.feature() * 3;
+        auto data_size = tensor{ input_layout.batch(), output_feature,
                                  input_layout.spatial(0), input_layout.spatial(1) };
         if (ofmt != ifmt)
             return layout(odt, ofmt, data_size, op);
@@ -177,6 +178,7 @@ std::string reorder_inst::to_string(reorder_node const& node) {
     json_composite reorder_info;
     reorder_info.add("input id", input.id());
     reorder_info.add("mean", mean);
+    reorder_info.add("nv12_to_grayscale", desc->nv12_to_grayscale);
     if (desc->subtract_per_feature.size() > 0) {
         reorder_info.add("subtract per feature", desc->subtract_per_feature);
     }
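The shape rule this hunk encodes is worth spelling out; a sketch (example shapes assumed for illustration):

    // Output feature count for an NV12 reorder, as computed in calc_output_layout:
    // grayscale mode keeps the single Y channel, the default mode expands to RGB.
    int nv12_output_features(bool nv12_to_grayscale, int input_features) {
        return nv12_to_grayscale ? 1 : input_features * 3;
    }
    // e.g. a {1, 1, 8, 8} Y-plane input yields a {1, 1, 8, 8} output in
    // grayscale mode and {1, 3, 8, 8} in the default RGB mode.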
@@ -24,6 +24,10 @@ ParamsKey reorder_biplanar_nv12::GetSupportedKey() const {
 JitConstants reorder_biplanar_nv12::GetJitConstants(const reorder_params& params) const {
     auto jit = ReorderKernelBase::GetJitConstants(params);
     jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0]));
+
+    if (params.nv12_to_grayscale)
+        jit.AddConstant(MakeJitConstant("GRAYSCALE_OUTPUT", 1));
+
     return jit;
 }

@@ -25,6 +25,7 @@ struct reorder_params : public base_params {
     uint32_t winograd_nr_tiles_x;
     bool winograd = false;
     bool has_padded_output = false;
+    bool nv12_to_grayscale = false;
 
     ParamsKey GetParamsKey() const override {
         auto k = base_params::GetParamsKey();
@@ -69,6 +69,13 @@ KERNEL(reorder_biplanar_nv12)(
 #endif
 
     float4 Y = read_imagef(input, (int2)(x, y));
+#if defined GRAYSCALE_OUTPUT
+    float gray = Y.x;
+
+    uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
+    uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(gray), NL_M, NL_N);
+#else
     float4 UV = read_imagef(input_uv, (int2)(x / 2, y / 2));
 
     float Ycomponent = mad(Y.x, 296.82f, -18.624f);
@@ -105,6 +112,6 @@ KERNEL(reorder_biplanar_nv12)(
     ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
     output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
     output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(B), NL_M, NL_N);
-
+#endif
 
 }
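In the grayscale branch the kernel forwards the normalized Y sample and never reads the UV plane, skipping the YUV-to-RGB arithmetic of the #else path. A host-side reference of that behavior for validation could look like this (a sketch, not plugin code; the division by 255 mirrors what read_imagef returns for a CL_UNORM_INT8 image):

    #include <cstdint>
    #include <vector>

    // Reference NV12 -> grayscale: each output value is the Y sample
    // normalized to [0, 1]; the UV plane is ignored in this mode.
    std::vector<float> nv12_to_grayscale_ref(const uint8_t* y_plane, int width, int height) {
        std::vector<float> out(static_cast<size_t>(width) * height);
        for (int y = 0; y < height; ++y)
            for (int x = 0; x < width; ++x)
                out[static_cast<size_t>(y) * width + x] = y_plane[y * width + x] / 255.0f;
        return out;
    }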
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
@@ -240,6 +240,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
     }
     int height = inputDims[2];
     int width = inputDims[3];
+    bool grayscale_mode = inputDims[1] == 1;
     std::vector<cldnn::primitive_id> reorders;
 
     for (auto i = 0; i < inputDims[0]; i++) {
@@ -267,6 +268,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
                     networkInputLayout,
                     meanValues,
                     cldnn::reorder_mean_mode::subtract,
+                    grayscale_mode,
                     inputInfo->name()));
                 break;
             }
@@ -277,6 +279,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
                     networkInputLayout,
                     meanBlobID,
                     cldnn::reorder_mean_mode::subtract,
+                    grayscale_mode,
                     inputInfo->name()));
                 break;
             }
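Note that grayscale mode is inferred from the model rather than from a new public option: a single-channel input (inputDims[1] == 1) combined with NV12 preprocessing is what makes CreateParameterOp pass grayscale_mode = true to the reorder. A sketch of the triggering configuration, mirroring the new functional test below (net is assumed to have a {1, 1, H, W} input):

    // Single-channel U8 input + NV12 color format + two-plane NV12 mode
    // selects the grayscale reorder path on GPU.
    net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
    net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
    net.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);
    auto exec_net = ie.LoadNetwork(net, "GPU",
        { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES } });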
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/tests/test_cases/cl_mem_input_test.cpp
@@ -132,7 +132,7 @@ TEST(cl_mem_check, check_2_inputs) {
     cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
     int size = width * height * 3;
     for (auto i = 0; i < size; i++) {
-        EXPECT_NEAR(reference_results[i], output_ptr[i], 1.001f);
+        ASSERT_NEAR(reference_results[i], output_ptr[i], 1.001f);
     }
     checkStatus(clReleaseMemObject(nv12_image_plane_uv), "clReleaseMemObject");
     checkStatus(clReleaseMemObject(nv12_image_plane_y), "clReleaseMemObject");
@@ -240,7 +240,7 @@ TEST(cl_mem_check, check_input) {
     cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
     int size = width * height * 3;
     for (auto i = 0; i < size; i++) {
-        EXPECT_NEAR(reference_results[i], output_ptr[i], 1.001f);
+        ASSERT_NEAR(reference_results[i], output_ptr[i], 1.001f);
     }
     checkStatus(clReleaseMemObject(img), "clReleaseMemObject");
 }
@@ -219,6 +219,101 @@ TEST_P(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
     }
 }
 
+TEST_P(RemoteBlob_Test, NV12toGrayscale) {
+#if defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    const int num_batch = 1;
+    const int num_channels = 1;
+    const int height = 8;
+    const int width = 8;
+
+    const InferenceEngine::TensorDesc y_plane_desc(InferenceEngine::Precision::U8, {1, 1, height, width},
+                                                   InferenceEngine::Layout::NCHW);
+    const InferenceEngine::TensorDesc uv_plane_desc(InferenceEngine::Precision::U8, {1, 2, height / 2, width / 2},
+                                                    InferenceEngine::Layout::NCHW);
+
+    auto fn_ptr_remote = ngraph::builder::subgraph::makeConvertTranspose({num_batch, num_channels, height, width});
+
+    CNNNetwork net_remote(fn_ptr_remote);
+
+    net_remote.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+    net_remote.getInputsInfo().begin()->second->setPrecision(Precision::U8);
+    net_remote.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);
+    auto fake_image_data_y = FuncTestUtils::createAndFillBlob(y_plane_desc, 50, 0, 1);
+    auto fake_image_data_uv = FuncTestUtils::createAndFillBlob(uv_plane_desc, 256, 0, 1);
+
+    auto ie = InferenceEngine::Core();
+    auto exec_net = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU,
+                                   { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES} });
+
+    // inference using remote blob
+    auto inf_req = exec_net.CreateInferRequest();
+    auto cldnn_context = exec_net.GetContext();
+    cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
+    auto ocl_instance = std::make_shared<OpenCL>(ctx);
+    cl_int err;
+
+    cl_image_format image_format;
+    cl_image_desc image_desc = { 0 };
+    image_format.image_channel_order = CL_R;
+    image_format.image_channel_data_type = CL_UNORM_INT8;
+    image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+    image_desc.image_width = width;
+    image_desc.image_height = height;
+    cl_mem nv12_image_plane_y = clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err);
+    ASSERT_EQ(err, 0);
+
+    image_format.image_channel_order = CL_RG;
+    image_desc.image_width = width / 2;
+    image_desc.image_height = height / 2;
+    cl_mem nv12_image_plane_uv = clCreateImage(ocl_instance->_context.get(), CL_MEM_READ_WRITE, &image_format, &image_desc, NULL, &err);
+    ASSERT_EQ(err, 0);
+
+    size_t origin[3] = { 0, 0, 0 };
+    size_t y_region[3] = { (size_t)width, (size_t)height, 1 };
+    size_t uv_region[3] = { (size_t)width / 2, (size_t)height / 2, 1 };
+
+    err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_y,
+                              true, origin, y_region, 0, 0, fake_image_data_y->buffer(), 0, NULL, NULL);
+    ASSERT_EQ(err, 0);
+
+    err = clEnqueueWriteImage(ocl_instance->_queue.get(), nv12_image_plane_uv,
+                              true, origin, uv_region, 0, 0, fake_image_data_uv->buffer(), 0, NULL, NULL);
+    ASSERT_EQ(err, 0);
+
+    cl::Image2D img_y = cl::Image2D(nv12_image_plane_y);
+    cl::Image2D img_uv = cl::Image2D(nv12_image_plane_uv);
+
+    auto nv12_blob = make_shared_blob_nv12(cldnn_context, img_y, img_uv);
+
+    inf_req.SetBlob(net_remote.getInputsInfo().begin()->first, nv12_blob);
+    inf_req.Infer();
+    auto outputBlob_shared = inf_req.GetBlob(net_remote.getOutputsInfo().begin()->first);
+
+    // regular inference
+    CNNNetwork net_regular(fn_ptr_remote);
+    net_regular.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+    net_regular.getInputsInfo().begin()->second->setPrecision(Precision::FP32);
+    auto exec_net_regular = ie.LoadNetwork(net_regular, deviceName);
+    auto inf_req_regular = exec_net_regular.CreateInferRequest();
+
+    auto fake_image_data = FuncTestUtils::createAndFillBlob(net_regular.getInputsInfo().begin()->second->getTensorDesc());
+    for (size_t i = 0; i < fake_image_data_y->size(); i++) {
+        uint8_t data = fake_image_data_y->buffer().as<uint8_t*>()[i];
+        fake_image_data->buffer().as<float*>()[i] = static_cast<float>(data) / 255;
+    }
+    inf_req_regular.SetBlob(net_regular.getInputsInfo().begin()->first, fake_image_data);
+
+    inf_req_regular.Infer();
+    auto outputBlob_regular = inf_req_regular.GetBlob(net_regular.getOutputsInfo().begin()->first);
+    {
+        ASSERT_EQ(net_regular.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+        ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
+        auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+        FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
+    }
+}
+
 TEST_P(RemoteBlob_Test, smoke_canInferOnUserContext) {
     CNNNetwork net(fn_ptr);
