Skip to content

Commit

Permalink
[GPU] Add blocked layout support to ExperimentalDetectronTopKRois ope…
Browse files Browse the repository at this point in the history
…ration (#12456)

* Adapt ExperimentalDetectronTopKRois to support blocked layouts

* Reformat arg_max_gpu_test

* Add support and tests for the arg_max_min 2D case, required by the ExperimentalDetectronTopKROIs operation
  • Loading branch information
tgubanova-lohika authored Nov 3, 2022
1 parent fdee4ac commit a9a064b
Show file tree
Hide file tree
Showing 9 changed files with 552 additions and 1,454 deletions.
25 changes: 13 additions & 12 deletions src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,18 +112,19 @@ struct arg_max_min_impl : typed_primitive_impl_ocl<arg_max_min> {

namespace detail {
attach_arg_max_min_impl::attach_arg_max_min_impl() {
    // NOTE(review): the pasted span contained both the legacy per-tuple
    // registration list and the new types/formats registration (a flattened
    // diff); only the new cross-product registration is kept here.

    // Data types accepted by the OCL arg_max_min implementation.
    auto types = {data_types::f16, data_types::f32, data_types::i8, data_types::i32};

    // Plain 4D layouts, the blocked (b_fs / bs_fs) 4D layouts, and 5D bfzyx.
    auto formats = {format::bfyx,
                    format::yxfb,
                    format::b_fs_yx_fsv16,
                    format::b_fs_yx_fsv32,
                    format::bs_fs_yx_bsv16_fsv16,
                    format::bs_fs_yx_bsv32_fsv16,
                    format::bs_fs_yx_bsv32_fsv32,
                    format::bfzyx};

    // Register every (type, format) combination in a single call.
    implementation_map<arg_max_min>::add(impl_types::ocl, arg_max_min_impl::create, types, formats);
}
} // namespace detail
} // namespace ocl
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,17 @@ struct experimental_detectron_topk_rois_impl : typed_primitive_impl_ocl<experime
namespace detail {

attach_experimental_detectron_topk_rois_impl::attach_experimental_detectron_topk_rois_impl() {
    // NOTE(review): the pasted span fused the removed tuple-list call with the
    // added types/formats call (flattened diff) — syntactically invalid; only
    // the new registration form is kept.

    // Floating-point types supported by the reference OCL kernel.
    auto types = {data_types::f16, data_types::f32};

    // Plain bfyx plus the blocked 4D layouts this kernel can now address.
    auto formats = {format::bfyx,
                    format::b_fs_yx_fsv16,
                    format::b_fs_yx_fsv32,
                    format::bs_fs_yx_bsv16_fsv16,
                    format::bs_fs_yx_bsv32_fsv16,
                    format::bs_fs_yx_bsv32_fsv32};

    implementation_map<experimental_detectron_topk_rois>::add(impl_types::ocl,
                                                              experimental_detectron_topk_rois_impl::create,
                                                              types,
                                                              formats);
}

} // namespace detail
Expand Down
7 changes: 5 additions & 2 deletions src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1456,7 +1456,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::tile::type_id() &&
prim.type() != cldnn::scatter_elements_update::type_id() &&
prim.type() != cldnn::gather_tree::type_id() &&
prim.type() != cldnn::experimental_detectron_detection_output::type_id()) {
prim.type() != cldnn::experimental_detectron_detection_output::type_id() &&
prim.type() != cldnn::experimental_detectron_topk_rois::type_id()) {
can_use_fsv16 = false;
}

Expand Down Expand Up @@ -1503,7 +1504,9 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::scatter_elements_update::type_id() &&
prim.type() != cldnn::gather_tree::type_id() &&
prim.type() != cldnn::experimental_detectron_detection_output::type_id() &&
prim.type() != cldnn::deconvolution::type_id()) {
prim.type() != cldnn::deconvolution::type_id() &&
prim.type() != cldnn::arg_max_min::type_id() &&
prim.type() != cldnn::experimental_detectron_topk_rois::type_id()) {
can_use_bs_fs_yx_bsv16_fsv16 = false;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@ KERNEL(experimental_detectron_topk_rois_ref)(const __global INPUT0_TYPE* input_r
const __global INPUT1_TYPE* topk_indices, __global OUTPUT_TYPE* output_rois)
{
const uint b = get_global_id(0);
const uint output_idx = OUTPUT_GET_INDEX(b, 0, 0, 0);
const uint roi_idx = topk_indices[b];
const uint input_idx = INPUT0_GET_INDEX(roi_idx, 0, 0, 0);
output_rois[output_idx] = input_rois[input_idx];
output_rois[output_idx + 1] = input_rois[input_idx + 1];
output_rois[output_idx + 2] = input_rois[input_idx + 2];
output_rois[output_idx + 3] = input_rois[input_idx + 3];
const uint roi_idx = topk_indices[INPUT1_GET_INDEX(b, 0, 0, 0)];
output_rois[OUTPUT_GET_INDEX(b, 0, 0, 0)] = input_rois[INPUT0_GET_INDEX(roi_idx, 0, 0, 0)];
output_rois[OUTPUT_GET_INDEX(b, 1, 0, 0)] = input_rois[INPUT0_GET_INDEX(roi_idx, 1, 0, 0)];
output_rois[OUTPUT_GET_INDEX(b, 2, 0, 0)] = input_rois[INPUT0_GET_INDEX(roi_idx, 2, 0, 0)];
output_rois[OUTPUT_GET_INDEX(b, 3, 0, 0)] = input_rois[INPUT0_GET_INDEX(roi_idx, 3, 0, 0)];
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,21 @@ ParamsKey ArgMaxMinKernelAxis::GetSupportedKey() const {
k.EnableInputDataType(Datatype::INT32);
k.EnableAllOutputDataType();
k.EnableInputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableArgMaxMinAxis(ArgMaxMinAxis::BATCH);
k.EnableArgMaxMinAxis(ArgMaxMinAxis::X);
k.EnableArgMaxMinAxis(ArgMaxMinAxis::Y);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ ParamsKey ArgMaxMinKernelGPURef::GetSupportedKey() const {
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableDifferentTypes();
k.EnableBatching();
k.EnableTensorPitches();
return k;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,20 @@ ParamsKey ExperimentalDetectronTopKROIRef::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableDifferentTypes();
k.EnableOutputLayout(Tensor::bfyx);
k.EnableInputLayout(Tensor::bfyx);
k.EnableInputLayout(Tensor::b_fs_yx_fsv16);
k.EnableInputLayout(Tensor::b_fs_yx_fsv32);
k.EnableInputLayout(Tensor::bs_fs_yx_bsv16_fsv16);
k.EnableInputLayout(Tensor::bs_fs_yx_bsv32_fsv32);
k.EnableInputLayout(Tensor::bs_fs_yx_bsv32_fsv16);
k.EnableOutputLayout(Tensor::bfyx);
k.EnableOutputLayout(Tensor::b_fs_yx_fsv16);
k.EnableOutputLayout(Tensor::b_fs_yx_fsv32);
k.EnableOutputLayout(Tensor::bs_fs_yx_bsv16_fsv16);
k.EnableOutputLayout(Tensor::bs_fs_yx_bsv32_fsv32);
k.EnableOutputLayout(Tensor::bs_fs_yx_bsv32_fsv16);
k.EnableBatching();
k.EnableTensorPitches();
return k;
}

Expand Down
Loading

0 comments on commit a9a064b

Please sign in to comment.