[QNN EP] Apply workaround for Conv validation bug when bias input is implicit (#21764)

### Description
- Adds a dummy bias of all zeros when translating a Conv that has no explicit bias input. This works around a QNN validation issue in which validation fails when the optional bias input is not provided. (A short sketch of the dummy-bias quantization math follows this list.)
- Corrects the logic for unpacking **non-zero int4** zero-points. The bug does not affect existing models because we currently only support int4 zero-points equal to 0 (symmetric quantization), but it would become an issue if/when QNN supports non-zero int4 zero-points, so it is worth fixing now. (A standalone sketch of the sign-extension step follows the qnn_model_wrapper.cc diff below.)
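
The dummy bias follows the usual QDQ convention for bias inputs: int32 data, zero-point 0, and a per-channel scale equal to the product of the activation scale and the corresponding weight scale. The standalone sketch below illustrates only that math with made-up scale values; it does not use the actual QnnQuantParamsWrapper/QnnTensorWrapper helpers.

```cpp
// Minimal illustration of the dummy-bias quantization math (hypothetical values,
// not the real QNN EP helper code).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Assume input[0] is per-tensor quantized and the weight is per-channel quantized.
  const float input0_scale = 0.02f;
  const std::vector<float> weight_scales = {0.10f, 0.05f, 0.20f};

  // The dummy bias is all zeros stored as int32, with offset 0 and
  // bias_scale[i] = input0_scale * weight_scales[i] for each output channel.
  std::vector<float> bias_scales(weight_scales.size());
  std::vector<int32_t> bias_offsets(weight_scales.size(), 0);
  std::vector<int32_t> bias_data(weight_scales.size(), 0);

  for (size_t i = 0; i < weight_scales.size(); ++i) {
    bias_scales[i] = input0_scale * weight_scales[i];
    std::printf("channel %zu: scale=%f offset=%d value=%d\n",
                i, bias_scales[i], bias_offsets[i], bias_data[i]);
  }
  return 0;
}
```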



### Motivation and Context
Support Conv operators without a bias input on QNN EP with the latest
QNN SDK.
adrianlizarraga authored Aug 22, 2024
1 parent 6c1a3f8 commit 514b469
Showing 8 changed files with 207 additions and 59 deletions.
@@ -80,6 +80,64 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}

Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
std::vector<uint32_t>&& bias_shape,
const std::string& bias_name,
const logging::Logger& logger,
std::vector<std::string>& input_names) const {
ORT_UNUSED_PARAMETER(logger);
// For now, only handle case where input0 is per-tensor quantized and input1 is either per-tensor
// or per-channel quantized.
ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && input1_qparams.IsQuantized(),
"QNN EP currently only supports adding a dummy zero bias input for per-tensor ",
"input[0] and per-tensor/per-channel input[1]");

size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale(s) should be the product of the other inputs' quantization scales.
// Input[0] is expected to have one scale (per-tensor).
// If input[1] is per-channel (many scales), then the dummy bias also needs to be per-channel.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(input1_qparams.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (input1_qparams.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);

return Status::OK();
}

Status BaseOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@@ -95,6 +95,14 @@ class BaseOpBuilder : public IOpBuilder {
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;

Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
std::vector<uint32_t>&& bias_shape,
const std::string& bias_name,
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;

Status SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
@@ -289,10 +289,30 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
//
// Input 2: bias
//
if (num_inputs == 3) {
const bool has_bias_input = num_inputs == 3;
if (has_bias_input) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
}

#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
// Bias is implicit. QNN SDK 2.23/2.24/2.25 (QNN API version 2.16/2.17/2.18) has a validation bug for
// implicit bias inputs, so provide an explicit bias of all 0 (quantized int32).
TensorInfo input0_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));

TensorInfo input1_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));

if (input0_info.quant_param.IsPerTensor(/*include_bw*/ true) && input1_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
std::vector<uint32_t> bias_shape = {input1_info.shape[0]};
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, input0_info.quant_param, input1_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif

return Status::OK();
}

@@ -99,47 +99,9 @@ Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,

if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";

// Make dummy bias input have the same shape as the scale input.
std::vector<uint32_t> bias_shape = scale_input_info.shape;
size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale should be the product of the other inputs' quantization scales.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (scale_input_info.quant_param.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, x_input_info.quant_param, scale_input_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif
13 changes: 12 additions & 1 deletion onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -335,7 +335,18 @@ Status QnnModelWrapper::UnpackZeroPoints(const std::string& initializer_name,

switch (onnx_data_type) {
// QNN use -offset for some reason
case ONNX_NAMESPACE::TensorProto_DataType_INT4: // INT4 zero-points are unpacked as 8-bit values for QNN
case ONNX_NAMESPACE::TensorProto_DataType_INT4: { // INT4 zero-points are unpacked as 8-bit values for QNN
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
[](int8_t masked_zp) -> int32_t {
// We currently unpack int4 as int8 but with the top 4-bits masked off due to QNN bug.
// Need to undo the masking so that the zero-point value is correct.
// (Not really a problem yet because QNN only supports symmetric INT4 quantization with zp == 0).
int8_t zp = Int4x2::SignExtendLower4Bits(std::byte(masked_zp));
return -static_cast<int32_t>(zp);
});
break;
}
case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
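
For reference, here is a standalone sketch of the sign-extension step in the INT4 case above. The helper below mirrors the idea of `Int4x2::SignExtendLower4Bits` but is a local re-implementation for illustration only; QNN stores the negated zero-point, hence the final negation.

```cpp
// Standalone check of "undo the 4-bit mask, then negate for QNN" (illustrative only).
#include <cstdint>
#include <cstdio>

// Sign-extend the lower 4 bits of a byte into a full int8 value.
static int8_t SignExtendLower4Bits(uint8_t b) {
  const int8_t low = static_cast<int8_t>(b & 0x0F);
  return (low & 0x08) ? static_cast<int8_t>(low - 16) : low;
}

int main() {
  // 0x0F is what an int4 zero-point of -1 looks like after the upper 4 bits were masked off.
  const uint8_t masked_zp = 0x0F;
  const int8_t zp = SignExtendLower4Bits(masked_zp);      // -1
  const int32_t qnn_offset = -static_cast<int32_t>(zp);   // QNN uses -zero_point, so +1
  std::printf("masked=0x%02X zero_point=%d qnn_offset=%d\n",
              static_cast<unsigned>(masked_zp), zp, qnn_offset);
  return 0;
}
```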
8 changes: 6 additions & 2 deletions onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -231,6 +231,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.axisScaleOffsetEncoding.axis;
size_t num_elems = quantize_params.axisScaleOffsetEncoding.numScaleOffsets;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].scale << (i == num_elems - 1 ? "" : " ");
@@ -239,11 +241,13 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].offset << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.bwAxisScaleOffsetEncoding.axis;
out << " bw=" << quantize_params.bwAxisScaleOffsetEncoding.bitwidth;
size_t num_elems = quantize_params.bwAxisScaleOffsetEncoding.numElements;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.scales[i] << (i == num_elems - 1 ? "" : " ");
@@ -252,7 +256,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else {
out << " encoding not supported.";
}
7 changes: 5 additions & 2 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -423,14 +423,14 @@ static void LogNodeSupport(const logging::Logger& logger,
return;
}

size_t num_nodes = 0;
std::ostringstream oss;
oss << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for nodes ("
<< qnn_node_group.Type() << "):" << std::endl;
for (const NodeUnit* node_unit : qnn_node_group.GetNodeUnits()) {
for (const Node* node : node_unit->GetAllNodesInGroup()) {
oss << "\tOperator type: " << node->OpType()
<< " Node name: " << node->Name()
<< " Node index: " << node->Index() << std::endl;
num_nodes += 1;
}
}
if (!support_status.IsOK()) {
@@ -440,6 +440,9 @@ static void LogNodeSupport(const logging::Logger& logger,
logging::Capture(logger, log_severity, logging::Category::onnxruntime,
log_data_type, call_site)
.Stream()
<< (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
<< " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
<< std::endl
<< oss.str();
}

108 changes: 95 additions & 13 deletions onnxruntime/test/providers/qnn/conv_test.cc
@@ -793,19 +793,101 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
TestInputDef<float> bias_def(bias_shape, true,
GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));

RunHTPConvOpPerChannelTest<uint8_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test per-channel QDQ Conv with INT4 weights and no bias.
// in0: u16, in1 (weight): s4, out: u16
// Tests a bug in QNN SDK 2.25 when validating a Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};

TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));

RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test per-tensor QDQ Conv with uint16 input[0], uint8 weights, and no bias.
// in0: u16, in1 (weight): u8, out: u16
// Tests a bug in QNN SDK 2.25 when validating a Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16U8_PerTensor_NoBias) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};

TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));

RunHTPConvOpTest<uint16_t, uint8_t>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

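// Test per-channel QDQ Conv with a large INT4 weight and no bias (exercises the zero-bias workaround at scale).
// in0: u16, in1 (weight): s4, out: u16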
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias_LargeINT4Weight) {
std::vector<int64_t> input_shape = {1, 3072, 1, 512};
std::vector<int64_t> weight_shape = {9216, 3072, 1, 1};
std::vector<float> input_data(TensorShape(input_shape).Size(), 0.1f);
input_data[0] = 0.2f;
std::vector<float> weight_data(TensorShape(weight_shape).Size(), -0.1f);
for (size_t c = 0; c < static_cast<size_t>(weight_shape[0]); c++) {
size_t i = c * 3072;
weight_data[i] = 0.1f;
}

TestInputDef<float> input_def(input_shape, false, input_data);
TestInputDef<float> weight_def(weight_shape, true, weight_data);

RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test fusion of DQs -> Conv -> Relu/Clip -> Q.
