[QNN EP] Apply workaround for Conv validation bug when bias input is implicit (#21764)

### Description
- Adds a dummy bias of all zeros when translating a Conv that has no explicit bias input. This works around a QNN validation issue in which validation fails when the optional bias input is not provided. (A short sketch of the dummy-bias quantization math follows this list.)
- Corrects the logic for unpacking **non-zero int4** zero-points. The bug does not affect existing models because we currently only support int4 zero-points equal to 0 (symmetric quantization), but it would become an issue if/when QNN supports non-zero int4 zero-points, so it is worth fixing now. (A standalone sketch of the sign-extension step follows the qnn_model_wrapper.cc diff below.)
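
The dummy bias follows the usual QDQ convention for bias inputs: int32 data, zero-point 0, and a per-channel scale equal to the product of the activation scale and the corresponding weight scale. The standalone sketch below illustrates only that math with made-up scale values; it does not use the actual QnnQuantParamsWrapper/QnnTensorWrapper helpers.

```cpp
// Minimal illustration of the dummy-bias quantization math (hypothetical values,
// not the real QNN EP helper code).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Assume input[0] is per-tensor quantized and the weight is per-channel quantized.
  const float input0_scale = 0.02f;
  const std::vector<float> weight_scales = {0.10f, 0.05f, 0.20f};

  // The dummy bias is all zeros stored as int32, with offset 0 and
  // bias_scale[i] = input0_scale * weight_scales[i] for each output channel.
  std::vector<float> bias_scales(weight_scales.size());
  std::vector<int32_t> bias_offsets(weight_scales.size(), 0);
  std::vector<int32_t> bias_data(weight_scales.size(), 0);

  for (size_t i = 0; i < weight_scales.size(); ++i) {
    bias_scales[i] = input0_scale * weight_scales[i];
    std::printf("channel %zu: scale=%f offset=%d value=%d\n",
                i, bias_scales[i], bias_offsets[i], bias_data[i]);
  }
  return 0;
}
```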



### Motivation and Context
Support Conv operators without a bias input on QNN EP with the latest
QNN SDK.
adrianlizarraga authored Aug 22, 2024
1 parent 6c1a3f8 commit 514b469
Showing 8 changed files with 207 additions and 59 deletions.
@@ -80,6 +80,64 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}

Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
std::vector<uint32_t>&& bias_shape,
const std::string& bias_name,
const logging::Logger& logger,
std::vector<std::string>& input_names) const {
ORT_UNUSED_PARAMETER(logger);
// For now, only handle case where input0 is per-tensor quantized and input1 is either per-tensor
// or per-channel quantized.
ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && input1_qparams.IsQuantized(),
"QNN EP currently only supports adding a dummy zero bias input for per-tensor ",
"input[0] and per-tensor/per-channel input[1]");

size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale(s) should be the product of the other inputs' quantization scales.
// Input[0] is expected to have one scale (per-tensor).
// If input[1] is per-channel (many scales), then the dummy bias also needs to be per-channel.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(input1_qparams.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (input1_qparams.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);

return Status::OK();
}

Status BaseOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@@ -95,6 +95,14 @@ class BaseOpBuilder : public IOpBuilder {
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;

Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper,
const QnnQuantParamsWrapper& input0_qparams,
const QnnQuantParamsWrapper& input1_qparams,
std::vector<uint32_t>&& bias_shape,
const std::string& bias_name,
const logging::Logger& logger,
std::vector<std::string>& input_names) const ORT_MUST_USE_RESULT;

Status SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
@@ -289,10 +289,30 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
//
// Input 2: bias
//
if (num_inputs == 3) {
const bool has_bias_input = num_inputs == 3;
if (has_bias_input) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names));
}

#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18)
if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
// Bias is implicit. QNN SDK 2.23/2.24/2.25 (QNN API version 2.16/2.17/2.18) has a validation bug for
// implicit bias inputs, so provide an explicit bias of all 0 (quantized int32).
TensorInfo input0_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));

TensorInfo input1_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));

if (input0_info.quant_param.IsPerTensor(/*include_bw*/ true) && input1_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";
std::vector<uint32_t> bias_shape = {input1_info.shape[0]};
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, input0_info.quant_param, input1_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif

return Status::OK();
}

@@ -99,47 +99,9 @@ Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,

if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";

// Make dummy bias input have the same shape as the scale input.
std::vector<uint32_t> bias_shape = scale_input_info.shape;
size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale should be the product of the other inputs' quantization scales.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (scale_input_info.quant_param.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);
ORT_RETURN_IF_ERROR(AddZeroBiasInput(qnn_model_wrapper, x_input_info.quant_param, scale_input_info.quant_param,
std::move(bias_shape), bias_name, logger, input_names));
}
}
#endif
13 changes: 12 additions & 1 deletion onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -335,7 +335,18 @@ Status QnnModelWrapper::UnpackZeroPoints(const std::string& initializer_name,

switch (onnx_data_type) {
// QNN use -offset for some reason
case ONNX_NAMESPACE::TensorProto_DataType_INT4: // INT4 zero-points are unpacked as 8-bit values for QNN
case ONNX_NAMESPACE::TensorProto_DataType_INT4: { // INT4 zero-points are unpacked as 8-bit values for QNN
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
[](int8_t masked_zp) -> int32_t {
// We currently unpack int4 as int8 but with the top 4-bits masked off due to QNN bug.
// Need to undo the masking so that the zero-point value is correct.
// (Not really a problem yet because QNN only supports symmetric INT4 quantization with zp == 0).
int8_t zp = Int4x2::SignExtendLower4Bits(std::byte(masked_zp));
return -static_cast<int32_t>(zp);
});
break;
}
case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
auto int8_span = ReinterpretAsSpan<const int8_t>(gsl::make_span(initializer_bytes));
std::transform(int8_span.begin(), int8_span.end(), std::back_inserter(zero_points),
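
For reference, here is a standalone sketch of the sign-extension step in the INT4 case above. The helper below mirrors the idea of `Int4x2::SignExtendLower4Bits` but is a local re-implementation for illustration only; QNN stores the negated zero-point, hence the final negation.

```cpp
// Standalone check of "undo the 4-bit mask, then negate for QNN" (illustrative only).
#include <cstdint>
#include <cstdio>

// Sign-extend the lower 4 bits of a byte into a full int8 value.
static int8_t SignExtendLower4Bits(uint8_t b) {
  const int8_t low = static_cast<int8_t>(b & 0x0F);
  return (low & 0x08) ? static_cast<int8_t>(low - 16) : low;
}

int main() {
  // 0x0F is what an int4 zero-point of -1 looks like after the upper 4 bits were masked off.
  const uint8_t masked_zp = 0x0F;
  const int8_t zp = SignExtendLower4Bits(masked_zp);      // -1
  const int32_t qnn_offset = -static_cast<int32_t>(zp);   // QNN uses -zero_point, so +1
  std::printf("masked=0x%02X zero_point=%d qnn_offset=%d\n",
              static_cast<unsigned>(masked_zp), zp, qnn_offset);
  return 0;
}
```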
8 changes: 6 additions & 2 deletions onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -231,6 +231,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.axisScaleOffsetEncoding.axis;
size_t num_elems = quantize_params.axisScaleOffsetEncoding.numScaleOffsets;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].scale << (i == num_elems - 1 ? "" : " ");
@@ -239,11 +241,13 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.axisScaleOffsetEncoding.scaleOffset[i].offset << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else if (quantize_params.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) {
out << " axis=" << quantize_params.bwAxisScaleOffsetEncoding.axis;
out << " bw=" << quantize_params.bwAxisScaleOffsetEncoding.bitwidth;
size_t num_elems = quantize_params.bwAxisScaleOffsetEncoding.numElements;
bool truncate = num_elems > 20;
num_elems = truncate ? 20 : num_elems;
out << " scales=(";
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.scales[i] << (i == num_elems - 1 ? "" : " ");
@@ -252,7 +256,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_QuantizeParams_t& quantize
for (size_t i = 0; i < num_elems; i++) {
out << quantize_params.bwAxisScaleOffsetEncoding.offsets[i] << (i == num_elems - 1 ? "" : " ");
}
out << ")";
out << (truncate ? "...)" : ")");
} else {
out << " encoding not supported.";
}
7 changes: 5 additions & 2 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -423,14 +423,14 @@ static void LogNodeSupport(const logging::Logger& logger,
return;
}

size_t num_nodes = 0;
std::ostringstream oss;
oss << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for nodes ("
<< qnn_node_group.Type() << "):" << std::endl;
for (const NodeUnit* node_unit : qnn_node_group.GetNodeUnits()) {
for (const Node* node : node_unit->GetAllNodesInGroup()) {
oss << "\tOperator type: " << node->OpType()
<< " Node name: " << node->Name()
<< " Node index: " << node->Index() << std::endl;
num_nodes += 1;
}
}
if (!support_status.IsOK()) {
@@ -440,6 +440,9 @@ static void LogNodeSupport(const logging::Logger& logger,
logging::Capture(logger, log_severity, logging::Category::onnxruntime,
log_data_type, call_site)
.Stream()
<< (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
<< " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
<< std::endl
<< oss.str();
}

108 changes: 95 additions & 13 deletions onnxruntime/test/providers/qnn/conv_test.cc
@@ -793,19 +793,101 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
TestInputDef<float> bias_def(bias_shape, true,
GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));

RunHTPConvOpPerChannelTest<uint8_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test per-channel QDQ Conv with INT4 weights and no bias.
// in0: u16, in1 (weight): s4, out: u16
// Tests a bug in QNN SDK 2.25 when validating a Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};

TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));

RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test per-tensor QDQ Conv with uint16 input[0], uint8 weights, and no bias.
// in0: u16, in1 (weight): u8, out: u16
// Tests a bug in QNN SDK 2.25 when validating a Conv without a bias (QNN EP adds a dummy bias).
TEST_F(QnnHTPBackendTests, ConvU16U8_PerTensor_NoBias) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};

TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));

RunHTPConvOpTest<uint16_t, uint8_t>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

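// Test per-channel QDQ Conv with a large INT4 weight and no bias (exercises the zero-bias workaround at scale).
// in0: u16, in1 (weight): s4, out: u16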
TEST_F(QnnHTPBackendTests, ConvU16S4_PerChannel_NoBias_LargeINT4Weight) {
std::vector<int64_t> input_shape = {1, 3072, 1, 512};
std::vector<int64_t> weight_shape = {9216, 3072, 1, 1};
std::vector<float> input_data(TensorShape(input_shape).Size(), 0.1f);
input_data[0] = 0.2f;
std::vector<float> weight_data(TensorShape(weight_shape).Size(), -0.1f);
for (size_t c = 0; c < static_cast<size_t>(weight_shape[0]); c++) {
size_t i = c * 3072;
weight_data[i] = 0.1f;
}

TestInputDef<float> input_def(input_shape, false, input_data);
TestInputDef<float> weight_def(weight_shape, true, weight_data);

RunHTPConvOpPerChannelTest<uint16_t, Int4x2>("Conv",
input_def,
weight_def,
TestInputDef<float>(),
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test fusion of DQs -> Conv -> Relu/Clip -> Q.
