From d2a5ee2e5e5dce69cdfef2ee1a9e78bd83744f71 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Fri, 18 Oct 2024 11:16:20 -0700 Subject: [PATCH] Update the python wrapper script to support weight sharing case (#22341) Update the python wrapper script to support weight sharing case ### Description update the script to support json file that from QNN converter or the one extracted from QNN context binary file for the weight sharing scenario --- .../tools/qnn/gen_qnn_ctx_onnx_model.py | 383 ++++++++++++------ 1 file changed, 264 insertions(+), 119 deletions(-) diff --git a/onnxruntime/python/tools/qnn/gen_qnn_ctx_onnx_model.py b/onnxruntime/python/tools/qnn/gen_qnn_ctx_onnx_model.py index 1bc22eb0e5713..b7d32fd6b2353 100644 --- a/onnxruntime/python/tools/qnn/gen_qnn_ctx_onnx_model.py +++ b/onnxruntime/python/tools/qnn/gen_qnn_ctx_onnx_model.py @@ -20,135 +20,158 @@ def __init__(self): self.dim = [] -def is_quantized_data_type(qnn_data_type): - # QNN_DATATYPE_UFIXED_POINT_8 QNN_DATATYPE_UFIXED_POINT_16 QNN_DATATYPE_FIXED_POINT_8 QNN_DATATYPE_FIXED_POINT_16 - return qnn_data_type == 0x0408 or qnn_data_type == 0x0416 or qnn_data_type == 0x0308 or qnn_data_type == 0x0316 - - -def qnn_data_type_to_onnx_data_type(qnn_data_type): - # QNN_DATATYPE_UFIXED_POINT_8 QNN_DATATYPE_UINT_8 - if qnn_data_type == 0x0408 or qnn_data_type == 0x0108: - return TensorProto.UINT8 - # QNN_DATATYPE_UFIXED_POINT_16 QNN_DATATYPE_UINT_16 - elif qnn_data_type == 0x0416 or qnn_data_type == 0x0116: - return TensorProto.UINT16 - # QNN_DATATYPE_UFIXED_POINT_32 QNN_DATATYPE_UINT_32 - elif qnn_data_type == 0x0432 or qnn_data_type == 0x0132: - return TensorProto.UINT32 - # QNN_DATATYPE_UINT_64 - elif qnn_data_type == 0x0164: - return TensorProto.UINT64 - # QNN_DATATYPE_FIXED_POINT_8 QNN_DATATYPE_INT_8 - elif qnn_data_type == 0x0308 or qnn_data_type == 0x0008: - return TensorProto.INT8 - # QNN_DATATYPE_FIXED_POINT_16 QNN_DATATYPE_INT_16 - elif qnn_data_type == 0x0316 or qnn_data_type == 0x0016: - return TensorProto.INT16 - # QNN_DATATYPE_FIXED_POINT_32 QNN_DATATYPE_INT_32 - elif qnn_data_type == 0x0332 or qnn_data_type == 0x0032: - return TensorProto.INT32 - # QNN_DATATYPE_INT_64 - elif qnn_data_type == 0x0064: - return TensorProto.INT64 - # QNN_DATATYPE_FLOAT_16 - elif qnn_data_type == 0x0216: - return TensorProto.FLOAT16 - # QNN_DATATYPE_FLOAT_32 - elif qnn_data_type == 0x0232: - return TensorProto.FLOAT - # QNN_DATATYPE_BOOL_8 - elif qnn_data_type == 0x0508: - return TensorProto.BOOL +def is_quantized_data_type(qnn_data_type, is_converter_json): + if is_converter_json: + # QNN_DATATYPE_UFIXED_POINT_8 QNN_DATATYPE_UFIXED_POINT_16 QNN_DATATYPE_FIXED_POINT_8 QNN_DATATYPE_FIXED_POINT_16 + return qnn_data_type == 0x0408 or qnn_data_type == 0x0416 or qnn_data_type == 0x0308 or qnn_data_type == 0x0316 else: - return TensorProto.UNDEFINED - - -def parse_qnn_json_file(qnn_json_file_path, qnn_input_tensor_dic, qnn_output_tensor_dic): - with open(qnn_json_file_path) as qnn_json_file: - qnn_json = json.load(qnn_json_file) - assert "graph" in qnn_json, "QNN converted json file not valid. Can't find graph." - assert "tensors" in qnn_json["graph"], "QNN converted json file not valid. Can't find tensors." - for qnn_tensor_name, qnn_tensor_attribute in qnn_json["graph"]["tensors"].items(): - # type:0 - QNN input tensor, type:1 - QNN output tensor - assert ( - "type" in qnn_tensor_attribute - and "data_type" in qnn_tensor_attribute - and "dims" in qnn_tensor_attribute - ), "QNN converted json file not valid. 
Can't find some keys from tensors" - - # Get all graph inputs - if qnn_tensor_attribute["type"] == 0: - qnn_tensor = QnnTensorStruct() - qnn_tensor.name = qnn_tensor_name - qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(qnn_tensor_attribute["data_type"]) - qnn_tensor.is_quantized = is_quantized_data_type(qnn_tensor_attribute["data_type"]) - qnn_tensor.dim = qnn_tensor_attribute["dims"] - if ( - qnn_tensor_attribute["quant_params"]["definition"] == 1 - and qnn_tensor_attribute["quant_params"]["encoding"] == 0 - ): - qnn_tensor.scale = qnn_tensor_attribute["quant_params"]["scale_offset"]["scale"] - qnn_tensor.offset = 0 - qnn_tensor_attribute["quant_params"]["scale_offset"]["offset"] - qnn_input_tensor_dic[qnn_tensor_name] = qnn_tensor - - # Get all graph outputs - if qnn_tensor_attribute["type"] == 1: - qnn_tensor = QnnTensorStruct() - qnn_tensor.name = qnn_tensor_name - qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(qnn_tensor_attribute["data_type"]) - qnn_tensor.is_quantized = is_quantized_data_type(qnn_tensor_attribute["data_type"]) - qnn_tensor.dim = qnn_tensor_attribute["dims"] - if ( - qnn_tensor_attribute["quant_params"]["definition"] == 1 - and qnn_tensor_attribute["quant_params"]["encoding"] == 0 - ): - qnn_tensor.scale = qnn_tensor_attribute["quant_params"]["scale_offset"]["scale"] - qnn_tensor.offset = 0 - qnn_tensor_attribute["quant_params"]["scale_offset"]["offset"] - qnn_output_tensor_dic[qnn_tensor_name] = qnn_tensor + return ( + qnn_data_type == "QNN_DATATYPE_UFIXED_POINT_8" + or qnn_data_type == "QNN_DATATYPE_UFIXED_POINT_16" + or qnn_data_type == "QNN_DATATYPE_FIXED_POINT_8" + or qnn_data_type == "QNN_DATATYPE_FIXED_POINT_16" + ) - assert ( - len(qnn_input_tensor_dic) >= 1 and len(qnn_output_tensor_dic) >= 1 - ), "Converted QNN model not valid. It should have at least 1 input & 1 output." 
+def qnn_data_type_to_onnx_data_type(qnn_data_type, is_converter_json): + if is_converter_json: + # QNN_DATATYPE_UFIXED_POINT_8 QNN_DATATYPE_UINT_8 + if qnn_data_type == 0x0408 or qnn_data_type == 0x0108: + return TensorProto.UINT8 + # QNN_DATATYPE_UFIXED_POINT_16 QNN_DATATYPE_UINT_16 + elif qnn_data_type == 0x0416 or qnn_data_type == 0x0116: + return TensorProto.UINT16 + # QNN_DATATYPE_UFIXED_POINT_32 QNN_DATATYPE_UINT_32 + elif qnn_data_type == 0x0432 or qnn_data_type == 0x0132: + return TensorProto.UINT32 + # QNN_DATATYPE_UINT_64 + elif qnn_data_type == 0x0164: + return TensorProto.UINT64 + # QNN_DATATYPE_FIXED_POINT_8 QNN_DATATYPE_INT_8 + elif qnn_data_type == 0x0308 or qnn_data_type == 0x0008: + return TensorProto.INT8 + # QNN_DATATYPE_FIXED_POINT_16 QNN_DATATYPE_INT_16 + elif qnn_data_type == 0x0316 or qnn_data_type == 0x0016: + return TensorProto.INT16 + # QNN_DATATYPE_FIXED_POINT_32 QNN_DATATYPE_INT_32 + elif qnn_data_type == 0x0332 or qnn_data_type == 0x0032: + return TensorProto.INT32 + # QNN_DATATYPE_INT_64 + elif qnn_data_type == 0x0064: + return TensorProto.INT64 + # QNN_DATATYPE_FLOAT_16 + elif qnn_data_type == 0x0216: + return TensorProto.FLOAT16 + # QNN_DATATYPE_FLOAT_32 + elif qnn_data_type == 0x0232: + return TensorProto.FLOAT + # QNN_DATATYPE_BOOL_8 + elif qnn_data_type == 0x0508: + return TensorProto.BOOL + else: + return TensorProto.UNDEFINED + else: + # QNN_DATATYPE_UFIXED_POINT_8 QNN_DATATYPE_UINT_8 + if qnn_data_type == "QNN_DATATYPE_UFIXED_POINT_8" or qnn_data_type == "QNN_DATATYPE_UINT_8": + return TensorProto.UINT8 + # QNN_DATATYPE_UFIXED_POINT_16 QNN_DATATYPE_UINT_16 + elif qnn_data_type == "QNN_DATATYPE_UFIXED_POINT_16" or qnn_data_type == "QNN_DATATYPE_UINT_16": + return TensorProto.UINT16 + # QNN_DATATYPE_UFIXED_POINT_32 QNN_DATATYPE_UINT_32 + elif qnn_data_type == "QNN_DATATYPE_UFIXED_POINT_32" or qnn_data_type == "QNN_DATATYPE_UINT_32": + return TensorProto.UINT32 + # QNN_DATATYPE_UINT_64 + elif qnn_data_type == "QNN_DATATYPE_UINT_64": + return TensorProto.UINT64 + # QNN_DATATYPE_FIXED_POINT_8 QNN_DATATYPE_INT_8 + elif qnn_data_type == "QNN_DATATYPE_FIXED_POINT_8" or qnn_data_type == "QNN_DATATYPE_INT_8": + return TensorProto.INT8 + # QNN_DATATYPE_FIXED_POINT_16 QNN_DATATYPE_INT_16 + elif qnn_data_type == "QNN_DATATYPE_FIXED_POINT_16" or qnn_data_type == "QNN_DATATYPE_INT_16": + return TensorProto.INT16 + # QNN_DATATYPE_FIXED_POINT_32 QNN_DATATYPE_INT_32 + elif qnn_data_type == "QNN_DATATYPE_FIXED_POINT_32" or qnn_data_type == "QNN_DATATYPE_INT_32": + return TensorProto.INT32 + # QNN_DATATYPE_INT_64 + elif qnn_data_type == "QNN_DATATYPE_INT_64": + return TensorProto.INT64 + # QNN_DATATYPE_FLOAT_16 + elif qnn_data_type == "QNN_DATATYPE_FLOAT_16": + return TensorProto.FLOAT16 + # QNN_DATATYPE_FLOAT_32 + elif qnn_data_type == "QNN_DATATYPE_FLOAT_32": + return TensorProto.FLOAT + # QNN_DATATYPE_BOOL_8 + elif qnn_data_type == "QNN_DATATYPE_BOOL_8": + return TensorProto.BOOL + else: + return TensorProto.UNDEFINED -# Onnxruntime QNN EP can support context binary file generated by QNN tool chain. However QNN generated context binary file -# uses channel last data layout and 8 bits or 16 bits for input and output. -# This script gets the QNN model input & output information from QNN converted model_net.json file, compare them with Onnx model -# and inserts Cast, Transpose nodes to Onnx model if required -def main(): - parser = ArgumentParser("Generate Onnx model which includes the QNN context binary.") - parser.add_argument("-b", "--qnn_bin", help="Required. 
Path to Qnn context binary file.", required=True, type=str) - parser.add_argument( - "-q", "--qnn_json", help="Required. Path to Qnn converted model_net.json file.", required=True, type=str - ) - parser.add_argument( - "--disable_embed_mode", - action="store_true", - default=False, - help="Set embed_mode=1 which mean embed Qnn context binary into the onnx model. Otherwise, set context binary file path in the onnx model", - ) - args = parser.parse_args() - # Parse Qnn model_net.json file to get the graph input output information - qnn_input_tensor_dic = {} - qnn_output_tensor_dic = {} - parse_qnn_json_file(args.qnn_json, qnn_input_tensor_dic, qnn_output_tensor_dic) +def parse_qnn_converter_json_file(qnn_convert_json, qnn_input_tensor_dic, qnn_output_tensor_dic): + is_qnn_converter_json = True + for qnn_tensor_name, qnn_tensor_attribute in qnn_convert_json["graph"]["tensors"].items(): + # type:0 - QNN input tensor, type:1 - QNN output tensor + assert ( + "type" in qnn_tensor_attribute and "data_type" in qnn_tensor_attribute and "dims" in qnn_tensor_attribute + ), "QNN converted json file not valid. Can't find some keys from tensors" - if args.disable_embed_mode: - ep_cache_context_content = args.qnn_bin - ctx_embed_mode = 0 - else: - with open(args.qnn_bin, "rb") as file: - ep_cache_context_content = file.read() - ctx_embed_mode = 1 + # Get all graph inputs + if qnn_tensor_attribute["type"] == 0: + qnn_tensor = QnnTensorStruct() + qnn_tensor.name = qnn_tensor_name + qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type( + qnn_tensor_attribute["data_type"], is_qnn_converter_json + ) + qnn_tensor.is_quantized = is_quantized_data_type(qnn_tensor_attribute["data_type"], is_qnn_converter_json) + qnn_tensor.dim = qnn_tensor_attribute["dims"] + if ( + qnn_tensor_attribute["quant_params"]["definition"] == 1 + and qnn_tensor_attribute["quant_params"]["encoding"] == 0 + ): + qnn_tensor.scale = qnn_tensor_attribute["quant_params"]["scale_offset"]["scale"] + qnn_tensor.offset = 0 - qnn_tensor_attribute["quant_params"]["scale_offset"]["offset"] + qnn_input_tensor_dic[qnn_tensor_name] = qnn_tensor + + # Get all graph outputs + if qnn_tensor_attribute["type"] == 1: + qnn_tensor = QnnTensorStruct() + qnn_tensor.name = qnn_tensor_name + qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type( + qnn_tensor_attribute["data_type"], is_qnn_converter_json + ) + qnn_tensor.is_quantized = is_quantized_data_type(qnn_tensor_attribute["data_type"], is_qnn_converter_json) + qnn_tensor.dim = qnn_tensor_attribute["dims"] + if ( + qnn_tensor_attribute["quant_params"]["definition"] == 1 + and qnn_tensor_attribute["quant_params"]["encoding"] == 0 + ): + qnn_tensor.scale = qnn_tensor_attribute["quant_params"]["scale_offset"]["scale"] + qnn_tensor.offset = 0 - qnn_tensor_attribute["quant_params"]["scale_offset"]["offset"] + qnn_output_tensor_dic[qnn_tensor_name] = qnn_tensor + assert ( + len(qnn_input_tensor_dic) >= 1 and len(qnn_output_tensor_dic) >= 1 + ), "Converted QNN model not valid. It should have at least 1 input & 1 output." 
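For each quantized graph input or output, `parse_qnn_converter_json_file` keeps the scale and negates the QNN offset to obtain the zero point used for the Q/DQ nodes the wrapper inserts. A small sketch with a hypothetical converter-JSON tensor entry (values invented; the keys and the `definition`/`encoding` check follow the code above):

```python
# Hypothetical tensor entry from the converter-generated model_net.json.
sample_tensor_attribute = {
    "type": 0,                      # 0: graph input, 1: graph output
    "data_type": 0x0408,            # QNN_DATATYPE_UFIXED_POINT_8 -> TensorProto.UINT8
    "dims": [1, 224, 224, 3],
    "quant_params": {
        "definition": 1,            # quantization parameters are defined
        "encoding": 0,              # per-tensor scale/offset encoding
        "scale_offset": {"scale": 0.0078125, "offset": -128},
    },
}

quant = sample_tensor_attribute["quant_params"]
if quant["definition"] == 1 and quant["encoding"] == 0:
    scale = quant["scale_offset"]["scale"]             # 0.0078125
    zero_point = 0 - quant["scale_offset"]["offset"]   # ONNX zero point is -QNN offset -> 128
```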
+ + +def generate_wrapper_onnx_file( + grap_name, + model_file_name, + qnn_input_tensor_dic, + qnn_output_tensor_dic, + disable_embed_mode, + qnn_ctx_file, + quantized_IO, + qnn_sdk_version="unknown", +): graph_nodes = [] ini_list = [] value_infos = [] model_inputs = [] for qnn_input in qnn_input_tensor_dic.values(): - if qnn_input.is_quantized: + if qnn_input.is_quantized and not quantized_IO: q_scale_input_name = qnn_input.name + "_scale" q_offset_input_name = qnn_input.name + "_zp" q_scale = helper.make_tensor(q_scale_input_name, TensorProto.FLOAT, [], [qnn_input.scale]) @@ -170,13 +193,22 @@ def main(): else: model_inputs.append(helper.make_tensor_value_info(qnn_input.name, qnn_input.onnx_data_type, qnn_input.dim)) + if disable_embed_mode: + ep_cache_context_content = qnn_ctx_file + ctx_embed_mode = 0 + else: + with open(qnn_ctx_file, "rb") as file: + ep_cache_context_content = file.read() + ctx_embed_mode = 1 + qnn_ep_context_node = helper.make_node( "EPContext", - name="QnnContext", + name=grap_name, inputs=qnn_input_tensor_dic.keys(), outputs=qnn_output_tensor_dic.keys(), ep_cache_context=ep_cache_context_content, embed_mode=ctx_embed_mode, + ep_sdk_version=qnn_sdk_version, source="Qnn", domain="com.microsoft", ) @@ -184,7 +216,7 @@ def main(): model_outputs = [] for qnn_output in qnn_output_tensor_dic.values(): - if qnn_output.is_quantized: + if qnn_output.is_quantized and not quantized_IO: dq_scale_input_name = qnn_output.name + "_scale" dq_offset_input_name = qnn_output.name + "_zp" dq_scale = helper.make_tensor(dq_scale_input_name, TensorProto.FLOAT, [], [qnn_output.scale]) @@ -214,7 +246,120 @@ def main(): model_def = helper.make_model(graph_def, producer_name="MS") - onnx.save(model_def, args.qnn_json.replace(".json", "_qnn_ctx.onnx")) + onnx.save(model_def, model_file_name) + + +# parse Qnn graph from the json file that extracted from context binary file +def parse_qnn_graph(qnn_graph, qnn_input_tensor_dic, qnn_output_tensor_dic): + is_qnn_converter_json = False + graph_name = qnn_graph["info"]["graphName"] + raw_inputs = qnn_graph["info"]["graphInputs"] + raw_outputs = qnn_graph["info"]["graphOutputs"] + + for raw_input in raw_inputs: + tensor_info = raw_input["info"] + qnn_tensor = QnnTensorStruct() + qnn_tensor.name = tensor_info["name"] + qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(tensor_info["dataType"], is_qnn_converter_json) + qnn_tensor.is_quantized = is_quantized_data_type(tensor_info["dataType"], is_qnn_converter_json) + qnn_tensor.dim = tensor_info["dimensions"] + if ( + tensor_info["quantizeParams"]["definition"] == "QNN_DEFINITION_DEFINED" + and tensor_info["quantizeParams"]["quantizationEncoding"] == "QNN_QUANTIZATION_ENCODING_SCALE_OFFSET" + ): + qnn_tensor.scale = tensor_info["quantizeParams"]["scaleOffset"]["scale"] + qnn_tensor.offset = 0 - tensor_info["quantizeParams"]["scaleOffset"]["offset"] + qnn_input_tensor_dic[qnn_tensor.name] = qnn_tensor + + for raw_output in raw_outputs: + tensor_info = raw_output["info"] + qnn_tensor = QnnTensorStruct() + qnn_tensor.name = tensor_info["name"] + qnn_tensor.onnx_data_type = qnn_data_type_to_onnx_data_type(tensor_info["dataType"], is_qnn_converter_json) + qnn_tensor.is_quantized = is_quantized_data_type(tensor_info["dataType"], is_qnn_converter_json) + qnn_tensor.dim = tensor_info["dimensions"] + if ( + tensor_info["quantizeParams"]["definition"] == "QNN_DEFINITION_DEFINED" + and tensor_info["quantizeParams"]["quantizationEncoding"] == "QNN_QUANTIZATION_ENCODING_SCALE_OFFSET" + ): + qnn_tensor.scale 
= tensor_info["quantizeParams"]["scaleOffset"]["scale"] + qnn_tensor.offset = 0 - tensor_info["quantizeParams"]["scaleOffset"]["offset"] + qnn_output_tensor_dic[qnn_tensor.name] = qnn_tensor + + assert ( + len(qnn_input_tensor_dic) >= 1 and len(qnn_output_tensor_dic) >= 1 + ), "Converted QNN model not valid. It should have at least 1 input & 1 output." + + return graph_name + + +# Onnxruntime QNN EP can support context binary file generated by QNN tool chain. However QNN generated context binary file +# uses channel last data layout and 8 bits or 16 bits for input and output. +# This script gets the QNN model input & output information from QNN converted model_net.json file, compare them with Onnx model +# and inserts Cast, Transpose nodes to Onnx model if required +def main(): + parser = ArgumentParser("Generate Onnx model which includes the QNN context binary.") + parser.add_argument("-b", "--qnn_bin", help="Required. Path to Qnn context binary file.", required=True, type=str) + parser.add_argument( + "-q", "--qnn_json", help="Required. Path to Qnn converted model_net.json file.", required=True, type=str + ) + parser.add_argument( + "--disable_embed_mode", + action="store_true", + default=False, + help="Set embed_mode=1 which mean embed Qnn context binary into the onnx model. Otherwise, set context binary file path in the onnx model", + ) + parser.add_argument( + "--quantized_IO", + action="store_true", + default=False, + help="QNN converted context binary use quantized data as graph inputs and outputs. Will keep it if quantized_IO=True, otherwise, will insert Q and DQ nodes accordingly to make the graph inputs & outputs as float32 data type.", + ) + args = parser.parse_args() + + # Parse Qnn model_net.json file to get the graph input output information + + with open(args.qnn_json) as qnn_json_file: + qnn_json_obj = json.load(qnn_json_file) + if "graph" in qnn_json_obj and "tensors" in qnn_json_obj["graph"]: + print("This json file is from Qnn converter") + qnn_input_tensor_dic = {} + qnn_output_tensor_dic = {} + parse_qnn_converter_json_file(qnn_json_obj, qnn_input_tensor_dic, qnn_output_tensor_dic) + + generate_wrapper_onnx_file( + "QnnContext", + args.qnn_json.replace(".json", "_qnn_ctx.onnx"), + qnn_input_tensor_dic, + qnn_output_tensor_dic, + args.disable_embed_mode, + args.qnn_bin, + args.quantized_IO, + ) + elif "info" in qnn_json_obj and "graphs" in qnn_json_obj["info"]: + print("This json file is extracted from QNN context binary file") + qnn_version = qnn_json_obj["info"]["buildId"] + for qnn_graph in qnn_json_obj["info"]["graphs"]: + qnn_input_tensor_dic = {} + qnn_output_tensor_dic = {} + graph_name = parse_qnn_graph(qnn_graph, qnn_input_tensor_dic, qnn_output_tensor_dic) + + ctx_file_name = graph_name + "_qnn_ctx.onnx" + if not args.quantized_IO: + ctx_file_name = ctx_file_name.replace(".onnx", "_fp32_io.onnx") + + generate_wrapper_onnx_file( + graph_name, + ctx_file_name, + qnn_input_tensor_dic, + qnn_output_tensor_dic, + args.disable_embed_mode, + args.qnn_bin, + args.quantized_IO, + qnn_version, + ) + else: + print("json file unrecoginized.") if __name__ == "__main__":