From 8931854528b1b2a3f320d012c78d37186fbbdab8 Mon Sep 17 00:00:00 2001
From: Hector Li
Date: Wed, 20 Dec 2023 00:13:38 -0800
Subject: [PATCH] Move some QNN EP provider options to session options (#18877)

Move QNN EP provider options to session options

### Description
The context cache feature needs session options to support multi-partition models. To smooth the transition, move the relevant provider options to session options first. This is the first step for PR https://github.com/microsoft/onnxruntime/pull/18865.
---
 .../core/session/onnxruntime_c_api.h              |  6 --
 .../onnxruntime_session_options_config_keys.h     | 15 ++++
 .../qnn/builder/onnx_ctx_model_helper.cc          |  2 +-
 .../providers/qnn/qnn_execution_provider.cc       | 34 ++++------
 onnxruntime/test/onnx/main.cc                     | 30 ++++----
 .../test/perftest/command_args_parser.cc          |  2 -
 onnxruntime/test/perftest/ort_test_session.cc     | 10 +--
 .../test/providers/qnn/qnn_basic_test.cc          | 37 +++++++---
 .../test/providers/qnn/qnn_test_utils.cc          | 20 ++++--
 .../test/providers/qnn/qnn_test_utils.h           | 28 +++++---
 .../test/providers/qnn/simple_op_htp_test.cc      | 68 ++++++++++++++-----
 onnxruntime/test/util/default_providers.cc        |  8 ++-
 .../test/util/include/default_providers.h         |  5 +-
 .../azure-pipelines/linux-qnn-ci-pipeline.yml     | 10 +--
 14 files changed, 171 insertions(+), 104 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index c41700453a73b..dbd5ad41255fa 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
  *
  * QNN supported keys:
  *   "backend_path": file path to QNN backend library.
- *   "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
- *   load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
- *   "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
 *   "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
 *   "rpc_control_latency": QNN RPC control latency.
 *   "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
 *   "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
 *   "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
- *   "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
- *   0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
- *   The path is relative path to the ONNX skeleton model file.
 *   "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
 *   dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
 *   may alter model/EP partitioning. Use only for debugging.
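> Reviewer note (not part of the patch): the keys removed from the QNN provider-options documentation above reappear as the session-wide `ep.context_*` config entries added below. A minimal caller-side migration sketch using the public C++ API; the model path and backend library name are placeholders, not values from this PR:

```cpp
// Migration sketch: context-cache knobs move from QNN provider options to
// session-wide config entries. Assumes a QNN-enabled build and that
// model.onnx / libQnnHtp.so are reachable.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ep_context_migration");
  Ort::SessionOptions so;

  // Before: provider_options["qnn_context_cache_enable"] = "1"; etc.
  // After: session config entries (the kOrtSessionOptionEpContext* keys).
  so.AddConfigEntry("ep.context_enable", "1");                  // kOrtSessionOptionEpContextEnable
  so.AddConfigEntry("ep.context_file_path", "model_ctx.onnx");  // kOrtSessionOptionEpContextFilePath
  so.AddConfigEntry("ep.context_embed_mode", "1");              // kOrtSessionOptionEpContextEmbedMode

  // backend_path remains a QNN provider option.
  so.AppendExecutionProvider("QNN", {{"backend_path", "libQnnHtp.so"}});

  Ort::Session session(env, ORT_TSTR("model.onnx"), so);
  return 0;
}
```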
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index a94973b2cc5d7..df79cb6e5b21b 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -235,3 +235,18 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
 // Use this config to control the minimum size of the initializer when externalizing it during serialization
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
+
+// Enable the EP context feature to dump the partitioned graph, which includes the EP context, into an ONNX file.
+// The dumped ONNX model with EP context can be used for later inference to avoid the EP graph partitioning/compilation overhead.
+// "0": disable. (default)
+// "1": enable.
+static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";
+
+// Specify the file path for the ONNX model which has the EP context.
+// Defaults to original_file_name_ctx.onnx if not specified.
+static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";
+
+// Flag to specify whether to dump the EP context into the ONNX model.
+// "0": dump the EP context into a separate file and keep that file name in the ONNX model.
+// "1": dump the EP context into the ONNX model. (default)
+static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";
\ No newline at end of file
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
index 234b957816662..b157396306d01 100644
--- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
+++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -160,7 +160,7 @@ bool IsContextCacheFileExists(const std::string& customer_context_cache_path,
   if (!customer_context_cache_path.empty()) {
     context_cache_path = ToPathString(customer_context_cache_path);
   } else if (!model_pathstring.empty()) {
-    context_cache_path = model_pathstring + ToPathString("_qnn_ctx.onnx");
+    context_cache_path = model_pathstring + ToPathString("_ctx.onnx");
   }
 
   return std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path);
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 60f7bbe08cb6a..c72012fd4a19b 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -114,29 +114,23 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
   if (session_options) {
     disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
        kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
-  }
-
-  static const std::string CONTEXT_CACHE_ENABLED = "qnn_context_cache_enable";
-  auto context_cache_enabled_pos = provider_options_map.find(CONTEXT_CACHE_ENABLED);
-  if (context_cache_enabled_pos != provider_options_map.end()) {
-    if (context_cache_enabled_pos->second == "1") {
-      context_cache_enabled_ = true;
-      LOGS_DEFAULT(VERBOSE) << "Context cache enabled.";
-    }
-  }
-  static const std::string CONTEXT_CACHE_PATH = "qnn_context_cache_path";
-  auto context_cache_path_pos = provider_options_map.find(CONTEXT_CACHE_PATH);
-  if (context_cache_path_pos != provider_options_map.end()) {
-    context_cache_path_cfg_ = context_cache_path_pos->second;
-    LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
-  }
+    context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionEpContextEnable, "0") == "1";
+    LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;
 
-  static const std::string CONTEXT_CACHE_EMBED_MODE = "qnn_context_embed_mode";
-  auto context_cache_embed_mode_pos = provider_options_map.find(CONTEXT_CACHE_EMBED_MODE);
-  if (context_cache_embed_mode_pos != provider_options_map.end()) {
-    qnn_context_embed_mode_ = context_cache_embed_mode_pos->second == "1";
+    std::string embed_mode = session_options->config_options.GetConfigOrDefault(
+        kOrtSessionOptionEpContextEmbedMode, "1");
+    if ("1" == embed_mode) {
+      qnn_context_embed_mode_ = true;
+    } else if ("0" == embed_mode) {
+      qnn_context_embed_mode_ = false;
+    } else {
+      LOGS_DEFAULT(VERBOSE) << "Invalid ep.context_embed_mode: " << embed_mode << ". Only 0 or 1 allowed; set to 1.";
+    }
     LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;
+
+    context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
   }
 
   static const std::string BACKEND_PATH = "backend_path";
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 646ff7c95b229..51edb91b5d3af 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -50,15 +50,12 @@ void usage() {
       "\t-a: Specify custom absolute tolerance values for output value comparison. default: 1e-5\n"
       "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
       "\t    [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
-      "\t    [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
-      "\t    [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
       "\t    [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
       "\t    [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
       "\t    [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
       "\t    [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n"
       "\t    'high_power_saver', 'low_balanced', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n"
       "\t    [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n"
-      "\t    [QNN only] [qnn_context_embed_mode]: 1 means dump the QNN context binary into the Onnx skeleton model.\n"
       "\t    0 means dump the QNN context binary into separate bin file and set the path in the Onnx skeleton model.\n"
       "\t    [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n"
       "\t    [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n"
@@ -73,6 +70,8 @@ void usage() {
       "\t    [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n"
       "\t-o [optimization level]: Default is 99. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
       "\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. "
+      "\t-f: Enable EP context cache generation.\n"
+      "\t-b: Disable EP context embed mode.\n"
       "\n"
       "\t-h: help\n"
       "\n"
@@ -179,11 +178,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_ERROR;
   bool verbose_logging_required = false;
 
+  bool ep_context_enable = false;
+  bool disable_ep_context_embed_mode = false;
   bool pause = false;
   {
     int ch;
-    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pz"))) != -1) {
+    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pzfb"))) != -1) {
       switch (ch) {
         case 'A':
           enable_cpu_mem_arena = false;
@@ -312,6 +313,12 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
         case 'z':
           set_denormal_as_zero = true;
           break;
+        case 'b':
+          disable_ep_context_embed_mode = true;
+          break;
+        case 'f':
+          ep_context_enable = true;
+          break;
         case '?':
         case 'h':
        default:
@@ -386,6 +393,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   if (set_denormal_as_zero)
     sf.AddConfigEntry(kOrtSessionOptionsConfigSetDenormalAsZero, "1");
 
+  if (ep_context_enable)
+    sf.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
+  if (disable_ep_context_embed_mode)
+    sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");
+
   if (enable_tensorrt) {
 #ifdef USE_TENSORRT
     OrtCUDAProviderOptions cuda_options;
@@ -466,12 +478,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
           if (value != "0") {
             ORT_THROW("Set to 0 to disable qnn_context_embed_mode.");
           }
-        } else if (key == "qnn_context_cache_enable") {
-          if (value != "1") {
-            ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
-          }
-        } else if (key == "qnn_context_cache_path") {
-          // no validation
         } else if (key == "profiling_level") {
           std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
           if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
@@ -507,8 +513,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
            ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
          }
        } else {
-          ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
-'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
+          ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
+'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
 'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
        }
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index 27e26fe0b3c45..6e3252aaeb4b8 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -65,8 +65,6 @@ namespace perftest {
       "\t    [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n"
       "\t    [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n"
       "\t    [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
-      "\t    [QNN only] [qnn_context_cache_enable]: 1 to enable cache QNN context. Default to false.\n"
-      "\t    [QNN only] [qnn_context_cache_path]: File path to the qnn context cache. Default to model_file.onnx.bin if not set.\n"
       "\t    [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n"
       "\t    [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n"
       "\t    [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index 6a99d6a0b0246..04c9ae1f23108 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -332,12 +332,6 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
       if (value.empty()) {
         ORT_THROW("Please provide the QNN backend path.");
       }
-    } else if (key == "qnn_context_cache_enable") {
-      if (value != "1") {
-        ORT_THROW("Set to 1 to enable qnn_context_cache_enable.");
-      }
-    } else if (key == "qnn_context_cache_path") {
-      // no validation
     } else if (key == "profiling_level") {
       std::set<std::string> supported_profiling_level = {"off", "basic", "detailed"};
       if (supported_profiling_level.find(value) == supported_profiling_level.end()) {
@@ -373,8 +367,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
         ORT_THROW("Supported qnn_context_priority: low, normal, normal_high, high");
       }
     } else {
-      ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'qnn_context_cache_enable',
-'qnn_context_cache_path', 'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
+      ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
+'profiling_level', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
 'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority'])");
     }
diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
index e30c79eca3a13..391d7bebc9589 100644
--- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
@@ -375,17 +375,36 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGeneration2InputTypes) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
-  provider_options["qnn_context_cache_enable"] = "1";
+
+  // Add kMSDomain to cover contrib op like Gelu
+  const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};
+
+  auto& logging_manager = DefaultLoggingManager();
+  logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);
+
+  onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           logging_manager.DefaultLogger());
+  Graph& graph = model.MainGraph();
+  ModelTestBuilder helper(graph);
+  BuildCastAddTestCase()(helper);
+  helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(model.MainGraph().Resolve());
+
+  // Serialize the model to a string.
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+
+  const auto model_data_span = AsByteSpan(model_data.data(), model_data.size());
+
   const std::string context_binary_file = "./qnn_context_binary_int32_fp32_inputs_test.onnx";
-  provider_options["qnn_context_cache_path"] = context_binary_file;
+  Ort::SessionOptions so;
+  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
+  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str());
 
-  RunQnnModelTest(BuildCastAddTestCase(),
-                  provider_options,
-                  13,  // opset
-                  ExpectedEPNodeAssignment::All,
-                  1e-5f,
-                  logging::Severity::kERROR,
-                  false);
+  so.AppendExecutionProvider("QNN", provider_options);
+
+  Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), so);
 
   // Make sure the Qnn context cache binary file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc
index 4c38109d30371..f5ebe45a07912 100644
--- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc
+++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc
@@ -13,6 +13,7 @@
 #include "core/common/span_utils.h"
 #include "core/framework/compute_capability.h"
 #include "core/graph/graph.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 
 namespace onnxruntime {
 namespace test {
@@ -106,24 +107,31 @@ void RunQnnModelTest(const GetTestModelFn& build_test_case, ProviderOptions prov
   TryEnableQNNSaver(provider_options);
   RunAndVerifyOutputsWithEP(AsByteSpan(model_data.data(), model_data.size()), "QNN_EP_TestLogID",
                             QnnExecutionProviderWithOptions(provider_options),
-                            helper.feeds_, verification_params, {}, verify_outputs);
+                            helper.feeds_, verification_params,
+                            {}, verify_outputs);
 }
 
 void InferenceModel(const std::string& model_data, const char* log_id,
-                    std::unique_ptr<IExecutionProvider> execution_provider,
+                    const ProviderOptions& provider_options,
                     ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
-                    std::vector<OrtValue>& output_vals) {
+                    std::vector<OrtValue>& output_vals,
+                    bool is_qnn_ep,
+                    const std::unordered_map<std::string, std::string>& session_option_pairs) {
   SessionOptions so;
   so.session_logid = log_id;
+  for (auto key_value : session_option_pairs) {
+    ASSERT_STATUS_OK(so.config_options.AddConfigEntry(key_value.first.c_str(), key_value.second.c_str()));
+  }
   RunOptions run_options;
   run_options.run_tag = so.session_logid;
 
   InferenceSessionWrapper session_object{so, GetEnvironment()};
 
   std::string provider_type = kCpuExecutionProvider;
-  if (execution_provider) {
-    provider_type = execution_provider->Type();
-    ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(execution_provider)));
+  if (is_qnn_ep) {
+    auto qnn_ep = QnnExecutionProviderWithOptions(provider_options, &so);
+    provider_type = qnn_ep->Type();
+    ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(qnn_ep)));
   }
   ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
   ASSERT_STATUS_OK(session_object.Initialize());
diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h
index 9ec0985e8130c..bfe5bab318313 100644
--- a/onnxruntime/test/providers/qnn/qnn_test_utils.h
+++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h
@@ -220,15 +220,19 @@ inline QuantParams<QType> GetTestInputQuantParams(const TestInputDef<float>& inp
  *
  * \param model_data The serialized ONNX model to inference.
  * \param log_id The logger ID.
- * \param execution_provider The EP on which to run the model. Set to nullptr for CPU EP.
+ * \param provider_options Provider options as key-value pairs.
  * \param expected_ep_assignment Describes "which nodes" should be assigned to the EP.
  * \param feeds The input feeds.
  * \param output_vals Initialized to the inference results.
+ * \param is_qnn_ep True: QNN EP is used. False: CPU EP is used (default).
+ * \param session_option_pairs Extra session options.
  */
 void InferenceModel(const std::string& model_data, const char* log_id,
-                    std::unique_ptr<IExecutionProvider> execution_provider,
+                    const ProviderOptions& provider_options,
                     ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
-                    std::vector<OrtValue>& output_vals);
+                    std::vector<OrtValue>& output_vals,
+                    bool is_qnn_ep = false,
+                    const std::unordered_map<std::string, std::string>& session_option_pairs = {});
 
 /**
  * If the ORT_UNIT_TEST_ENABLE_QNN_SAVER environment variable is enabled (set to 1), this function modifies
@@ -287,7 +291,8 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
                                  ExpectedEPNodeAssignment expected_ep_assignment,
                                  QDQTolerance tolerance = QDQTolerance(),
                                  logging::Severity log_severity = logging::Severity::kERROR,
-                                 const std::string& qnn_ctx_model_path = "") {
+                                 const std::string& qnn_ctx_model_path = "",
+                                 const std::unordered_map<std::string, std::string>& session_option_pairs = {}) {
   // Add kMSDomain to cover contrib op like Gelu
   const std::unordered_map<std::string, int> domain_to_version = {{"", opset_version}, {kMSDomain, 1}};
 
@@ -307,7 +312,7 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
 
   // Run f32 model on CPU EP and collect outputs.
   std::vector<OrtValue> cpu_f32_outputs;
-  InferenceModel(f32_model_data, "f32_model_logger", nullptr, ExpectedEPNodeAssignment::All,
+  InferenceModel(f32_model_data, "f32_model_logger", {}, ExpectedEPNodeAssignment::All,
                  f32_helper.feeds_, cpu_f32_outputs);
   ASSERT_FALSE(cpu_f32_outputs.empty());
 
@@ -344,7 +349,7 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
     ASSERT_STATUS_OK(qdq_model.MainGraph().Resolve());
     qdq_model.ToProto().SerializeToString(&qdq_model_data);
 
-    // Run QDQ model on QNN EP and collect outputs.
+    bool is_qnn_ep = true;
     TryEnableQNNSaver(qnn_options);
     std::vector<OrtValue> qnn_qdq_outputs;
     if (!qnn_ctx_model_path.empty()) {
@@ -355,18 +360,19 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
       std::string qnn_ctx_model_data;
       model_proto.SerializeToString(&qnn_ctx_model_data);
       // Run QNN context cache model on QNN EP and collect outputs.
-      InferenceModel(qnn_ctx_model_data, "qnn_ctx_model_logger", QnnExecutionProviderWithOptions(qnn_options),
-                     expected_ep_assignment, qdq_helper.feeds_, qnn_qdq_outputs);
+      InferenceModel(qnn_ctx_model_data, "qnn_ctx_model_logger", qnn_options,
+                     expected_ep_assignment, qdq_helper.feeds_, qnn_qdq_outputs, is_qnn_ep);
     } else {
       // Run QDQ model on QNN EP and collect outputs.
-      InferenceModel(qdq_model_data, "qdq_model_logger", QnnExecutionProviderWithOptions(qnn_options),
-                     expected_ep_assignment, qdq_helper.feeds_, qnn_qdq_outputs);
+      // Only need to apply the extra session options to this QDQ model inference on QNN EP.
+      InferenceModel(qdq_model_data, "qdq_model_logger", qnn_options, expected_ep_assignment,
+                     qdq_helper.feeds_, qnn_qdq_outputs, is_qnn_ep, session_option_pairs);
     }
 
     if (expected_ep_assignment != ExpectedEPNodeAssignment::None) {
       // Run QDQ model on CPU EP and collect outputs.
       std::vector<OrtValue> cpu_qdq_outputs;
-      InferenceModel(qdq_model_data, "qdq_model_logger", nullptr, ExpectedEPNodeAssignment::All,
+      InferenceModel(qdq_model_data, "qdq_model_logger", {}, ExpectedEPNodeAssignment::All,
                      qdq_helper.feeds_, cpu_qdq_outputs);
       ASSERT_EQ(cpu_qdq_outputs.size(), num_outputs);
       ASSERT_EQ(qnn_qdq_outputs.size(), num_outputs);
diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
index 39733f50482a6..8ff65c08e8633 100644
--- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -8,6 +8,7 @@
 #include <string>
 #include "core/graph/graph.h"
 #include "core/graph/node_attr_utils.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
@@ -733,9 +734,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheEmbedModeTest) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
-  provider_options["qnn_context_cache_enable"] = "1";
   const std::string context_binary_file = "./qnn_context_binary_test.onnx";
-  provider_options["qnn_context_cache_path"] = context_binary_file;
+
+  std::unordered_map<std::string, std::string> session_option_pairs;
+  session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1");
+  session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file);
 
   const TestInputDef<float> input_def({1, 2, 3}, false, -10.0f, 10.0f);
   const std::string op_type = "Atan";
@@ -746,7 +749,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheEmbedModeTest) {
                        BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {}),
                        provider_options,
                        14,
-                       ExpectedEPNodeAssignment::All);
+                       ExpectedEPNodeAssignment::All,
+                       QDQTolerance(),
+                       logging::Severity::kERROR,
+                       "",  // context model file path, not required for this inference
+                       session_option_pairs);
 
   // Make sure the Qnn context cache binary file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
@@ -756,7 +763,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheEmbedModeTest) {
                        BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {}),
                        provider_options,
                        14,
-                       ExpectedEPNodeAssignment::All);
+                       ExpectedEPNodeAssignment::All,
+                       QDQTolerance(),
+                       logging::Severity::kERROR,
+                       "",  // context model file path, not required for this inference
+                       session_option_pairs);
 
   // 3rd run directly loads and run from Qnn context cache model
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, {}),
@@ -780,10 +791,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheNonEmbedModeTest) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
-  provider_options["qnn_context_cache_enable"] = "1";
   const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx";
-  provider_options["qnn_context_cache_path"] = context_binary_file;
-  provider_options["qnn_context_embed_mode"] = "0";
+  std::unordered_map<std::string, std::string> session_option_pairs;
+  session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1");
+  session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file);
+  session_option_pairs.emplace(kOrtSessionOptionEpContextEmbedMode, "0");
 
   const TestInputDef<float> input_def({1, 2, 3}, false, -10.0f, 10.0f);
   const std::string op_type = "Atan";
@@ -794,7 +806,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheNonEmbedModeTest) {
                        BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {}),
                        provider_options,
                        14,
-                       ExpectedEPNodeAssignment::All);
+                       ExpectedEPNodeAssignment::All,
+                       QDQTolerance(),
+                       logging::Severity::kERROR,
+                       "",  // context model file path, not required for this inference
+                       session_option_pairs);
 
   // Check the Onnx skeleton file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
@@ -806,7 +822,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheNonEmbedModeTest) {
                        BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {}),
                        provider_options,
                        14,
-                       ExpectedEPNodeAssignment::All);
+                       ExpectedEPNodeAssignment::All,
+                       QDQTolerance(),
+                       logging::Severity::kERROR,
+                       "",  // context model file path, not required for this inference
+                       session_option_pairs);
 
   // 3rd run directly loads and run from Onnx skeleton file + Qnn context cache binary file
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, {}),
@@ -829,10 +849,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCache_InvalidGraph) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
-  provider_options["qnn_context_cache_enable"] = "1";
   const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx";
-  provider_options["qnn_context_cache_path"] = context_binary_file;
-  provider_options["qnn_context_embed_mode"] = "0";
+  std::unordered_map<std::string, std::string> session_option_pairs;
+  session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1");
+  session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file);
+  session_option_pairs.emplace(kOrtSessionOptionEpContextEmbedMode, "0");
 
   const TestInputDef<float> input_def({1, 2, 3}, false, -10.0f, 10.0f);
   const std::string op_type = "Atan";
@@ -843,7 +864,11 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCache_InvalidGraph) {
                        BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {}),
                        provider_options,
                        14,
-                       ExpectedEPNodeAssignment::All);
+                       ExpectedEPNodeAssignment::All,
+                       QDQTolerance(),
+                       logging::Severity::kERROR,
+                       "",  // context model file path, not required for this inference
+                       session_option_pairs);
 
   // Check the Onnx skeleton file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
@@ -886,9 +911,10 @@ TEST_F(QnnHTPBackendTests, ContextBinary2InputsTest) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
-  provider_options["qnn_context_cache_enable"] = "1";
   const std::string context_binary_file = "./qnn_context_binary_2inputs_test.onnx";
-  provider_options["qnn_context_cache_path"] = context_binary_file;
+  std::unordered_map<std::string, std::string> session_option_pairs;
+  session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1");
+  session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file);
 
   const TestInputDef<float> input_def1({1, 2, 3}, false, -10.0f, 10.0f);
   const TestInputDef<float> input_def2({1, 2, 3}, false, -10.0f, 10.0f);
@@ -900,7 +926,11 @@ TEST_F(QnnHTPBackendTests, ContextBinary2InputsTest) {
                        BuildQDQOpTestCase<uint8_t>(op_type, {input_def1, input_def2}, {}, {}),
                        provider_options,
                        14,
-                       ExpectedEPNodeAssignment::All);
+                       ExpectedEPNodeAssignment::All,
+                       QDQTolerance(),
+                       logging::Severity::kERROR,
+                       "",  // context model file path, not required for this inference
+                       session_option_pairs);
 
   // Make sure the Qnn context cache binary file is generated
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
@@ -910,7 +940,11 @@ TEST_F(QnnHTPBackendTests, ContextBinary2InputsTest) {
                        BuildQDQOpTestCase<uint8_t>(op_type, {input_def1, input_def2}, {}, {}),
                        provider_options,
                        14,
-                       ExpectedEPNodeAssignment::All);
+                       ExpectedEPNodeAssignment::All,
+                       QDQTolerance(),
+                       logging::Severity::kERROR,
+                       "",  // context model file path, not required for this inference
+                       session_option_pairs);
 
   // 3rd run directly loads and run from Qnn context cache model
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def1, input_def2}, {}, {}),
diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc
index 65646a7286719..4468a64d18258 100644
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@@ -9,8 +9,10 @@
 #include "core/providers/coreml/coreml_provider_factory.h"
 #endif
 #include "core/session/onnxruntime_cxx_api.h"
+#include "core/framework/session_options.h"
 
 namespace onnxruntime {
+
 namespace test {
 
 std::unique_ptr<IExecutionProvider> DefaultCpuExecutionProvider(bool enable_arena) {
@@ -242,11 +244,13 @@ std::unique_ptr<IExecutionProvider> DefaultQnnExecutionProvider() {
 #endif
 }
 
-std::unique_ptr<IExecutionProvider> QnnExecutionProviderWithOptions(const ProviderOptions& options) {
+std::unique_ptr<IExecutionProvider> QnnExecutionProviderWithOptions(const ProviderOptions& options,
+                                                                    const SessionOptions* session_options) {
 #ifdef USE_QNN
-  return QNNProviderFactoryCreator::Create(options, nullptr)->CreateProvider();
+  return QNNProviderFactoryCreator::Create(options, session_options)->CreateProvider();
 #else
   ORT_UNUSED_PARAMETER(options);
+  ORT_UNUSED_PARAMETER(session_options);
   return nullptr;
 #endif
 }
diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h
index 1325f7aa43dbb..9f78e0a0d4eb2 100644
--- a/onnxruntime/test/util/include/default_providers.h
+++ b/onnxruntime/test/util/include/default_providers.h
@@ -8,6 +8,8 @@
 
 namespace onnxruntime {
 
+struct SessionOptions;
+
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_ACL(int use_arena);
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_ArmNN(int use_arena);
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_CoreML(uint32_t);
@@ -52,7 +54,8 @@ std::unique_ptr<IExecutionProvider> DefaultRocmExecutionProvider(bool test_tunab
 std::unique_ptr<IExecutionProvider> DefaultCoreMLExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultSnpeExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultQnnExecutionProvider();
-std::unique_ptr<IExecutionProvider> QnnExecutionProviderWithOptions(const ProviderOptions& options);
+std::unique_ptr<IExecutionProvider> QnnExecutionProviderWithOptions(const ProviderOptions& options,
+                                                                    const SessionOptions* session_options = nullptr);
 std::unique_ptr<IExecutionProvider> DefaultXnnpackExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultCannExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultDmlExecutionProvider();
diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml
index d21b917cbd10e..07e69ff496720 100644
--- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml
@@ -110,13 +110,5 @@ jobs:
     inputs:
       script: |
         ./build/Release/onnx_test_runner -e qnn \
-          -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnHtp.so qnn_context_cache_enable|1 qnn_context_cache_path|./build/Release/mobilenet_qdq.onnx_qnn_ctx.onnx" \
-          /data/qdq_models/mobilenetv2-1.0_add_transpose_quant
-
-  - task: CmdLine@2
-    displayName: Run QDQ model tests with load from cached context
-    inputs:
-      script: |
-        ./build/Release/onnx_test_runner -e qnn \
-          -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnHtp.so qnn_context_cache_enable|1 qnn_context_cache_path|./build/Release/mobilenet_qdq.onnx_qnn_ctx.onnx" \
+          -v -f -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnHtp.so" \
           /data/qdq_models/mobilenetv2-1.0_add_transpose_quant
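> Reviewer note (not part of the patch): the updated CI step above now exercises the new flow end to end, since `-f` maps to `ep.context_enable=1` and `-b` to `ep.context_embed_mode=0` in onnx_test_runner. A hedged sketch of the same two-pass flow through the C++ API; file names are placeholders, and the `<model>_ctx.onnx` default follows the `IsContextCacheFileExists()` change in this PR:

```cpp
// Two-pass sketch: compile once and dump the EP-context model, then reload it.
// Assumes a QNN-enabled build; model.onnx and libQnnHtp.so are placeholders.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_ctx_cache");

  // Pass 1: compile model.onnx on QNN and dump model.onnx_ctx.onnx (plus a
  // separate context .bin, since embed mode is disabled).
  {
    Ort::SessionOptions so;
    so.AddConfigEntry("ep.context_enable", "1");      // onnx_test_runner -f
    so.AddConfigEntry("ep.context_embed_mode", "0");  // onnx_test_runner -b
    so.AppendExecutionProvider("QNN", {{"backend_path", "libQnnHtp.so"}});
    Ort::Session compile_once(env, ORT_TSTR("model.onnx"), so);
  }

  // Pass 2: later sessions load the pre-compiled EP-context model directly,
  // skipping QNN graph partitioning/compilation.
  Ort::SessionOptions so;
  so.AppendExecutionProvider("QNN", {{"backend_path", "libQnnHtp.so"}});
  Ort::Session session(env, ORT_TSTR("model.onnx_ctx.onnx"), so);
  return 0;
}
```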