From c87f420c5bb9912f6142571f6441b196568aed89 Mon Sep 17 00:00:00 2001 From: Frank Dong Date: Wed, 23 Oct 2024 17:36:32 -0700 Subject: [PATCH] take comments --- include/onnxruntime/core/framework/op_kernel.h | 2 +- include/onnxruntime/core/graph/graph.h | 2 +- onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc | 7 +++++-- onnxruntime/core/framework/session_state.cc | 3 ++- onnxruntime/core/framework/session_state_utils.cc | 2 +- onnxruntime/core/framework/tensorprotoutils.cc | 9 ++++----- onnxruntime/core/framework/tensorprotoutils.h | 2 +- onnxruntime/core/framework/utils.cc | 4 ++-- onnxruntime/core/framework/utils.h | 2 +- onnxruntime/core/graph/graph.cc | 8 ++++---- onnxruntime/core/session/inference_session.cc | 2 +- onnxruntime/test/framework/inference_session_test.cc | 2 +- 12 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 09747aa80148a..0a74e93baa4e5 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -79,7 +79,7 @@ class OpKernel { // the allocator tied to the session if the kernel owns the pre-packed buffer or an // allocator shared between sessions if the pre-packed buffer is to be shared across sessions // (i.e.) the kernel does not own the buffer. - // @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to onnx data file. + // @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to external data file. 
// @param is_packed: Set it to true if the kernel packed the tensor or to false // The kernel is responsible for keeping the packed data and related metadata if is_packed is true, // and the original initialized constant tensor will be released and not accessible anymore in diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 14caf97e953b2..69af3c93d7a07 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1150,7 +1150,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // Since one constant initializer could be used by different kernels // and prepacked differently, use an unordered_map to store prepacked - // initializer in format of <[initializer_name], <[kernel_name], [prepacked_initializer]>> + // initializer in format of <[initializer_name], <[node_name], [prepacked_initializer]>> typedef std::unordered_map> PrePackedTensorProtoToSave; #if !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 1bcd1b372d19f..7c568a6f7547b 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -282,13 +282,16 @@ std::optional MatMulNBits::GetPrePackTensor(int input_idx) { // Inorder to cope with this logic, we need to return latest prepacked buffer and only serialize // the latest one. So, we need to always return packed_tensor_ here not only for input_B. 
ORT_UNUSED_PARAMETER(input_idx); - return std::move(packed_tensor_.value()); + return std::move(packed_tensor_); } template Status MatMulNBits::SetPrePackTensor(int input_idx, Tensor& pre_packed_tensor) { if (input_idx == 1) { - packed_b_ = BufferUniquePtr(pre_packed_tensor.MutableDataRaw()); + // pre_packed_tensor is constant initialized tensor and its lifecycle is managed by session_state, + // session_state will release memory from pre_packed_tensor. packed_b_ will not release memory so + // pass empty/default buffer deleter here. + packed_b_ = BufferUniquePtr(pre_packed_tensor.MutableDataRaw(), BufferDeleter()); } return Status::OK(); diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 40a7819ade0af..6d4ef0af2c488 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -437,7 +437,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapSetPrePackTensor(input_idx, *(constant_initialized_tensors[ort_value_idx].GetMutable()))); + ORT_THROW_IF_ERROR(kernel->SetPrePackTensor(input_idx, + *(constant_initialized_tensors[ort_value_idx].GetMutable()))); } // Caching pre-packed weights is limited to shared initializers associated with the CPU EP for now else if (is_shared_initializer && should_cache_prepacked_weights_for_shared_initializers && diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 39abdac5181e6..3424f40e79c01 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -79,7 +79,7 @@ static inline common::Status ExtDataTensorProtoToTensor(const Env& env, SafeInt ext_data_len = 0; ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto, ext_data_buf, ext_data_len, ext_data_deleter, - pre_packed_initializers_name_set, buffered_tensor)); + 
&pre_packed_initializers_name_set, buffered_tensor)); // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be // avoided if the Tensor class implements the do-nothing behavior when given a diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index d07a68ad32669..83eec9632054c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -1000,7 +1000,7 @@ static Status GetFileContent(const Env& env, const std::filesystem::path& file_p Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& model_path, const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile& pre_packed_initializers_name_set, Tensor* buffered_tensor) { + SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile* pre_packed_initializers_name_set, Tensor* buffered_tensor) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); std::basic_string tensor_proto_dir; if (!model_path.empty()) { @@ -1013,8 +1013,8 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo ORT_RETURN_IF_ERROR( GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, raw_data_safe_len, pre_packed)); - if (pre_packed) { - pre_packed_initializers_name_set.insert(tensor_proto.name()); + if (pre_packed && pre_packed_initializers_name_set != nullptr) { + (*pre_packed_initializers_name_set).insert(tensor_proto.name()); } if (external_data_file_path == onnxruntime::utils::kTensorProtoMemoryAddressTag) { @@ -1123,10 +1123,9 @@ Status TensorProtoToTensor(const Env& env, const std::filesystem::path& model_pa SafeInt raw_data_len = 0; AutoDelete deleter_for_file_data; OrtCallback& d = deleter_for_file_data.d; - SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile 
pre_packed_initializers_name_set; if (utils::HasExternalData(tensor_proto)) { - ORT_RETURN_IF_ERROR(GetExtDataFromTensorProto(env, model_path, tensor_proto, raw_data, raw_data_len, d, pre_packed_initializers_name_set)); + ORT_RETURN_IF_ERROR(GetExtDataFromTensorProto(env, model_path, tensor_proto, raw_data, raw_data_len, d, nullptr)); } else if (utils::HasRawData(tensor_proto)) { raw_data = const_cast(tensor_proto.raw_data().data()); // TODO The line above has const-correctness issues. Below is a possible fix which copies the tensor_proto data diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index 1f2288f84ebac..3a3bd1a881fce 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -159,7 +159,7 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem:: const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile& pre_packed_initializers_name_set, + SessionState::PrePackInitializers::PrePackedTensorNamesReadFromFile* pre_packed_initializers_name_set, Tensor* buffered_tensor = nullptr); // Given a tensor proto with external data obtain a tensor using the specified custom external data loader. 
diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 2a06cbd398638..5402345447706 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -1064,10 +1064,10 @@ bool IsOutputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index return false; } -std::string GetPrepackedInitializerName(const std::string& initializer_name, const std::string& kernel_name) { +std::string GetPrepackedInitializerName(const std::string& initializer_name, const std::string& node_name) { const std::string seperator = ":"; - return initializer_name + seperator + kernel_name; + return initializer_name + seperator + node_name; } } // namespace utils diff --git a/onnxruntime/core/framework/utils.h b/onnxruntime/core/framework/utils.h index e27fb7a6d833a..db38ef1675595 100644 --- a/onnxruntime/core/framework/utils.h +++ b/onnxruntime/core/framework/utils.h @@ -234,7 +234,7 @@ constexpr ONNXTensorElementDataType GetONNXTensorElementDataType() { int32_t ONNXTensorElementDataTypeToProtoTensorType(ONNXTensorElementDataType); -std::string GetPrepackedInitializerName(const std::string& initializer_name, const std::string& kernel_name); +std::string GetPrepackedInitializerName(const std::string& initializer_name, const std::string& node_name); #ifdef ENABLE_TRAINING common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context); diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 94ae6caf2ae23..3f50841f50913 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -4175,19 +4175,19 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std InlinedVector pre_packed_initializers_tensor_proto; // If this initializer has been prepacked, saved prepacked external initializer instead of original one. 
// Since one initializer could be used by multiple kernels and been prepacked differently, - Save each prepacked initializers seperately, chagne the initializer name to [initializer_name]:[kernel_name] + // Save each prepacked initializer separately, change the initializer name to [initializer_name]:[node_name] // to avoid conflict. Change the node input name accordingly. // IT could potentially make the ONNX data file larger since we store multiple prepacked initializers into disk // but this could be rare case. if (save_prepacked_constant_initializers && pre_packed_initializers.count(initializer.name())) { for (const auto& item : pre_packed_initializers[initializer.name()]) { - auto& kernel_name = item.first; - std::string prepacked_initializer_name = utils::GetPrepackedInitializerName(initializer.name(), kernel_name); + auto& node_name = item.first; + std::string prepacked_initializer_name = utils::GetPrepackedInitializerName(initializer.name(), node_name); pre_packed_initializers_tensor_proto.push_back(item.second); use_pre_packed_initializer = true; for (auto& node : *result.mutable_node()) { - if (node.name() == kernel_name) { + if (node.name() == node_name) { int input_index = 0; for (const auto& input : node.input()) { if (input == initializer.name()) { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index eef760bfe026d..e6aafaa1f2283 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2028,7 +2028,6 @@ common::Status InferenceSession::Initialize() { } SessionState::PrePackInitializers pre_packed_initializers; - Graph::PrePackedTensorProtoToSave pre_packed_initializers_tensor_proto; ORT_RETURN_IF_ERROR_SESSIONID_( session_state_->FinalizeSessionState(model_location_, kernel_registry_manager_, // need to keep the initializers if saving the optimized model @@ -2070,6 +2069,7 @@ 
align_info.align_offset = true; bool save_prepacked_constant_initializers = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers, "0") == "1" ? true : false; + Graph::PrePackedTensorProtoToSave pre_packed_initializers_tensor_proto; if (save_prepacked_constant_initializers) { LOGS(*session_logger_, WARNING) << "Serialize prepacked initializers option has been turn on." << "Use this option only when run model inference on PC with CPU." diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index daab52443c71f..99d8aa101ea2e 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -504,7 +504,7 @@ TEST(InferenceSessionTests, TestModelSerialization) { #if !ENABLE_TRAINING && !defined(USE_CUDA) && !defined(__wasm__) && !defined(USE_DNNL) && !defined(USE_QNN) && !defined(__ANDROID__) && !defined(USE_COREML) // MLAS dispatcher used in matmul_nbits kernels here is 64 bit only #if defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) -TEST(InferenceSessionTests, TestPrepackSerialization) { +TEST(InferenceSessionTests, TestPrePackSerialization) { SessionOptions so; std::string model_name = "model_with_matmul_nbits";