diff --git a/.lintrunner.toml b/.lintrunner.toml index c44a66200ad1b..4e5d077b08ff4 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -45,6 +45,7 @@ exclude_patterns = [ 'cmake/external/**', # ignore generated flatbuffers code 'onnxruntime/core/flatbuffers/ort_flatbuffers_py/**', + 'orttraining/orttraining/python/training/optim/_ds_code_store.py', ] command = [ 'python', @@ -76,6 +77,7 @@ exclude_patterns = [ 'cmake/**', 'orttraining/*', 'onnxruntime/core/flatbuffers/**', + 'orttraining/orttraining/python/training/optim/_ds_code_store.py', ] command = [ 'python', diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 08ca90d7c3b7f..f9f2fbdab7b10 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -2,6 +2,36 @@ "$schema": "https://json.schemastore.org/component-detection-manifest.json", "Version": 1, "Registrations": [ + { + "component": { + "type": "git", + "git": { + "commitHash": "a896e3d066448b3530dbcaa48869fafefd738f57", + "repositoryUrl": "https://github.com/emscripten-core/emsdk.git" + }, + "comments": "git submodule at cmake/external/emsdk" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "7a2ed51a6b682a83e345ff49fc4cfd7ca47550db", + "repositoryUrl": "https://github.com/google/libprotobuf-mutator.git" + }, + "comments": "git submodule at cmake/external/libprotobuf-mutator" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "0c296085f9f65f0f8ef7aec7b9eed55faf37dc40", + "repositoryUrl": "https://github.com/onnx/onnx.git" + }, + "comments": "git submodule at cmake/external/onnx" + } + }, { "component": { "type": "git", @@ -166,7 +196,7 @@ "component": { "type": "git", "git": { - "commitHash": "fdefbe85ed9c362b95b9b401cd19db068a76141f", + "commitHash": "6a20ba82b439ea1fd650da4d389e96b60a1dd828", "repositoryUrl": "https://github.com/onnx/onnx.git" }, "comments": "onnx" diff --git a/cmake/deps.txt b/cmake/deps.txt index 7cf49f02333a4..26fd35075c4b9 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -24,7 +24,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -onnx;https://github.com/onnx/onnx/archive/14303de049144035dfd94ace5f7a3b44773b1aad.zip;250eab9690392b248d75b56e605fb49eca373442 +onnx;https://github.com/onnx/onnx/archive/6a20ba82b439ea1fd650da4d389e96b60a1dd828.zip;179a22ad4cd67109c60031ae4b6cf2f434d8bd7e #use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459) onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/0462dc31ae78f48744b6141ae376df1f96d3f459.zip;5ff086361956cceb81ed17453a1fd8db2aa4328d protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index e2525550194ce..6a20ba82b439e 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit e2525550194ce3d8a2c4a3af451c9d9b3ae6650e +Subproject commit 6a20ba82b439ea1fd650da4d389e96b60a1dd828 diff --git a/cmake/onnxruntime_mlas.cmake 
b/cmake/onnxruntime_mlas.cmake index 992908392c946..a62b1b259d109 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -339,12 +339,18 @@ else() set(mlas_platform_srcs ${mlas_platform_srcs} ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S + ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S + ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S ${MLAS_SRC_DIR}/activate_fp16.cpp ${MLAS_SRC_DIR}/dwconv.cpp ${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp ${MLAS_SRC_DIR}/pooling_fp16.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp ) set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") + set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ") + set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ") set_source_files_properties(${MLAS_SRC_DIR}/activate_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ") diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index fe3e577b4fc36..de1458c120016 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -54,6 +54,11 @@ set(contrib_ops_excluded_files "quantization/attention_quantization_impl.cuh" "quantization/dequantize_blockwise.cuh" "quantization/dequantize_blockwise.cu" + "quantization/dequantize_blockwise_bnb4.cuh" + "quantization/dequantize_blockwise_bnb4.cu" + "quantization/matmul_bnb4.cc" + "quantization/matmul_bnb4.cuh" + "quantization/matmul_bnb4.cu" "quantization/matmul_nbits.cc" "quantization/matmul_nbits.cuh" "quantization/matmul_nbits.cu" diff --git a/dockerfiles/Dockerfile.source b/dockerfiles/Dockerfile.source index 110e484e77d21..5822a805c674e 100644 --- a/dockerfiles/Dockerfile.source +++ b/dockerfiles/Dockerfile.source @@ -8,13 +8,14 @@ FROM mcr.microsoft.com/cbl-mariner/base/python:3 MAINTAINER Changming Sun "chasun@microsoft.com" ADD . 
/code -RUN tdnf install -y tar ca-certificates build-essential python3-numpy cmake python3-setuptools python3-wheel python3-pip curl python3-devel +RUN tdnf install -y tar ca-certificates build-essential cmake curl python3-devel python3-setuptools python3-wheel python3-pip python3-numpy python3-flatbuffers python3-packaging python3-protobuf +# The latest cmake version in Mariner2 is 3.21, but we need 3.26+ RUN /code/dockerfiles/scripts/install_cmake.sh # Prepare onnxruntime repository & build onnxruntime -RUN cd /code && python3 -m pip install -r tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt && /bin/bash ./build.sh --allow_running_as_root --skip_submodule_sync --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) +RUN cd /code && /bin/bash ./build.sh --allow_running_as_root --skip_submodule_sync --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) FROM mcr.microsoft.com/cbl-mariner/base/python:3 COPY --from=0 /code/build/Linux/Release/dist /root COPY --from=0 /code/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt -RUN tdnf install -y ca-certificates python3-setuptools python3-wheel python3-pip && python3 -m pip install /root/*.whl && rm -rf /root/*.whl +RUN tdnf install -y ca-certificates python3-setuptools python3-wheel python3-pip python3-numpy python3-flatbuffers python3-packaging python3-protobuf python3-mpmath python3-sympy && python3 -m pip install coloredlogs humanfriendly && python3 -m pip install --no-index --find-links /root onnxruntime && rm -rf /root/*.whl diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 7e67ec6d0c94e..1a76c18a6a8e0 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -47,6 +47,7 @@ Do not modify directly.* * com.microsoft.Inverse * com.microsoft.Irfft * com.microsoft.LongformerAttention + * com.microsoft.MatMulBnb4 * com.microsoft.MatMulFpQ4 * com.microsoft.MatMulInteger16 * com.microsoft.MatMulIntegerToFloat @@ -90,6 +91,7 @@ Do not modify directly.* * com.microsoft.RemovePadding * com.microsoft.RestorePadding * com.microsoft.Rfft + * com.microsoft.RotaryEmbedding * com.microsoft.SampleOp * com.microsoft.Sampling * com.microsoft.SkipLayerNormalization @@ -2503,6 +2505,62 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.MatMulBnb4** + + MatMulBnb4 is a MatMul whose weight is quantized to 4 bits using either the FP4 or NF4 data type (https://arxiv.org/pdf/2305.14314.pdf). It performs matrix multiplication like MatMul (https://github.com/onnx/onnx/blob/main/docs/Operators.md#matmul) with the following differences: + 1. Input B is a 2D constant matrix. Its input feature count and output feature count are specified by attributes 'K' and 'N'. + 2. Input B is quantized to 4 bits with the quantization data type specified by attribute 'quant_type'. It is transposed, flattened and quantized block-wise with the block size specified by attribute 'block_size'. + block_size is not an arbitrary number; it must be a power of 2 and not smaller than 16, e.g. 16, 32, 64, 128, ... + 3. Input B's quantization constants (scales) are specified by input 'absmax'. + + Input B is stored as uint8_t with shape: [(N * K + 1) / 2]. + Input absmax is stored in the same type as the original type of B (float32 or float16) with shape: [(N * K + block_size - 1) / block_size].
+ +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Attributes + +
+
K : int (required)
+
number of input features (the K dimension of the matmul)
+
N : int (required)
+
number of output features (the N dimension of the matmul)
+
block_size : int (required)
+
group (block) size used for weight quantization. It must be a power of 2 and not smaller than 16.
+
quant_type : int (required)
+
quantization data type. 0 for FP4, 1 for NF4.
+
+ +#### Inputs + +
+
A : T1
+
The input tensor, not quantized
+
B : T2
+
1-dimensional tensor containing the quantized weight data
+
absmax : T1
+
quantization constants
+
+ +#### Outputs + +
+
Y : T1
+
The output tensor. It has the same rank as the input.
+
+ +#### Type Constraints + +
+
T1 : tensor(float), tensor(float16)
+
Constrain input and output types to float/half_float tensors.
+
T2 : tensor(uint8)
+
Constrain quantized weight types to uint8.
+
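To make the B/absmax layout described above concrete, here is a minimal NumPy sketch of blockwise 4-bit dequantization followed by the matmul. It is illustrative only: the 16-entry FP4/NF4 code book `lut` is supplied by the caller rather than hard-coded, and the nibble packing order is an assumption, not taken from the kernel source.

```python
import numpy as np

def dequantize_bnb4(b_packed, absmax, lut, n, k, block_size):
    # b_packed: uint8, shape [(n*k + 1) // 2], two 4-bit codes per byte.
    # absmax:   per-block scales, shape [ceil(n*k / block_size)].
    # lut:      16-entry FP4 or NF4 code book (float array indexed by the 4-bit code).
    total = n * k
    codes = np.empty(b_packed.size * 2, dtype=np.uint8)
    codes[0::2] = b_packed >> 4        # assumed: first element in the high nibble
    codes[1::2] = b_packed & 0x0F
    codes = codes[:total]
    values = lut[codes].astype(np.float32)
    values *= absmax[np.arange(total) // block_size]   # per-block scaling
    # B was transposed before flattening, so the flat buffer reshapes to (N, K).
    return values.reshape(n, k)

def matmul_bnb4(a, b_packed, absmax, lut, n, k, block_size):
    # Y = A @ dequant(B)^T, i.e. plain MatMul with B restored to shape (K, N).
    return a @ dequantize_bnb4(b_packed, absmax, lut, n, k, block_size).T
```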
+ + ### **com.microsoft.MatMulFpQ4** Matrix product with right hand matrix being pre-packed and quantized int4 data blob. @@ -2834,7 +2892,7 @@ This version of the operator has been available since version 1 of the 'com.micr
bias (optional) : T
Bias tensor with shape (hidden_size + hidden_size + v_hidden_size) from input projection
key_padding_mask (optional) : M
-
Key padding mask with shape (batch_size) or (3 * batch_size + 2) or (batch_size, kv_sequence_length)
+
Key padding mask with shape (batch_size), (3 * batch_size + 2), (batch_size, kv_sequence_length), (batch_size, total_sequence_length), or (batch_size, sequence_length, total_sequence_length)
relative_position_bias (optional) : T
relative position bias: addition to QxK' with shape (batch_size, num_heads, sequence_length, total_sequence_length) or (1, num_heads, sequence_length, total_sequence_length)
past_key (optional) : T
@@ -4796,6 +4854,54 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.RotaryEmbedding** + + RotaryEmbedding is the implementation of rotary positional embeddings (RoPE). The positions are represented as rotation matrices + that are multiplied to query and key before the inner product of query and key is taken. + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Attributes + +
+
interleaved : int
+
Rotate using interleaved pattern. Default value is 0 (False).
+
scale : float
+
Custom scale will be used if specified. Default value is 1.0.
+
+ +#### Inputs + +
+
input : T
+
3D tensor with shape (batch_size, sequence_length, hidden_size)
+
position_ids : M
+
1D tensor with shape (1) or 2D tensor with shape (batch_size, sequence_length)
+
cos_cache : T
+
2D tensor with shape (max_sequence_length, head_size / 2).
+
sin_cache : T
+
2D tensor with shape (max_sequence_length, head_size / 2).
+
+ +#### Outputs + +
+
output : T
+
3D tensor with shape (batch_size, sequence_length, hidden_size)
+
+ +#### Type Constraints + +
+
T : tensor(float), tensor(float16)
+
Constrain input and output types to float tensors.
+
M : tensor(int64)
+
Constrain position_ids to integer tensors.
+
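As a cross-check of the rotation rule documented above (and matching the per-element logic of the CPU kernel added later in this diff), the following NumPy sketch applies the cos/sin caches in both the non-interleaved and interleaved layouts. It is a reference sketch under the documented shapes, not the shipped kernel; the optional scale attribute is not modeled here because the kernel only uses it when regenerating the caches, which is not yet implemented.

```python
import numpy as np

def rotary_embedding(x, position_ids, cos_cache, sin_cache, num_heads, interleaved=False):
    # x: (batch, seq, hidden) with hidden = num_heads * head_size
    # position_ids: shape (1,) or (batch, seq), int64
    # cos_cache, sin_cache: (max_sequence_length, head_size // 2)
    batch, seq, hidden = x.shape
    head_size = hidden // num_heads
    half = head_size // 2
    xh = x.reshape(batch, seq, num_heads, head_size)

    # Resolve per-token positions for both position_ids formats.
    if position_ids.size == 1:
        pos = np.broadcast_to(position_ids.reshape(-1)[0] + np.arange(seq), (batch, seq))
    else:
        pos = position_ids

    cos = cos_cache[pos][:, :, None, :]   # (batch, seq, 1, half), broadcast over heads
    sin = sin_cache[pos][:, :, None, :]

    out = np.empty_like(xh)
    if interleaved:
        # Rotate adjacent pairs (x0, x1), (x2, x3), ...
        x_even, x_odd = xh[..., 0::2], xh[..., 1::2]
        out[..., 0::2] = x_even * cos - x_odd * sin
        out[..., 1::2] = x_odd * cos + x_even * sin
    else:
        # Rotate the first half of each head against the second half.
        x_lo, x_hi = xh[..., :half], xh[..., half:]
        out[..., :half] = x_lo * cos - x_hi * sin
        out[..., half:] = x_hi * cos + x_lo * sin
    return out.reshape(batch, seq, hidden)
```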
+ + ### **com.microsoft.SampleOp** Sample echo operator. diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index e2d500006b05f..84249df92231b 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -25,6 +25,7 @@ Do not modify directly.* |||13|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||[7, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |Affine|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float)| +|AffineGrid|*in* theta:**T1**
*in* size:**T2**
*out* grid:**T1**|20+|**T1** = tensor(double), tensor(float)
**T2** = tensor(int64)| |And|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)
**T1** = tensor(bool)| |ArgMax|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int8), tensor(uint8)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int8), tensor(uint8)| @@ -156,8 +157,10 @@ Do not modify directly.* |||[1, 10]|**B** = tensor(bool)
**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |ImageScaler|*in* input:**T**
*out* output:**T**|1+|**T** = tensor(float)| |InstanceNormalization|*in* input:**T**
*in* scale:**T**
*in* B:**T**
*out* output:**T**|6+|**T** = tensor(float)| -|IsInf|*in* X:**T1**
*out* Y:**T2**|10+|**T1** = tensor(double), tensor(float)
**T2** = tensor(bool)| -|IsNaN|*in* X:**T1**
*out* Y:**T2**|13+|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| +|IsInf|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| +|||[10, 19]|**T1** = tensor(double), tensor(float)
**T2** = tensor(bool)| +|IsNaN|*in* X:**T1**
*out* Y:**T2**|20+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz)
**T2** = tensor(bool)| +|||[13, 19]|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| |||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| |LRN|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(float)| |||[1, 12]|**T** = tensor(float)| @@ -454,6 +457,7 @@ Do not modify directly.* |GreedySearch|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* repetition_penalty:**T**
*in* vocab_mask:**I**
*in* prefix_vocab_mask:**I**
*in* attention_mask:**I**
*out* sequences:**I**|1+|**T** = tensor(float)| |GridSample|*in* X:**T1**
*in* Grid:**T1**
*out* Y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(float)| |Inverse|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| +|MatMulBnb4|*in* A:**T1**
*in* B:**T2**
*in* absmax:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)| |MatMulFpQ4|*in* A:**T1**
*in* B:**T2**
*in* B_shape:**T3**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)
**T3** = tensor(int64)| |MatMulInteger16|*in* A:**T1**
*in* B:**T2**
*out* Y:**T3**|1+|**T1** = tensor(int16)
**T2** = tensor(int16)
**T3** = tensor(int32)| |MatMulIntegerToFloat|*in* A:**T1**
*in* B:**T2**
*in* a_scale:**T3**
*in* b_scale:**T3**
*in* a_zero_point:**T1**
*in* b_zero_point:**T2**
*in* bias:**T3**
*out* Y:**T3**|1+|**T1** = tensor(int8), tensor(uint8)
**T2** = tensor(int8), tensor(uint8)
**T3** = tensor(float)| @@ -477,9 +481,11 @@ Do not modify directly.* |QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(int16), tensor(int8), tensor(uint16), tensor(uint8)| |QuickGelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float)| |Range|*in* start:**T**
*in* limit:**T**
*in* delta:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64)| +|RotaryEmbedding|*in* input:**T**
*in* position_ids:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*out* output:**T**|1+|**M** = tensor(int64)
**T** = tensor(float)| |SampleOp|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(float)| |Sampling|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* repetition_penalty:**T**
*in* vocab_mask:**I**
*in* prefix_vocab_mask:**I**
*in* attention_mask:**I**
*in* presence_mask:**I**
*in* seed:**I**
*out* sequences:**I**
*out* filtered_logits:**T**|1+|**T** = tensor(float)| |SkipLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* beta:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(double), tensor(float)| +|SkipSimplifiedLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(double), tensor(float)| |SparseToDenseMatMul|*in* A:**T**
*in* B:**T1**
*out* Y:**T1**|1+|**T** = sparse_tensor(double), sparse_tensor(float), sparse_tensor(int32), sparse_tensor(int64), sparse_tensor(uint32), sparse_tensor(uint64)
**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| |Tokenizer|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(string)| |TransposeMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float)| @@ -847,6 +853,7 @@ Do not modify directly.* |Inverse|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |Irfft|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |LongformerAttention|*in* input:**T**
*in* weight:**T**
*in* bias:**T**
*in* mask:**T**
*in* global_weight:**T**
*in* global_bias:**T**
*in* global:**G**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|MatMulBnb4|*in* A:**T1**
*in* B:**T2**
*in* absmax:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| |MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T2**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)| |NGramRepeatBlock|*in* input_ids:**Tid**
*in* scores:**T**
*out* scores_out:**T**|1+|**T** = tensor(float)
**Tid** = tensor(int64)| @@ -866,6 +873,7 @@ Do not modify directly.* |RemovePadding|*in* input:**T**
*in* sequence_token_count:**M**
*out* output:**T**
*out* token_offset:**M**
*out* cumulated_seq_len:**M**
*out* max_seq_len:**M**|1+|**T** = tensor(float), tensor(float16)| |RestorePadding|*in* input:**T**
*in* token_offset:**M**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |Rfft|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| +|RotaryEmbedding|*in* input:**T**
*in* position_ids:**M**
*in* cos_cache:**T**
*in* sin_cache:**T**
*out* output:**T**|1+|**M** = tensor(int64)
**T** = tensor(float), tensor(float16)| |Sampling|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* repetition_penalty:**T**
*in* vocab_mask:**I**
*in* prefix_vocab_mask:**I**
*in* attention_mask:**I**
*in* presence_mask:**I**
*in* seed:**I**
*out* sequences:**I**
*out* filtered_logits:**T**|1+|**T** = tensor(float), tensor(float16)| |SkipLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* beta:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)| |SkipSimplifiedLayerNormalization|*in* input:**T**
*in* skip:**T**
*in* gamma:**T**
*in* bias:**T**
*out* output:**T**
*out* mean:**U**
*out* inv_std_var:**U**
*out* input_skip_bias_sum:**T**|1+|**T** = tensor(float), tensor(float16)| diff --git a/include/onnxruntime/core/framework/float8.h b/include/onnxruntime/core/framework/float8.h index 0fd04f28d44b7..dd607cbbc6952 100644 --- a/include/onnxruntime/core/framework/float8.h +++ b/include/onnxruntime/core/framework/float8.h @@ -208,9 +208,10 @@ struct Float8E4M3FNUZ { val = static_cast((b & 0x80000000) >> 24); // sign if ((b & 0x7fffffff) == 0x7f800000) { // infinity if (saturate) { + // the highest available value val |= 0x7F; } else { - // infinity + // NaN val = 0x80; } } else if ((b & 0x7F800000) == 0x7F800000) { // NaN @@ -362,8 +363,10 @@ struct Float8E5M2 { val = (b & 0x80000000) >> 24; // sign if ((b & 0x7FFFFFFF) == 0x7F800000) { // inf if (saturate) { + // the highest available value val |= 0x7B; } else { + // the infinity val |= 0x7C; } } else if ((b & 0x7F800000) == 0x7F800000) { // NaN diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index f153e88909b8d..462d410e13769 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -83,10 +84,10 @@ class Node { gsl::span output_args, const NodeAttributes* attributes, std::string_view domain) { - Init(std::string{name}, std::string{op_type}, std::string{description}, - std::vector{input_args.begin(), input_args.end()}, - std::vector{output_args.begin(), output_args.end()}, - attributes, std::string{domain}); + Init(name, op_type, description, + input_args, + output_args, + attributes, domain); } #endif @@ -563,13 +564,13 @@ class Node { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Node); #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) - void Init(const std::string& name, - const std::string& op_type, - const std::string& description, - const std::vector& input_args, - const std::vector& output_args, + void Init(std::string_view name, + std::string_view op_type, + std::string_view description, + gsl::span input_args, + gsl::span output_args, const NodeAttributes* attributes, - const std::string& domain); + std::string_view domain); #endif #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1141,8 +1142,22 @@ class Graph { */ Status InlineFunction(Node& node); + /** + Directly insert the nodes in the function proto provided into the graph. + The function converts Constant nodes into the initializers in the graph. + It then creates a node in the graph for each of the function nodes. + All of the names are expected to be specialized, and, therefore unique. + See function_utils::Specialize(). + + The Graph needs to be Resolve()d after this call. + @param func_to_inline + @returns Status indicating success or providing an error message. + */ + + Status InlineFunctionProto(const ONNX_NAMESPACE::FunctionProto& func_to_inline); + /** Mark a NodeArg name as coming from the outer scope when programmatically constructing a Graph that will - be used as a GraphProto attribute in another Node.. + be used as a GraphProto attribute in another Node. e.g. when creating a Graph instance that will be used as a subgraph in a control flow operator, it is necessary to define placeholder NodeArgs for outer scope values. This prevents these values from becoming explicit graph inputs when the Graph is resolved. 
@@ -1391,6 +1406,13 @@ class Graph { Node& AddNode(const ONNX_NAMESPACE::NodeProto& node_proto, const ArgNameToTypeMap& name_to_type); + /** Helper that converts and adds constant node proto to an initializer in the graph. + @param constant_node_proto Constant node to convert + @param new_name use the new name for the initializer. + */ + Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto, + std::optional new_name); + #endif Version IrVersion() const noexcept { diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 13c176dad3cc5..646f33ed952a4 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -19,6 +19,7 @@ struct CudaContext : public CustomOpContext { cudaStream_t cuda_stream = {}; cudnnHandle_t cudnn_handle = {}; cublasHandle_t cublas_handle = {}; + OrtAllocator* deferred_cpu_allocator = {}; void Init(const OrtKernelContext& kernel_ctx) override { const auto& ort_api = Ort::GetApi(); @@ -44,6 +45,36 @@ struct CudaContext : public CustomOpContext { ORT_CXX_API_THROW("failed to fetch cublas handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); } cublas_handle = reinterpret_cast(resource); + + resource = {}; + status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::deferred_cpu_allocator_t, &resource); + if (status) { + ORT_CXX_API_THROW("failed to fetch deferred cpu allocator", OrtErrorCode::ORT_RUNTIME_EXCEPTION); + } + deferred_cpu_allocator = reinterpret_cast(resource); + } + + void* AllocDeferredCpuMem(size_t size) const { + if (0 == size) { + return {}; + } + const auto& ort_api = Ort::GetApi(); + void* mem = {}; + auto status = ort_api.AllocatorAlloc(deferred_cpu_allocator, size, &mem); + if (status) { + ORT_CXX_API_THROW("failed to allocate deferred cpu memory", OrtErrorCode::ORT_RUNTIME_EXCEPTION); + } + return mem; + } + + void FreeDeferredCpuMem(void* mem) const { + if (mem) { + const auto& ort_api = Ort::GetApi(); + auto status = ort_api.AllocatorFree(deferred_cpu_allocator, mem); + if (status) { + ORT_CXX_API_THROW("failed to free deferred cpu memory", OrtErrorCode::ORT_RUNTIME_EXCEPTION); + } + } } }; diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index e46fc5b4219dd..8c3ed46ade6a1 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -3,10 +3,11 @@ #include "core/providers/resource.h" -#define ORT_CUDA_RESOUCE_VERSION 1 +#define ORT_CUDA_RESOUCE_VERSION 2 enum CudaResource : int { cuda_stream_t = cuda_resource_offset, cudnn_handle_t, - cublas_handle_t + cublas_handle_t, + deferred_cpu_allocator_t, }; \ No newline at end of file diff --git a/include/onnxruntime/core/providers/dml/dml_provider_factory.h b/include/onnxruntime/core/providers/dml/dml_provider_factory.h index dd4ffb835d51c..cf3ddc3f125f9 100644 --- a/include/onnxruntime/core/providers/dml/dml_provider_factory.h +++ b/include/onnxruntime/core/providers/dml/dml_provider_factory.h @@ -128,7 +128,7 @@ struct OrtDmlApi { /** * SessionOptionsAppendExecutionProvider_DML2 * Creates a DirectML Execution Provider given the supplied device options that contain a performance preference - * (high power, low power, or defult) and a device filter (None, GPU, or NPU). + * (high power, low power, or default) and a device filter (None, GPU, or NPU). 
*/ ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_DML2, _In_ OrtSessionOptions* options, OrtDmlDeviceOptions* device_opts); }; diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 37545f41b43dd..831def24e4f5e 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -67,6 +67,16 @@ static const char* const kOrtSessionOptionsEnableQuantQDQCleanup = "session.enab // GeluApproximation has side effects which may change the inference results. It is disabled by default due to this. static const char* const kOrtSessionOptionsEnableGeluApproximation = "optimization.enable_gelu_approximation"; +// This setting controls whether to enable AheadOfTime function inlining. +// AOT function inlining examines the graph and attempts to inline as many locally defined functions in the model +// as possible with the help of enabled execution providers. +// This can reduce the number of function calls and improve performance because it is done before +// Level1 optimizers and constant folding. However, under some circumstances, when the EPs are not available, +// one can disable the AOT inlining, produce an optimized model and postpone AOT until run time. +// "0": enable; "1": disable. +// Its default value is "0". +static const char* const kOrtSessionOptionsDisableAheadOfTimeFunctionInlining = "session.disable_aot_function_inlining"; + #ifdef ENABLE_TRAINING // Specifies a list of op types for memory footprint reduction. // The value should be a ","-delimited list of pair of diff --git a/js/web/docs/webgl-operators.md b/js/web/docs/webgl-operators.md index de84134ddbb3f..7c129b66bfa3d 100644 --- a/js/web/docs/webgl-operators.md +++ b/js/web/docs/webgl-operators.md @@ -12,6 +12,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [Acos](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Acos) | [7+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Acos-7) | | [Acosh](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Acosh) | | | [Add](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Add) | [7-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Add-7), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Add-13), [14+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Add-14) | +| [AffineGrid](https://github.com/onnx/onnx/blob/main/docs/Operators.md#AffineGrid) | | | [And](https://github.com/onnx/onnx/blob/main/docs/Operators.md#And) | [7+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#And-7) | | [ArgMax](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ArgMax) | | | [ArgMin](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ArgMin) | | @@ -67,6 +68,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [Gather](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gather) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gather-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gather-11), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gather-13) | | [GatherElements](https://github.com/onnx/onnx/blob/main/docs/Operators.md#GatherElements) | | | [GatherND](https://github.com/onnx/onnx/blob/main/docs/Operators.md#GatherND) | | +| 
[Gelu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gelu) | | | [Gemm](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gemm) | [7-8](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gemm-7), [9-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gemm-9), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gemm-11), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Gemm-13) | | [GlobalAveragePool](https://github.com/onnx/onnx/blob/main/docs/Operators.md#GlobalAveragePool) | [1+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#GlobalAveragePool-1) | | [GlobalLpPool](https://github.com/onnx/onnx/blob/main/docs/Operators.md#GlobalLpPool) | | @@ -82,6 +84,7 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [Hardmax](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Hardmax) | | | [Identity](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity) | [1-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-1), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-13), [14-15](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-14), [16-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-16), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Identity-19) | | [If](https://github.com/onnx/onnx/blob/main/docs/Operators.md#If) | | +| [ImageDecoder](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ImageDecoder) | | | [InstanceNormalization](https://github.com/onnx/onnx/blob/main/docs/Operators.md#InstanceNormalization) | [6+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#InstanceNormalization-6) | | [IsInf](https://github.com/onnx/onnx/blob/main/docs/Operators.md#IsInf) | | | [IsNaN](https://github.com/onnx/onnx/blob/main/docs/Operators.md#IsNaN) | | @@ -137,12 +140,13 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [ReduceL2](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceL2) | | | [ReduceLogSum](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceLogSum) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceLogSum-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceLogSum-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceLogSum-13), [18+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceLogSum-18) | | [ReduceLogSumExp](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceLogSumExp) | | -| [ReduceMax](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMax) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-1), [11](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-11), [12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-12), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-13), [18+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-18) | +| [ReduceMax](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMax) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-1), [11](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-11), [12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-12), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-13), [18-19](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-18), 
[20+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMax-20) | | [ReduceMean](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMean) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMean-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMean-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMean-13), [18+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMean-18) | -| [ReduceMin](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMin) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-1), [11](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-11), [12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-12), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-13), [18+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-18) | +| [ReduceMin](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMin) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-1), [11](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-11), [12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-12), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-13), [18-19](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-18), [20+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceMin-20) | | [ReduceProd](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceProd) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceProd-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceProd-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceProd-13), [18+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceProd-18) | | [ReduceSum](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceSum) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSum-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSum-11) | | [ReduceSumSquare](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceSumSquare) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-13), [18+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#ReduceSumSquare-18) | +| [RegexFullMatch](https://github.com/onnx/onnx/blob/main/docs/Operators.md#RegexFullMatch) | | | [Relu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Relu) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Relu-6), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Relu-13), [14+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Relu-14) | | [Reshape](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Reshape) | [5-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-5), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-13), [14-18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-14), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Reshape-19) | | [Resize](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize) | [10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-10), 
[11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-11), [13-17](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-13), [18](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-18), [19+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-19) | @@ -179,7 +183,9 @@ See [Compatibility](../README.md#Compatibility) for a list of the supported plat | [SplitToSequence](https://github.com/onnx/onnx/blob/main/docs/Operators.md#SplitToSequence) | | | [Sqrt](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sqrt) | [6-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sqrt-6), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sqrt-13) | | [Squeeze](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Squeeze) | [1-10](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-1), [11-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-11), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Squeeze-13) | +| [StringConcat](https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringConcat) | | | [StringNormalizer](https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringNormalizer) | | +| [StringSplit](https://github.com/onnx/onnx/blob/main/docs/Operators.md#StringSplit) | | | [Sub](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sub) | [7-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sub-7), [13](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sub-13), [14+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sub-14) | | [Sum](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sum) | [6-7](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sum-6), [8-12](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sum-8), [13+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Sum-13) | | [Tan](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Tan) | [7+](https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Tan-7) | diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index 0a64d1ad1792a..1d3fc78fe368a 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -803,3 +803,6 @@ export const getBroadcastDims = (inShape: readonly number[], outShape: readonly } return dims; }; + +// TODO: remove this limitation once >4D dims are supported by uniform. +export const enableShapesUniforms = (rank: number): boolean => rank <= 4; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts index d241b8b92a669..e880afe09a5d8 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts @@ -232,7 +232,7 @@ const convTranspose2d = // STEP.1: transpose weight const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? context.compute( - createTransposeProgramInfo(inputs[1].dataType, inputs[1].dims.length, weightTransposePerm), + createTransposeProgramInfo(inputs[1], weightTransposePerm), {inputs: [1], outputs: [attributes.wIsConst ? 
-2 : -1]})[0]; if (attributes.wIsConst && !context.kernelCustomData.wT) { context.kernelCustomData.wT = transposedWeight; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index b323a36cee5c8..c7ea0cffe51c3 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -168,7 +168,7 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut if (isChannelsLast) { const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? context.compute( - createTransposeProgramInfo(inputs[1].dataType, inputs[1].dims.length, weightTransposeAttribute), + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; if (attributes.wIsConst && !context.kernelCustomData.wT) { context.kernelCustomData.wT = transposedWeight; @@ -208,7 +208,7 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut // STEP.1: transpose weight const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? context.compute( - createTransposeProgramInfo(inputs[1].dataType, inputs[1].dims.length, weightTransposeAttribute), + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; if (attributes.wIsConst && !context.kernelCustomData.wT) { context.kernelCustomData.wT = transposedWeight; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 05f02b07c4d89..1538644412afd 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -18,16 +18,18 @@ const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs || inputs.length !== 1) { throw new Error('Pool ops requires 1 input.'); } - if (inputs[0].dims.length !== 4) { - throw new Error('Pool ops supports 2-D inputs only for now.'); + if (inputs[0].dims.length !== 4 && inputs[0].dims.length !== 3) { + throw new Error('Pool ops supports 1-D or 2-D inputs only for now.'); } }; const getAdjustedPoolAttributesAndOutputShape = ( input: TensorView, attributes: AttributeType, isGlobalOperator: boolean): [AttributeType, number[]] => { const isChannelsLast = attributes.format === 'NHWC'; - const inputShapeAsChannelFirst = - isChannelsLast ? [input.dims[0], input.dims[3], input.dims[1], input.dims[2]] : input.dims.slice(); + const inputShapeAsChannelFirst = input.dims.slice(); + if (isChannelsLast) { + inputShapeAsChannelFirst.splice(1, 0, inputShapeAsChannelFirst.pop()!); // Move channel to the second position. 
+ } const hasDilations = Object.hasOwnProperty.call(attributes, 'dilations'); const kernelShape = attributes.kernelShape.slice(); const strides = attributes.strides.slice(); @@ -44,15 +46,9 @@ const getAdjustedPoolAttributesAndOutputShape = ( @@ -76,22 +72,22 @@ const generatePoolingCode = = ${inputDims[dimIdxW]}) { - pad++; - continue; - } - let x_val = x[${x.indicesToOffset('xIndices')}]; - ${op1} - }`; + for (var i: u32 = 0u; i < ${kw}u; i++) { + xIndices[${dimIdxW}] = indices[${dimIdxW}] * ${sw} - ${pwStart} + i; + if (xIndices[${dimIdxW}] < 0 || xIndices[${dimIdxW}] >= ${inputDims[dimIdxW]}) { + pad++; + continue; + } + let x_val = x[${x.indicesToOffset('xIndices')}]; + ${op1} + }`; } else { codeW = ` - for (var i: u32 = 0u; i < ${kw}u; i++) { - xIndices[${dimIdxW}] = indices[${dimIdxW}] * ${sw} - ${pwStart} + i; - let x_val = x[${x.indicesToOffset('xIndices')}]; - ${op1} - }`; + for (var i: u32 = 0u; i < ${kw}u; i++) { + xIndices[${dimIdxW}] = indices[${dimIdxW}] * ${sw} - ${pwStart} + i; + let x_val = x[${x.indicesToOffset('xIndices')}]; + ${op1} + }`; } if (attributes.kernelShape.length === 2) { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index fe556a7fd8552..c4d43e9f466f5 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -6,7 +6,7 @@ import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, ProgramInfo} from '../types'; -import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, enableShapesUniforms, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; export interface TransposeAttributes extends AttributeWithCacheKey { readonly perm: number[]; @@ -35,13 +35,18 @@ const permFunctionBody = (perm: number[], rank: number, input: IndicesHelper, ou return reverseFunc.join('\n'); }; -export const createTransposeProgramInfo = - (inputDataType: number, inputRank: number, permAttr: number[]): ProgramInfo => { - const perm = getAdjustedPerm(inputRank, permAttr); - const output = outputVariable('output', inputDataType, (permAttr && permAttr.length) || inputRank); - const input = inputVariable('a', inputDataType, inputRank); +export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: number[]): ProgramInfo => { + const inputDataType = inputTensor.dataType; + const inputRank = inputTensor.dims.length; + const perm = getAdjustedPerm(inputRank, permAttr); + const useShapesUniforms = enableShapesUniforms(inputRank); + const outputShape = getOutputShape(inputTensor.dims, perm); + const outShapeOrRank = useShapesUniforms ? outputShape.length : outputShape; + const inShapeOrRank = useShapesUniforms ? 
inputRank : inputTensor.dims; + const output = outputVariable('output', inputDataType, outShapeOrRank); + const input = inputVariable('a', inputDataType, inShapeOrRank); - const getShaderSource = (shaderHelper: ShaderHelper) => ` + const getShaderSource = (shaderHelper: ShaderHelper) => ` ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)} ${permFunctionBody(perm, inputRank, input, output)} @@ -54,30 +59,32 @@ export const createTransposeProgramInfo = ${output.setByOffset('global_idx', input.getByIndices('aIndices'))} }`; + return { + name: 'Transpose', + shaderCache: {hint: `${permAttr}`, inputDependencies: useShapesUniforms ? ['rank'] : ['dims']}, + getRunData: (inputs) => { + const outputSize = ShapeUtil.size(outputShape); return { - name: 'Transpose', - shaderCache: {hint: `${permAttr}`, inputDependencies: ['rank']}, - getRunData: (inputs) => { - const outputShape = getOutputShape(inputs[0].dims, perm); - const outputSize = ShapeUtil.size(outputShape); - return { - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, - programUniforms: [ + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms: useShapesUniforms ? + [ {type: 'uint32', data: outputSize}, ...createTensorShapeVariables(inputs[0].dims), ...createTensorShapeVariables(outputShape), + ] : + [ + {type: 'uint32', data: outputSize}, ], - }; - }, - getShaderSource, }; - }; + }, + getShaderSource, + }; +}; export const transpose = (context: ComputeContext, attributes: TransposeAttributes): void => { validateInputs(context.inputs); - context.compute( - createTransposeProgramInfo(context.inputs[0].dataType, context.inputs[0].dims.length, attributes.perm)); + context.compute(createTransposeProgramInfo(context.inputs[0], attributes.perm)); }; export const parseTransposeAttributes = (attributes: Record): TransposeAttributes => diff --git a/js/web/test/data/ops/transpose.jsonc b/js/web/test/data/ops/transpose.jsonc index 285d14018e74d..e1edfa7e41513 100644 --- a/js/web/test/data/ops/transpose.jsonc +++ b/js/web/test/data/ops/transpose.jsonc @@ -166,5 +166,29 @@ ] } ] + }, + { + "name": "Transpose 5D - perms:[4, 3, 1, 0, 2]", + "operator": "Transpose", + "attributes": [{ "name": "perm", "data": [4, 3, 1, 0, 2], "type": "ints" }], + "cases": [ + { + "name": "T[3, 1, 2, 1, 4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], + "dims": [3, 1, 2, 1, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 5, 9, 13, 17, 21, 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23, 4, 8, 12, 16, 20, 24], + "dims": [4, 1, 1, 3, 2], + "type": "float32" + } + ] + } + ] } ] diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc index 0b55cb7804c61..694c40bf3eda6 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc @@ -16,7 +16,6 @@ #include #include -#include using onnxruntime::concurrency::ThreadPool; diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h index 73b83057bdbe9..00e82c9844b3d 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h @@ -206,6 +206,7 
@@ Status CheckInputs(const T* query, } } + int total_sequence_length = past_sequence_length + kv_sequence_length; AttentionMaskType mask_type = AttentionMaskType::MASK_NONE; if (key_padding_mask != nullptr) { mask_type = AttentionMaskType::MASK_UNKNOWN; @@ -216,13 +217,21 @@ Status CheckInputs(const T* query, } else if (mask_dims[0] == static_cast(3) * static_cast(batch_size) + static_cast(2)) { mask_type = AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START; } - } else if (mask_dims.size() == 2 && mask_dims[0] == static_cast(batch_size) && mask_dims[1] == static_cast(kv_sequence_length)) { + } else if (mask_dims.size() == 2 && mask_dims[0] == static_cast(batch_size) && + mask_dims[1] == static_cast(kv_sequence_length)) { + mask_type = AttentionMaskType::MASK_2D_KEY_PADDING; + } else if (mask_dims.size() == 2 && mask_dims[0] == static_cast(batch_size) && + mask_dims[1] == static_cast(total_sequence_length)) { mask_type = AttentionMaskType::MASK_2D_KEY_PADDING; + } else if (mask_dims.size() == 3 && mask_dims[0] == static_cast(batch_size) && + mask_dims[1] == static_cast(sequence_length) && + mask_dims[2] == static_cast(total_sequence_length)) { + mask_type = AttentionMaskType::MASK_3D_ATTENTION; } if (mask_type == AttentionMaskType::MASK_UNKNOWN) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'key_padding_mask' shape shall be (batch_size) or (batch_size, kv_sequence_length)"); + "Input 'key_padding_mask' shape shall be 1D, 2D, or 3D"); } } @@ -257,7 +266,6 @@ Status CheckInputs(const T* query, } } - int total_sequence_length = past_sequence_length + kv_sequence_length; bool broadcast_res_pos_bias = false; if (relative_position_bias != nullptr) { const auto& relative_position_bias_dims = relative_position_bias->Shape().GetDims(); diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc new file mode 100644 index 0000000000000..4a266af789250 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.cc @@ -0,0 +1,115 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "contrib_ops/cpu/bert/rotary_embedding.h" +#include "contrib_ops/cpu/bert/rotary_embedding_helper.h" + +#include "core/platform/threadpool.h" + +using onnxruntime::concurrency::ThreadPool; +using namespace onnxruntime::contrib::rotary_embedding_helper; + +namespace onnxruntime { +namespace contrib { + +// These ops are internal-only, so register outside of onnx +ONNX_OPERATOR_TYPED_KERNEL_EX( + RotaryEmbedding, + kMSDomain, + 1, + float, + kCpuExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("M", DataTypeImpl::GetTensorType()), + RotaryEmbedding); + +template +RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : OpKernel(info) { + scale = info.GetAttrOrDefault("scale", 1.0); + interleaved = (info.GetAttrOrDefault("interleaved", 0) == 1); +} + +template +Status RotaryEmbedding::Compute(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const Tensor* position_ids = context->Input(1); + const Tensor* cos_cache = context->Input(2); + const Tensor* sin_cache = context->Input(3); + + RotaryParameters parameters = {}; + ORT_RETURN_IF_ERROR(rotary_embedding_helper::CheckInputs(input, + position_ids, + cos_cache, + sin_cache, + ¶meters)); + + Tensor* output = context->Output(0, input->Shape()); + + if (parameters.sequence_length > parameters.max_sequence_length) { + // Launch update_cos_sin_cache kernel with scale + ORT_NOT_IMPLEMENTED("Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported"); + } + + const T* input_src = input->Data(); + const int64_t* pos_ids_data = position_ids->Data(); + const T* cos_cache_data = cos_cache->Data(); + const T* sin_cache_data = sin_cache->Data(); + T* output_dest = output->MutableData(); + + const int batch_size = parameters.batch_size; + const int sequence_length = parameters.sequence_length; + const int num_heads = parameters.num_heads; + const int head_size = parameters.head_size; + const int position_ids_format = parameters.position_ids_format; + const int half_head_size = head_size / 2; + + AllocatorPtr allocator; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); + auto* tp = context->GetOperatorThreadPool(); + + const int loop_len = batch_size * sequence_length * num_heads; + const double cost = static_cast(head_size); + ThreadPool::TryParallelFor(tp, loop_len, cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + for (std::ptrdiff_t ptr = begin; ptr != end; ++ptr) { + const int b = static_cast((ptr / num_heads) / sequence_length); + const int s = static_cast((ptr / num_heads) % sequence_length); + const int n = static_cast(ptr % num_heads); + + const int block_offset = b * sequence_length * num_heads + s * num_heads + n; + const int data_offset = block_offset * head_size; + + const T* input_data = input_src + data_offset; + T* output_data = output_dest + data_offset; + + // Cache is (M, H/2) + const int position_id = (position_ids_format == 0) + ? static_cast(pos_ids_data[0]) + s + : static_cast(pos_ids_data[b * sequence_length + s]); + const int cache_offset = position_id * half_head_size; + const T* cos_data = cos_cache_data + cache_offset; + const T* sin_data = sin_cache_data + cache_offset; + + int cache_idx = 0; + T sign = 0; + int j = 0; + for (int i = 0; i < head_size; i++) { + if (interleaved) { + cache_idx = (i / 2) % half_head_size; + sign = (i % 2 == 0) ? static_cast(-1) : static_cast(1); + j = (i % 2 == 0) ? 
i + 1 : i - 1; // i - sign + } else { + cache_idx = i % half_head_size; + sign = (i < half_head_size) ? static_cast(-1) : static_cast(1); + j = (i + half_head_size) % head_size; + } + output_data[i] = input_data[i] * cos_data[cache_idx] + sign * input_data[j] * sin_data[cache_idx]; + } + } + }); + + return Status::OK(); +} + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h new file mode 100644 index 0000000000000..be834a66cdc69 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/framework/op_kernel.h" + +namespace onnxruntime { +namespace contrib { + +template +class RotaryEmbedding final : public OpKernel { + public: + RotaryEmbedding(const OpKernelInfo& info); + Status Compute(OpKernelContext* context) const override; + + protected: + float scale; + bool interleaved; +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h new file mode 100644 index 0000000000000..cf8080800e072 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/bert/rotary_embedding_helper.h @@ -0,0 +1,121 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/providers/common.h" + +namespace onnxruntime { +namespace contrib { +namespace rotary_embedding_helper { + +// Parameters deduced from node attributes and inputs/outputs. 
+struct RotaryParameters { + int batch_size; // Batch size used by input + int sequence_length; // Sequence length used by input + int hidden_size; // Hidden size used by input + int head_size; // Head size used by cos/sin cache * 2 + int num_heads; // num_heads = hidden_size / head_size + int max_sequence_length; // Sequence length used by cos/sin cache + int position_ids_format; // Format of position ids - 0 is (1), 1 is (batch_size, sequence_length) +}; + +template +Status CheckInputs(const T* input, + const T* position_ids, + const T* cos_cache, + const T* sin_cache, + void* parameters) { + // input : (batch_size, sequence_length, hidden_size) + // position ids : (1) or (batch_size, sequence_length) + // cos cache : (max_sequence_length, head_size / 2) + // sin cache : (max_sequence_length, head_size / 2) + + // Check input + const auto& input_dims = input->Shape().GetDims(); + if (input_dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'x' is expected to have 3 dimensions, got ", + input_dims.size()); + } + // Check position_ids + const auto& position_ids_dims = position_ids->Shape().GetDims(); + if (!onnxruntime::IsScalarOr1ElementVector(position_ids) && position_ids_dims.size() != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'position_ids' is expected to have 0, 1, or 2 ", + "dimensions, got ", position_ids_dims.size()); + } + // Check cos_cache and sin_cache + const auto& cos_cache_dims = cos_cache->Shape().GetDims(); + if (cos_cache_dims.size() != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'cos_cache' is expected to have 2 dimensions, got ", + cos_cache_dims.size()); + } + const auto& sin_cache_dims = sin_cache->Shape().GetDims(); + if (sin_cache_dims.size() != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'sin_cache' is expected to have 2 dimensions, got ", + sin_cache_dims.size()); + } + if (cos_cache_dims[0] != sin_cache_dims[0] || cos_cache_dims[1] != sin_cache_dims[1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'cos_cache' and 'sin_cache' are expected to have ", + "the same shape"); + } + + // Get attributes from inputs + int batch_size = static_cast(input_dims[0]); + int sequence_length = static_cast(input_dims[1]); + int hidden_size = static_cast(input_dims[2]); + int max_sequence_length = static_cast(cos_cache_dims[0]); + int head_size = static_cast(cos_cache_dims[1]) * 2; + int num_heads = hidden_size / head_size; + int position_ids_format = -1; + + // Check position_ids input shapes + if (!onnxruntime::IsScalarOr1ElementVector(position_ids)) { + if (batch_size != static_cast(position_ids_dims[0])) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'position_ids' dimension 0 should be of size ", + "batch_size, got ", position_ids_dims[0]); + } + if (sequence_length != static_cast(position_ids_dims[1])) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'position_ids' dimension 1 should be of size ", + "sequence_length, got ", position_ids_dims[1]); + } + position_ids_format = 1; + } else { + position_ids_format = 0; + } + // Check cos_cache input shapes + if (max_sequence_length != static_cast(cos_cache_dims[0])) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'cos_cache' dimension 0 should be same as ", + "max_sequence_length, got ", cos_cache_dims[0]); + } + if ((head_size / 2) != static_cast(cos_cache_dims[1])) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'cos_cache' dimension 1 
should be same as ", + "head_size / 2, got ", cos_cache_dims[1]); + } + // Check sin_cache input shapes + if (max_sequence_length != static_cast(sin_cache_dims[0])) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'sin_cache' dimension 0 should be same as ", + "max_sequence_length, got ", sin_cache_dims[0]); + } + if ((head_size / 2) != static_cast(sin_cache_dims[1])) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'sin_cache' dimension 1 should be same as ", + "head_size / 2, got ", sin_cache_dims[1]); + } + + // Set rotary parameters + if (parameters != nullptr) { + RotaryParameters* output_parameters = reinterpret_cast(parameters); + output_parameters->batch_size = batch_size; + output_parameters->sequence_length = sequence_length; + output_parameters->hidden_size = hidden_size; + output_parameters->head_size = head_size; + output_parameters->num_heads = num_heads; + output_parameters->max_sequence_length = max_sequence_length; + output_parameters->position_ids_format = position_ids_format; + } + + return Status::OK(); +} + +} // namespace rotary_embedding_helper +} // namespace contrib +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index b4c51ab290eb7..f9d9b13f0fedc 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -20,6 +20,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MultiHeadAttention); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, RotaryEmbedding); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, string, Tokenizer); @@ -29,6 +30,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gathe class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, TransposeMatMul); // backward compatibility class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, FusedMatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulNBits); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulBnb4); #ifndef ORT_MINIMAL_BUILD class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MatMulFpQ4); #endif @@ -124,6 +126,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, SimplifiedLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipSimplifiedLayerNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, 
SkipSimplifiedLayerNormalization); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Inverse); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Trilu); @@ -253,6 +257,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -266,6 +271,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // backward compatibility BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifndef ORT_MINIMAL_BUILD BuildKernelCreateInfo, #endif @@ -299,6 +305,8 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h b/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h new file mode 100644 index 0000000000000..cb8e97a592d8c --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h @@ -0,0 +1,202 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +namespace onnxruntime { +namespace contrib { + +#if defined(_MSC_VER) +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE __attribute__((always_inline)) inline +#endif + +typedef enum Bnb_DataType_t { + FP4 = 0, + NF4 = 1, +} Bnb_DataType_t; + +FORCEINLINE uint8_t QuantizeOneFP4(float x) { + // FP4 with bias of 3 + // first bit is a sign + // subnormals + // 0b000 = 0 + // 0b001 = 0.0625 + // 0b110 = 2 + // 0b111 = 3 + // 0b100 = 4 + // 0b101 = 6 + // 0b010 = 8 + // 0b011 = 12 + + // we do a binary search + // the pivots are divided by 12 (the FP4 absmax) + // since we assume input data is in [-1.0, 1.0] + + // !be careful here, it's easy to make a mistake + // that is difficult to notice if you add an extra + // zero somewhere! + + uint8_t sign = x < 0 ?
0b1000 : 0b0000; + x = fabsf(x); + if (x > 0.29166667f) { + if (x > 0.583333f) { + if (x > 0.8333333f) { + return 0b0011 + sign; + } else { + return 0b0010 + sign; + } + } else if (x > 0.4166667f) { + return 0b101 + sign; + } else { + return 0b100 + sign; + } + } else if (x > 0.0859375f) { + if (x > 0.20833333f) { + return 0b0111 + sign; + } else { + return 0b0110 + sign; + } + } else if (x > 0.00260417f) { + return 0b0001 + sign; + } else { + return 0b0000 + sign; + } +} + +FORCEINLINE uint8_t QuantizeOneNF4(float x) { + if (x > 0.03979014977812767f) { + if (x > 0.3893125355243683f) { // 1 + if (x > 0.6427869200706482f) { // 11 + if (x > 0.8614784181118011f) { // 111 + return 0b1111; + } else { + return 0b1110; + } + } else if (x > 0.5016634166240692f) { // 110 + return 0b1101; + } else { + return 0b1100; + } + } else if (x > 0.2035212516784668f) { // 10 + if (x > 0.2920137718319893f) { // 101 + return 0b1011; + } else { + return 0b1010; + } + } else if (x > 0.1202552504837513f) { // 100 + return 0b1001; + } else { + return 0b1000; + } + } else if (x > -0.33967943489551544f) { // 0 + if (x > -0.13791173323988914f) { // 01 + if (x > -0.045525018125772476f) { // 011 + return 0b0111; + } else { + return 0b0110; + } + } else if (x > -0.23460740596055984f) { // 010 + return 0b0101; + } else { + return 0b0100; + } + } else if (x > -0.6106329262256622f) { // 00 + if (x > -0.4599952697753906f) { // 001 + return 0b0011; + } else { + return 0b0010; + } + } else if (x > -0.8480964004993439f) { // 000 + return 0b0001; + } else { + return 0b0000; + } +} + +template +FORCEINLINE uint8_t QuantizeOneBnb4(float x) { + if constexpr (DATA_TYPE == FP4) + return QuantizeOneFP4(x); + else + return QuantizeOneNF4(x); +} + +template +FORCEINLINE void QuantizeBlockBnb4(const T* src, uint8_t* dst, T& absmax_block, int32_t block_idx, int32_t numel) { + float local_absmax = 0.0f; + + int32_t block_len = std::min(block_size, numel - block_idx * block_size); + int32_t src_offset = block_idx * block_size; + int32_t dst_offset = block_idx * block_size / 2; + + for (int32_t idx = 0; idx < block_len; idx++) { + const float v = static_cast(src[src_offset + idx]); + local_absmax = fmaxf(local_absmax, fabsf(v)); + } + + absmax_block = static_cast(local_absmax); + const float reciprocal_absmax = local_absmax ? 1.0f / local_absmax : 0.0f; + + for (int32_t idx = 0; idx < block_len; idx += 2) { + const float v0 = static_cast(src[src_offset + idx]) * reciprocal_absmax; + const uint8_t vi0 = QuantizeOneBnb4(v0); + + const float v1 = (idx + 1 < block_len) ? 
static_cast(src[src_offset + idx + 1]) * reciprocal_absmax : 0; + const uint8_t vi1 = QuantizeOneBnb4(v1); + + dst[dst_offset + idx / 2] = (vi0 << 4) | vi1; + } +} + +static float fp4_qaunt_map[16] = {0.00000000f, 5.208333333e-03f, 0.66666667f, 1.00000000f, + 0.33333333f, 0.50000000f, 0.16666667f, 0.25000000f, + -0.00000000f, -5.208333333e-03f, -0.66666667f, -1.00000000f, + -0.33333333f, -0.50000000f, -0.16666667f, -0.25000000f}; + +static float nf4_qaunt_map[16] = {-1.0f, + -0.6961928009986877f, + -0.5250730514526367f, + -0.39491748809814453f, + -0.28444138169288635f, + -0.18477343022823334f, + -0.09105003625154495f, + 0.0f, + 0.07958029955625534f, + 0.16093020141124725f, + 0.24611230194568634f, + 0.33791524171829224f, + 0.44070982933044434f, + 0.5626170039176941f, + 0.7229568362236023f, + 1.0f}; + +template +FORCEINLINE T DequantizeOneBnb4(uint8_t x) { + if constexpr (DATA_TYPE == FP4) + return static_cast(fp4_qaunt_map[x]); + else + return static_cast(nf4_qaunt_map[x]); +} + +template +FORCEINLINE void DequantizeBlockBnb4(const uint8_t* src, T* dst, T absmax_block, int32_t block_idx, int32_t numel) { + int32_t block_len = std::min(block_size, numel - block_idx * block_size); + int32_t src_offset = block_idx * block_size / 2; + int32_t dst_offset = block_idx * block_size; + + for (int32_t idx = 0; idx < block_len; idx += 2) { + const uint8_t val = src[src_offset + idx / 2]; + + dst[dst_offset + idx] = DequantizeOneBnb4(val >> 4) * absmax_block; + if (idx + 1 < block_len) dst[dst_offset + idx + 1] = DequantizeOneBnb4(val & 0xF) * absmax_block; + } +} + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h b/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h new file mode 100644 index 0000000000000..5ddb77e5b5ee3 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
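For reference, the blockwise 4-bit scheme defined in blockwise_quant_block_bnb4.h above boils down to: take the absolute maximum of each block as the scale, map each scaled value to one of 16 codebook entries, pack two 4-bit codes per byte, and dequantize as map[code] * absmax. The sketch below is illustrative only and uses a conceptually equivalent nearest-entry search instead of the hard-coded pivots; RoundTripBlock and quant_map are hypothetical names for the example.

// Rough round-trip sketch of the blockwise absmax 4-bit scheme (illustrative only).
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<float> RoundTripBlock(const std::vector<float>& block,
                                  const std::vector<float>& quant_map /* 16 entries */) {
  // 1) per-block scale = absmax
  float absmax = 0.0f;
  for (float v : block) absmax = std::fmax(absmax, std::fabs(v));
  const float inv = absmax != 0.0f ? 1.0f / absmax : 0.0f;

  // 2) quantize: nearest entry of the 16-value map for v / absmax
  std::vector<uint8_t> codes;
  for (float v : block) {
    const float x = v * inv;
    uint8_t best = 0;
    for (uint8_t c = 1; c < 16; ++c)
      if (std::fabs(quant_map[c] - x) < std::fabs(quant_map[best] - x)) best = c;
    codes.push_back(best);
  }

  // 3) dequantize: codebook value scaled back by absmax
  std::vector<float> out;
  for (uint8_t c : codes) out.push_back(quant_map[c] * absmax);
  return out;
}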
+ +#pragma once + +#include "blockwise_quant_block_bnb4.h" + +#include + +#include "core/common/safeint.h" +#include "core/framework/float16.h" +#include "core/platform/threadpool.h" +#include + +namespace onnxruntime { +namespace contrib { + +template +void QuantizeBlockwiseBnb4( + uint8_t* dst, // shape: [(N * K + 1) / 2] + const T* src, // shape: [N, K] + T* absmax, // shape: [(N * K + block_size - 1) / block_size] + int32_t N, + int32_t K, + onnxruntime::concurrency::ThreadPool* thread_pool) { + int32_t numel = N * K; + int32_t total_block_count = (numel + block_size - 1) / block_size; + + concurrency::ThreadPool::TryBatchParallelFor( + thread_pool, + total_block_count, + [&](ptrdiff_t block_idx) { + QuantizeBlockBnb4( + src, + dst, + absmax[block_idx], + static_cast(block_idx), + numel); + }, + 0); +} + +#define QuantizeBlockwiseBn4DataTyped(block_size, quant_type) \ + if (quant_type == FP4) \ + QuantizeBlockwiseBnb4(dst, src, absmax, N, K, thread_pool); \ + else \ + QuantizeBlockwiseBnb4(dst, src, absmax, N, K, thread_pool); + +template +void QuantizeBlockwiseBnb4( + uint8_t* dst, // shape: [(N * K + 1) / 2] + const T* src, // shape: [N, K] + T* absmax, // shape: [(N * K + block_size - 1) / block_size] + int32_t block_size, + int32_t quant_type, + int32_t N, + int32_t K, + onnxruntime::concurrency::ThreadPool* thread_pool) { + ORT_ENFORCE( + quant_type == FP4 || quant_type == NF4, + "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported."); + + if (block_size == 16) { + QuantizeBlockwiseBn4DataTyped(16, quant_type); + } else if (block_size == 32) { + QuantizeBlockwiseBn4DataTyped(32, quant_type); + } else if (block_size == 64) { + QuantizeBlockwiseBn4DataTyped(64, quant_type); + } else if (block_size == 128) { + QuantizeBlockwiseBn4DataTyped(128, quant_type); + } else if (block_size == 256) { + QuantizeBlockwiseBn4DataTyped(256, quant_type); + } else { + ORT_NOT_IMPLEMENTED("only block size 16, 32, 64, 128, 256 are supported."); + } +} + +#undef QuantizeBlockwiseBn4DataTyped + +template +void DequantizeBlockwiseBnb4( + T* dst, // shape: [N, K] + const uint8_t* src, // shape: [(N * K + 1) / 2)] + const T* absmax, // shape: [(N * K + block_size - 1) / block_size] + int32_t N, + int32_t K, + onnxruntime::concurrency::ThreadPool* thread_pool) { + int32_t numel = N * K; + int32_t total_block_count = (numel + block_size - 1) / block_size; + + concurrency::ThreadPool::TryBatchParallelFor( + thread_pool, + total_block_count, + [&](ptrdiff_t block_idx) { + DequantizeBlockBnb4( + src, + dst, + absmax[block_idx], + static_cast(block_idx), + numel); + }, + 0); +} + +#define DequantizeBlockwiseBn4DataTyped(block_size, quant_type) \ + if (quant_type == FP4) \ + DequantizeBlockwiseBnb4(dst, src, absmax, N, K, thread_pool); \ + else \ + DequantizeBlockwiseBnb4(dst, src, absmax, N, K, thread_pool); + +template +void DequantizeBlockwiseBnb4( + T* dst, // shape: [N, K] + const uint8_t* src, // shape: [(N * K + 1) / 2)] + const T* absmax, // shape: [(N * K + block_size - 1) / block_size] + int32_t block_size, + int32_t quant_type, + int32_t N, + int32_t K, + onnxruntime::concurrency::ThreadPool* thread_pool) { + ORT_ENFORCE( + quant_type == FP4 || quant_type == NF4, + "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported."); + + if (block_size == 16) { + DequantizeBlockwiseBn4DataTyped(16, quant_type); + } else if (block_size == 32) { + DequantizeBlockwiseBn4DataTyped(32, quant_type); + } else if (block_size == 64) { + DequantizeBlockwiseBn4DataTyped(64, quant_type); + } else if (block_size 
== 128) { + DequantizeBlockwiseBn4DataTyped(128, quant_type); + } else if (block_size == 256) { + DequantizeBlockwiseBn4DataTyped(256, quant_type); + } else { + ORT_NOT_IMPLEMENTED("only block size 16, 32, 64, 128, 256 are supported."); + } +} + +#undef DequantizeBlockwiseBn4DataTyped + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc new file mode 100644 index 0000000000000..2f3ede49c3650 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/safeint.h" +#include "core/framework/op_kernel.h" +#include "core/providers/cpu/math/matmul_helper.h" +#include "core/providers/common.h" +#include "dequantize_blockwise_bnb4.h" +#include "core/mlas/inc/mlas.h" + +namespace onnxruntime { +namespace contrib { + +class MatMulBnb4 final : public OpKernel { + public: + MatMulBnb4(const OpKernelInfo& info) : OpKernel(info) { + ORT_ENFORCE(Status::OK() == info.GetAttr("K", &K_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("quant_type", &quant_type_)); + ORT_ENFORCE( + quant_type_ == FP4 || quant_type_ == NF4, + "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported."); + } + + Status Compute(OpKernelContext* context) const override; + + private: + int64_t K_; + int64_t N_; + int64_t block_size_; + int64_t quant_type_; +}; + +Status MatMulBnb4::Compute(OpKernelContext* ctx) const { + concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool(); + + const Tensor* a = ctx->Input(0); + const Tensor* b_quant = ctx->Input(1); + const Tensor* absmax = ctx->Input(2); + + const float* a_data = a->Data(); + const uint8_t* b_quant_data = b_quant->Data(); + const float* absmax_data = absmax->Data(); + + AllocatorPtr allocator; + auto status = ctx->GetTempSpaceAllocator(&allocator); + ORT_RETURN_IF_ERROR(status); + auto tmp_b_data_ptr = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); + DequantizeBlockwiseBnb4( + tmp_b_data_ptr.get(), + b_quant_data, + absmax_data, + static_cast(block_size_), + static_cast(quant_type_), + static_cast(N_), + static_cast(K_), + thread_pool); + + constexpr bool transa = false; + constexpr bool transb = true; + TensorShape b_shape({N_, K_}); + MatMulComputeHelper helper; + ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape, transa, transb)); + + Tensor* y = ctx->Output(0, helper.OutputShape()); + + // Bail out early if the output is going to be empty + if (y->Shape().Size() == 0) return Status::OK(); + + auto* y_data = y->MutableData(); + + const size_t max_len = helper.OutputOffsets().size(); + const size_t M = static_cast(helper.M()); + const size_t N = static_cast(helper.N()); + const size_t K = static_cast(helper.K()); + const size_t lda = helper.Lda(transa); + const size_t ldb = helper.Ldb(transb); + + // TODO: implement with native kernel + std::vector data(max_len); + for (size_t i = 0; i < max_len; i++) { + data[i].BIsPacked = false; + data[i].A = a_data + helper.LeftOffsets()[i]; + data[i].lda = lda; + data[i].B = tmp_b_data_ptr.get() + helper.RightOffsets()[i]; + data[i].ldb = ldb; + data[i].C = y_data + helper.OutputOffsets()[i]; + data[i].ldc = N; + data[i].alpha = 1.f; + data[i].beta = 0.0f; + } + MlasGemmBatch(CblasNoTrans, CblasTrans, M, N, 
K, data.data(), max_len, thread_pool); + + return Status::OK(); +} + +ONNX_OPERATOR_KERNEL_EX( + MatMulBnb4, + kMSDomain, + 1, + kCpuExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + MatMulBnb4); + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index e86a12d9fb873..4e103c2556a7a 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -20,20 +20,29 @@ namespace contrib { kCpuExecutionProvider, \ KernelDefBuilder() \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - SkipLayerNorm); + SkipLayerNorm); \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + SkipSimplifiedLayerNormalization, \ + kMSDomain, \ + 1, \ + T, \ + kCpuExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + SkipLayerNorm); REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) -template -SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) +template +SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) : OpKernel(op_kernel_info) { ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); ORT_ENFORCE(epsilon_ >= 0); } -template -Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { +template +Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* input = p_ctx->Input(0); const Tensor* skip = p_ctx->Input(1); const Tensor* gamma = p_ctx->Input(2); @@ -102,10 +111,16 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { } mean = mean / hidden_size; - mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_); + if (simplified) { + mean_square = sqrt(mean_square / hidden_size + epsilon_); + } else { + mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_); + } for (int64_t h = 0; h < hidden_size; h++) { - if (nullptr == beta_data) { + if (simplified) { + p_output[h] = p_output[h] / mean_square * gamma_data[h]; + } else if (nullptr == beta_data) { p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h]; } else { p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h index 7723541cb6b18..69edf4609e340 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -10,7 +10,7 @@ namespace onnxruntime { namespace contrib { -template +template class SkipLayerNorm final : public OpKernel { public: SkipLayerNorm(const OpKernelInfo& op_kernel_info); diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc new file mode 100644 index 0000000000000..b4b5dac1fbe19 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.cc @@ -0,0 +1,84 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
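The skip_layer_norm.cc change above reuses the SkipLayerNorm kernel for SkipSimplifiedLayerNormalization: in the simplified path the mean is not subtracted and no beta is added, i.e. an RMS-style normalization applied after the skip connection. A minimal per-row sketch of that path follows; it is illustrative only (names are hypothetical, and the optional bias input is omitted).

// Minimal sketch of the simplified (RMS-style) skip layer norm path.
#include <cmath>
#include <vector>

void SkipSimplifiedLayerNormRow(const std::vector<float>& input,
                                const std::vector<float>& skip,
                                const std::vector<float>& gamma,
                                float epsilon,
                                std::vector<float>& output) {
  const size_t hidden_size = input.size();
  output.resize(hidden_size);
  double sum_sq = 0.0;
  for (size_t h = 0; h < hidden_size; ++h) {
    output[h] = input[h] + skip[h];  // skip connection (bias omitted in this sketch)
    sum_sq += static_cast<double>(output[h]) * output[h];
  }
  const float rms = std::sqrt(static_cast<float>(sum_sq / hidden_size) + epsilon);
  for (size_t h = 0; h < hidden_size; ++h) {
    output[h] = output[h] / rms * gamma[h];  // no mean subtraction, no beta
  }
}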
+ +#include "core/providers/cuda/cuda_common.h" +#include "contrib_ops/cpu/bert/rotary_embedding_helper.h" +#include "contrib_ops/cuda/bert/rotary_embedding.h" +#include "contrib_ops/cuda/bert/rotary_embedding_impl.h" + +using namespace onnxruntime::cuda; +using namespace ::onnxruntime::common; +using namespace ONNX_NAMESPACE; +using namespace onnxruntime::contrib::rotary_embedding_helper; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + RotaryEmbedding, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("M", DataTypeImpl::GetTensorType()), \ + RotaryEmbedding); + +REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(MLFloat16) + +template +RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : CudaKernel(info) { + scale = info.GetAttrOrDefault("scale", 1.0); + interleaved = (info.GetAttrOrDefault("interleaved", 0) == 1); +} + +template +Status RotaryEmbedding::ComputeInternal(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const Tensor* position_ids = context->Input(1); + const Tensor* cos_cache = context->Input(2); + const Tensor* sin_cache = context->Input(3); + + RotaryParameters parameters = {}; + ORT_RETURN_IF_ERROR(rotary_embedding_helper::CheckInputs(input, + position_ids, + cos_cache, + sin_cache, + ¶meters)); + + Tensor* output = context->Output(0, input->Shape()); + + if (parameters.sequence_length > parameters.max_sequence_length) { + // Launch update_cos_sin_cache kernel with scale + ORT_NOT_IMPLEMENTED("Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported"); + } + + // Launch rotary embedding kernel + typedef typename ToCudaType::MappedType CudaT; + auto& device_prop = GetDeviceProp(); + return LaunchRotaryEmbeddingKernel( + Stream(context), + reinterpret_cast(output->template MutableData()), + reinterpret_cast(input->template Data()), + position_ids->Data(), + reinterpret_cast(cos_cache->template Data()), + reinterpret_cast(sin_cache->template Data()), + parameters.batch_size, + parameters.sequence_length, + parameters.num_heads, + parameters.head_size, + parameters.max_sequence_length, + parameters.position_ids_format, + interleaved, + device_prop.maxThreadsPerBlock); + + return Status::OK(); +} + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h new file mode 100644 index 0000000000000..6dab2ad56749e --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding.h @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
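The CUDA RotaryEmbedding op above relies on rotary_embedding_helper::CheckInputs to derive its parameters from the tensor shapes: the caches hold head_size / 2 values per position, so head_size and num_heads fall out of the cache width and the input hidden size. The small sketch below restates that derivation; it is illustrative only and the helper name is hypothetical.

// Illustrative derivation of rotary parameters from shapes, mirroring CheckInputs.
#include <cstdint>
#include <cstdio>

void DeriveRotaryParams(int64_t hidden_size, int64_t cos_cache_dim0, int64_t cos_cache_dim1) {
  const int64_t head_size = cos_cache_dim1 * 2;    // cache stores head_size / 2 values per position
  const int64_t num_heads = hidden_size / head_size;
  const int64_t max_sequence_length = cos_cache_dim0;
  std::printf("head_size=%lld num_heads=%lld max_seq=%lld\n",
              static_cast<long long>(head_size), static_cast<long long>(num_heads),
              static_cast<long long>(max_sequence_length));
}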
+ +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +using namespace onnxruntime::cuda; + +template +class RotaryEmbedding final : public CudaKernel { + public: + RotaryEmbedding(const OpKernelInfo& info); + Status ComputeInternal(OpKernelContext* context) const override; + + protected: + float scale; + bool interleaved; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu new file mode 100644 index 0000000000000..c54e72dcfce13 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.cu @@ -0,0 +1,141 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT License. +*/ + +/* +Kernel implementation for rotary embeddings. +*/ + +#include +#include "core/providers/cuda/cu_inc/common.cuh" +#include "contrib_ops/cuda/bert/rotary_embedding_impl.h" + +using namespace onnxruntime::cuda; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +__global__ void RotaryEmbeddingBSNH(T* output, // BxSxNxH + const T* input, // BxSxNxH + const T* cos_cache, // Mx(H/2) + const T* sin_cache, // Mx(H/2) + const int64_t* position_ids, // (1) or BxS + const int sequence_length, + const int num_heads, + const int head_size, + const int position_ids_format, + const bool interleaved) { + // B = batch size, S = sequence length, N = num heads, H = head size, M = max sequence length + // Use .x in innermost loop to access global memory efficiently + + const int b = blockIdx.z; + const int s = blockIdx.y; + const int n = blockIdx.x; + + const int i = threadIdx.x; + + const int block_offset = b * sequence_length * num_heads + s * num_heads + n; + const int data_offset = block_offset * head_size; + + const T* input_data = input + data_offset; + T* output_data = output + data_offset; + + // Cache is (M, H/2) + const int half_head_size = head_size / 2; + const int position_id = (position_ids_format == 0) ? \ + static_cast(position_ids[0]) + s \ + : static_cast(position_ids[b * sequence_length + s]); + const int cache_offset = position_id * half_head_size; + const T* cos_data = cos_cache + cache_offset; + const T* sin_data = sin_cache + cache_offset; + + int cache_idx = 0; + T sign = 0; + int j = 0; + if (interleaved) { + cache_idx = (i / 2) % half_head_size; + sign = (i % 2 == 0) ? -1 : 1; + j = (i % 2 == 0) ? i+1 : i-1; // i - sign + } else { + cache_idx = i % half_head_size; + sign = (i < half_head_size) ? -1 : 1; + j = (i + half_head_size) % head_size; + } + output_data[i] = input_data[i] * cos_data[cache_idx] + sign * input_data[j] * sin_data[cache_idx]; +} + + +template +Status LaunchRotaryEmbeddingKernel( + cudaStream_t stream, + T* output, + const T* input, + const int64_t* position_ids, + const T* cos_cache, + const T* sin_cache, + const int batch_size, + const int sequence_length, + const int num_heads, + const int head_size, + const int max_sequence_length, + const int position_ids_format, + const bool interleaved, + const int max_threads_per_block) { + + constexpr int smem_size = 0; + const dim3 grid(num_heads, sequence_length, batch_size); + const dim3 block(head_size, 1, 1); + + // Note: Current implementation assumes head_size <= max_threads_per_block + // because head_size is currently large for LLaMA-2. 
For smaller head_size + // and num_heads values, we can create a block as `block(num_heads, head_size, 1)` + // instead. This will require kernel changes to support. + + assert(head_size <= max_threads_per_block); + RotaryEmbeddingBSNH<<>>( + output, input, cos_cache, sin_cache, position_ids, + sequence_length, num_heads, head_size, position_ids_format, interleaved + ); + + return CUDA_CALL(cudaGetLastError()); +} + +template Status LaunchRotaryEmbeddingKernel( + cudaStream_t stream, + float* output, + const float* input, + const int64_t* position_ids, + const float* cos_cache, + const float* sin_cache, + const int batch_size, + const int sequence_length, + const int num_heads, + const int head_size, + const int max_sequence_length, + const int position_ids_format, + const bool interleaved, + const int max_threads_per_block); + +template Status LaunchRotaryEmbeddingKernel( + cudaStream_t stream, + half* output, + const half* input, + const int64_t* position_ids, + const half* cos_cache, + const half* sin_cache, + const int batch_size, + const int sequence_length, + const int num_heads, + const int head_size, + const int max_sequence_length, + const int position_ids_format, + const bool interleaved, + const int max_threads_per_block); + + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h new file mode 100644 index 0000000000000..29ff48a8ad0fb --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/rotary_embedding_impl.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +Status LaunchRotaryEmbeddingKernel( + cudaStream_t stream, + T* output, + const T* input, + const int64_t* position_ids, + const T* cos_cache, + const T* sin_cache, + const int batch_size, + const int sequence_length, + const int num_heads, + const int head_size, + const int max_sequence_length, + const int position_ids_format, + const bool interleaved, + const int max_threads_per_block); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index 388197697950e..16d5a9fd7749e 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -99,6 +99,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, RotaryEmbedding); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, RotaryEmbedding); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Sampling); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ScaledTanh); class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ScaledTanh); @@ -124,6 +126,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Inverse); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulNBits); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulNBits); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulBnb4); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulBnb4); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Trilu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, UnfoldTensor); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, DynamicTimeWarping); @@ -265,6 +269,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -290,6 +296,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu new file mode 100644 index 0000000000000..e58723f0b31e1 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu @@ -0,0 +1,129 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
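The rotary embedding CUDA launch shown above maps one thread to one element and one block to one (batch, position, head) triple, which is why the kernel asserts head_size <= maxThreadsPerBlock. The host-side helper below restates that geometry; it is illustrative only and not part of this change.

// Illustrative check of the rotary kernel launch geometry (hypothetical helper, host-only).
#include <cstdio>

bool RotaryLaunchFits(int batch_size, int sequence_length, int num_heads,
                      int head_size, int max_threads_per_block) {
  // grid = (num_heads, sequence_length, batch_size), block = (head_size, 1, 1)
  const long long blocks = 1LL * num_heads * sequence_length * batch_size;
  std::printf("blocks=%lld, threads_per_block=%d\n", blocks, head_size);
  // The current kernel requires head_size <= maxThreadsPerBlock.
  return head_size <= max_threads_per_block;
}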
+ +#include +#include +#include "core/providers/cuda/cuda_common.h" +#include "contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h" +#include "dequantize_blockwise_bnb4.cuh" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +Status SetBnbQuantMap(int quant_type, T* quant_map_buffer, cudaStream_t stream) { + ORT_ENFORCE( + quant_type == FP4 || quant_type == NF4, + "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported."); + + T host_quant_map[16]; + switch (quant_type) { + case FP4: + for (int i = 0; i < 16; i++) host_quant_map[i] = static_cast(fp4_qaunt_map[i]); + break; + case NF4: + for (int i = 0; i < 16; i++) host_quant_map[i] = static_cast(nf4_qaunt_map[i]); + break; + } + CUDA_CALL_THROW(cudaMemcpyAsync(quant_map_buffer, host_quant_map, sizeof(T) * 16, cudaMemcpyHostToDevice, stream)); + + return Status::OK(); +} + +template Status SetBnbQuantMap(int quant_type, float* quant_map_buffer, cudaStream_t stream); + +template Status SetBnbQuantMap(int quant_type, half* quant_map_buffer, cudaStream_t stream); + +template +__global__ void kDequantizeBlockwise( + const T* quant_map, + T* output, + const uint8_t* quant_data, + const T* absmax, + const int block_size, + const int n) { + const int n_load = (gridDim.x * TILE_SIZE); + int valid_items_load = 0; + int valid_items_store = 0; + const int base_idx = (blockIdx.x * TILE_SIZE); + + T vals[NUM_PER_TH * 2]; + uint8_t qvals[NUM_PER_TH]; + T local_abs_max = T(0.0f); + + typedef cub::BlockLoad LoadChar; + typedef cub::BlockStore StoreT; + + __shared__ typename LoadChar::TempStorage loadchar; + __shared__ typename StoreT::TempStorage storet; + + for (unsigned int i = base_idx; i < n_load; i += gridDim.x * TILE_SIZE) { + valid_items_load = (n + 1) / 2 - i > TILE_SIZE ? TILE_SIZE : (n + 1) / 2 - i; + valid_items_store = n - i * 2 > TILE_SIZE * 2 ? 
TILE_SIZE * 2 : n - i * 2; + + local_abs_max = __ldg(&absmax[(i + threadIdx.x * NUM_PER_TH) / (block_size)]); + + __syncthreads(); + LoadChar(loadchar).Load(&(quant_data[i]), qvals, valid_items_load, 128); + + #pragma unroll NUM_PER_TH + for (int j = 0; j < NUM_PER_TH; j++) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 + vals[j * 2] = quant_map[qvals[j] >> 4] * local_abs_max; + vals[j * 2 + 1] = quant_map[qvals[j] & 0x0F] * local_abs_max; + #else + // half multiplication not supported + vals[j * 2] = static_cast(static_cast(quant_map[qvals[j] >> 4]) * static_cast(local_abs_max)); + vals[j * 2 + 1] = + static_cast(static_cast(quant_map[qvals[j] & 0x0F]) * static_cast(local_abs_max)); + #endif + } + + __syncthreads(); + StoreT(storet).Store(&(output[i * 2]), vals, valid_items_store); + } +} + +template +Status DequantizeBnb4( + const T* quant_map, + T* output, + const uint8_t* quant_data, + const T* absmax, + int block_size, + int numel, + cudaStream_t stream) { + int tile_size = 1024; + kDequantizeBlockwise<<<(numel + tile_size - 1) / tile_size, 64, 0, stream>>>( + quant_map, + output, + quant_data, + absmax, + block_size / 2, + numel); + + return Status::OK(); +} + +template Status DequantizeBnb4( + const float* quant_map, + float* output, + const uint8_t* quant_data, + const float* absmax, + int block_size, + int numel, + cudaStream_t stream); + +template Status DequantizeBnb4( + const half* quant_map, + half* output, + const uint8_t* quant_data, + const half *absmax, + int block_size, + int numel, + cudaStream_t stream); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cuh b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cuh new file mode 100644 index 0000000000000..4aef3ab699f9c --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cuh @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +Status SetBnbQuantMap(int quant_type, T* quant_map_buffer, cudaStream_t stream); + +template +Status DequantizeBnb4( + const T* quant_map, + T* output, + const uint8_t* quant_data, + const T* absmax, + int block_size, + int numel, + cudaStream_t stream); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc new file mode 100644 index 0000000000000..bd5b6e0a8a1ce --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc @@ -0,0 +1,144 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
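As noted in the shape comments of the Bnb4 helpers above, the quantized weight packs two 4-bit codes per byte and keeps one absmax scale per block. The buffer-size arithmetic is summarized in the illustrative helper below (hypothetical name, not part of this change).

// Illustrative size math for the Bnb4 buffers: [(N*K + 1) / 2] packed bytes and
// [(N*K + block_size - 1) / block_size] absmax entries.
#include <cstdint>
#include <cstdio>

void PrintBnb4BufferSizes(int64_t N, int64_t K, int64_t block_size) {
  const int64_t numel = N * K;
  const int64_t packed_bytes = (numel + 1) / 2;                        // quantized weight size
  const int64_t absmax_count = (numel + block_size - 1) / block_size;  // one scale per block
  std::printf("quant bytes: %lld, absmax entries: %lld\n",
              static_cast<long long>(packed_bytes), static_cast<long long>(absmax_count));
}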
+ +#include "core/common/safeint.h" +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/cuda/shared_inc/fpgeneric.h" +#include "core/providers/cpu/math/matmul_helper.h" +#include "contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h" +#include "matmul_bnb4.cuh" +#include "dequantize_blockwise_bnb4.cuh" + +namespace onnxruntime { +namespace contrib { +namespace cuda { +using namespace onnxruntime::cuda; + +template +class MatMulBnb4 final : public CudaKernel { + public: + MatMulBnb4(const OpKernelInfo& info) : CudaKernel(info) { + ORT_ENFORCE(Status::OK() == info.GetAttr("K", &K_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); + ORT_ENFORCE(Status::OK() == info.GetAttr("quant_type", &quant_type_)); + ORT_ENFORCE( + quant_type_ == FP4 || quant_type_ == NF4, + "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported."); + } + + Status ComputeInternal(OpKernelContext* context) const override; + + private: + int64_t K_; + int64_t N_; + int64_t block_size_; + int64_t quant_type_; +}; + +template +Status MatMulBnb4::ComputeInternal(OpKernelContext* ctx) const { + const Tensor* a = ctx->Input(0); + const Tensor* b_quant = ctx->Input(1); + const Tensor* absmax = ctx->Input(2); + + const auto* a_data = a->Data(); + const uint8_t* b_quant_data = b_quant->Data(); + const auto* absmax_data = absmax->Data(); + + typedef typename ToCudaType::MappedType CudaT; + + // TODO: find a better way to create the quant_map without using a buffer + // don't want to use malloc directly so asking from the caller + // can create a __device__ static array for float but doesn't work for half + IAllocatorUniquePtr quant_map_buffer = GetScratchBuffer(16, ctx->GetComputeStream()); + auto* quant_map_buffer_data = quant_map_buffer.get(); + ORT_RETURN_IF_ERROR(SetBnbQuantMap( + SafeInt(quant_type_), + reinterpret_cast(quant_map_buffer_data), + static_cast(ctx->GetComputeStream()->GetHandle()))); + + constexpr bool transa = false; + constexpr bool transb = true; + MatMulComputeHelper helper; + TensorShape b_shape({N_, K_}); + ORT_RETURN_IF_ERROR( + helper.Compute(a->Shape(), b_shape, transa, transb)); + + Tensor* Y = ctx->Output(0, helper.OutputShape()); + // Bail out early if the output is going to be empty + if (Y->Shape().Size() == 0) return Status::OK(); + + bool is_4bit_done = TryMatMulBnb4( + reinterpret_cast(quant_map_buffer_data), + reinterpret_cast(Y->MutableData()), + reinterpret_cast(a_data), + b_quant_data, + reinterpret_cast(absmax_data), + SafeInt(helper.M()), + SafeInt(helper.N()), + SafeInt(helper.K()), + SafeInt(block_size_), + static_cast(ctx->GetComputeStream()->GetHandle())); + + if (!is_4bit_done) { + IAllocatorUniquePtr b_dequant_ptr = GetScratchBuffer(N_ * K_, ctx->GetComputeStream()); + auto* b_dequant_data = b_dequant_ptr.get(); + ORT_RETURN_IF_ERROR(DequantizeBnb4( + reinterpret_cast(quant_map_buffer_data), + reinterpret_cast(b_dequant_data), + b_quant_data, + reinterpret_cast(absmax_data), + SafeInt(block_size_), + SafeInt(N_ * K_), + static_cast(ctx->GetComputeStream()->GetHandle()))); + + const CudaT alpha = ToCudaType::FromFloat(1.f); + const CudaT zero = ToCudaType::FromFloat(0.f); + + CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( + GetCublasHandle(ctx), + CUBLAS_OP_T, + CUBLAS_OP_N, + SafeInt(helper.N()), + SafeInt(helper.M()), + SafeInt(helper.K()), + &alpha, + reinterpret_cast(b_dequant_data), + SafeInt(K_), + reinterpret_cast(a_data), + helper.Lda(transa), + &zero, + 
reinterpret_cast(Y->MutableData()), + helper.Ldc(), + GetDeviceProp())); + } + + return Status::OK(); +} + +ONNX_OPERATOR_TYPED_KERNEL_EX( + MatMulBnb4, + kMSDomain, + 1, + float, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + MatMulBnb4); + +ONNX_OPERATOR_TYPED_KERNEL_EX( + MatMulBnb4, + kMSDomain, + 1, + MLFloat16, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + MatMulBnb4); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cu new file mode 100644 index 0000000000000..1d9aa75ff3701 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cu @@ -0,0 +1,192 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include +#include +#include +#include "matmul_bnb4.cuh" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define num_values_4bit 32 +template +__global__ void kgemm_4bit_inference_naive( + int M, + int N, + int K, + const T* __restrict__ A, + const uint8_t* B, + const T* absmax, + const T* datatype, + T* out, + int lda, + int ldb, + int ldc, + int block_size) { + // per threadblock: + // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps] + // 4 warps -> 4 loads per iter + // 1x32 * 32x4 -> 1x4 outputs per thread block + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[THREADS / 32]; + + const int warp_idx = threadIdx.x / 32; + const int warp_lane = threadIdx.x % 32; + const int row_B = (THREADS / 32) * blockIdx.x + warp_idx; + const int num_values_8bit = num_values_4bit / 2; + float local_C = 0.0f; + + uint8_t local_B_4bit[num_values_8bit]; + T local_B[num_values_4bit / 4]; + T local_A[num_values_4bit / 4]; + __shared__ T quant_map[16]; + T local_absmax = T(0.0f); + + for (int i = threadIdx.x; i < 16; i++) quant_map[i] = T(datatype[i]); + __syncthreads(); + + // A: [1, K] + // B: [N, K] + for (int inner_idx = warp_lane * num_values_4bit; inner_idx < K; inner_idx += 32 * num_values_4bit) { + int inner_idx_halved = inner_idx / 2; + int offset_B = ldb * row_B; + int absidx = ((2 * offset_B) + inner_idx) / block_size; + local_absmax = __ldg(&(absmax[absidx])); + + if (row_B < N) { + if ((inner_idx_halved + num_values_8bit) < (K / 2)) { + // this is the most important for performance considerations + reinterpret_cast(local_B_4bit)[0] = + reinterpret_cast(B)[(offset_B + (inner_idx_halved)) / (num_values_8bit)]; + } else { + #pragma unroll + for (int j = 0; j < (num_values_8bit); j++) + if ((inner_idx_halved) + j < (K / 2)) + local_B_4bit[j] = B[offset_B + inner_idx_halved + j]; + else + local_B_4bit[j] = 0b01110111; + } + } else { + #pragma unroll + for (int j = 0; j < (num_values_8bit); j++) local_B_4bit[j] = 0b01110111; + } + + for (int i = 0; i < 4; i++) { + #pragma unroll + for (int k = 0; k < num_values_8bit / 4; k++) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 + local_B[k * 2] = quant_map[local_B_4bit[(i * num_values_8bit / 4) + k] >> 4] * local_absmax; + local_B[k * 2 + 1] = quant_map[local_B_4bit[(i * num_values_8bit / 4) + k] & 0x0F] * local_absmax; + #else + // half multiplication not supported + local_B[k * 2] 
= + static_cast(static_cast(quant_map[local_B_4bit[(i * num_values_8bit / 4) + k] >> 4]) * + static_cast(local_absmax)); + local_B[k * 2 + 1] = + static_cast(static_cast(quant_map[local_B_4bit[(i * num_values_8bit / 4) + k] & 0x0F]) * + static_cast(local_absmax)); + #endif + } + + if (inner_idx + (num_values_4bit / 4) + (i * num_values_4bit / 4) < K) { + // this is also relatively important for performance + if (BITS == 16) { + reinterpret_cast(local_A)[0] = + reinterpret_cast(A)[inner_idx / (num_values_4bit / 4) + i]; + } else { + reinterpret_cast(local_A)[0] = + reinterpret_cast(A)[inner_idx / (num_values_4bit / 8) + (2 * i) + 0]; + reinterpret_cast(local_A)[1] = + reinterpret_cast(A)[inner_idx / (num_values_4bit / 8) + (2 * i) + 1]; + } + } else { + #pragma unroll + for (int k = 0; k < num_values_4bit / 4; k++) { + if (inner_idx + (i * num_values_4bit / 4) + k < K) + local_A[k] = A[inner_idx + k + (i * num_values_4bit / 4)]; + else + local_A[k] = T(0.0f); + } + } + + // accumulate in float; small performance hit for Ampere, but lower error for outputs + #pragma unroll + for (int k = 0; k < num_values_4bit / 4; k++) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 + local_C += static_cast(local_A[k] * local_B[k]); + #else + // half multiplication not supported + local_C += static_cast(local_A[k]) * static_cast(local_B[k]); + #endif + } + } + } + + local_C = WarpReduce(temp_storage[warp_idx]).Sum(local_C); + + if (row_B < N && warp_lane == 0) out[row_B] = T(local_C); +} + +template +bool TryMatMulBnb4( + const T* quant_map, + T* output, + const T* a_data, + const uint8_t* b_data_quant, + const T* absmax, + int m, + int n, + int k, + int block_size, + cudaStream_t stream) { + if (k % block_size != 0 || m > 1) { + return false; + } + // supported block_sizes are [4096, 2048, 1024, 512, 256, 128, 64, 32] + if (block_size % 32 != 0 || block_size > 4096) { + return false; + } + + int lda = k; + int ldb = (k + 1) / 2; + int ldc = n; + int num_blocks = (n + 3) / 4; + + constexpr int bits = std::is_same_v ? 16 : 32; + kgemm_4bit_inference_naive<<>>( + m, n, k, a_data, b_data_quant, absmax, quant_map, output, lda, ldb, ldc, block_size); + + return true; +} + +template bool TryMatMulBnb4( + const float* quant_map, + float* output, + const float* a_data, + const uint8_t* b_data_quant, + const float* absmax, + int m, + int n, + int k, + int block_size, + cudaStream_t stream); + +template bool TryMatMulBnb4( + const half* quant_map, + half* output, + const half* a_data, + const uint8_t* b_data_quant, + const half* absmax, + int m, + int n, + int k, + int block_size, + cudaStream_t stream); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cuh b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cuh new file mode 100644 index 0000000000000..743234282fbf3 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cuh @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
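The fused 4-bit path in TryMatMulBnb4 above is only taken for a single-row A with an aligned block size; otherwise MatMulBnb4 falls back to dequantize-then-GEMM. The illustrative helper below restates that eligibility check with a hypothetical name.

// Illustrative restatement of the TryMatMulBnb4 fast-path check.
bool CanUseFusedBnb4Gemv(int m, int k, int block_size) {
  if (k % block_size != 0 || m > 1) return false;               // GEMV path handles one output row
  if (block_size % 32 != 0 || block_size > 4096) return false;  // multiples of 32, up to 4096
  return true;
}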
+ +#pragma once +#include "core/providers/cuda/shared_inc/cuda_utils.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +bool TryMatMulBnb4( + const T* quant_map, + T* output, + const T* a_data, + const uint8_t* b_data_quant, + const T* absmax, + int m, + int n, + int k, + int block_size, + cudaStream_t stream); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index 6a82b3fcc734d..655d5014f3d60 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -22,6 +22,14 @@ #define HWCAP_ASIMDDP (1 << 20) #endif +#ifndef HWCAP2_I8MM +#define HWCAP2_I8MM (1 << 13) +#endif + +#ifndef HWCAP2_SVEI8MM +#define HWCAP2_SVEI8MM (1 << 9) +#endif + #endif // ARM #endif // Linux @@ -138,6 +146,9 @@ void CPUIDInfo::ArmLinuxInit() { is_hybrid_ = cpuinfo_get_uarchs_count() > 1; has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); + has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); + has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); + const uint32_t core_cnt = cpuinfo_get_cores_count(); core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown); is_armv8_narrow_ld_.resize(core_cnt, false); @@ -162,6 +173,10 @@ void CPUIDInfo::ArmLinuxInit() { pytorch_cpuinfo_init_ = false; has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0); has_fp16_ |= has_arm_neon_dot_; + + has_arm_neon_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0); + has_arm_sve_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_SVEI8MM) != 0); + #endif } @@ -256,6 +271,9 @@ void CPUIDInfo::ArmWindowsInit() { has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); has_fp16_ |= has_arm_neon_dot_; + /* TODO: implement them when hw+sw is available for testing these features */ + has_arm_neon_i8mm_ = false; + has_arm_sve_i8mm_ = false; } #endif /* (arm or arm64) and windows */ diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 386db347c669d..a15c75104b83a 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -28,6 +28,8 @@ class CPUIDInfo { // ARM bool HasArmNeonDot() const { return has_arm_neon_dot_; } + bool HasArmNeon_I8MM() const { return has_arm_neon_i8mm_; } + bool HasArmSVE_I8MM() const { return has_arm_sve_i8mm_; } uint32_t GetCurrentCoreIdx() const; @@ -121,6 +123,8 @@ class CPUIDInfo { bool has_arm_neon_dot_{false}; bool has_fp16_{false}; + bool has_arm_neon_i8mm_{false}; + bool has_arm_sve_i8mm_{false}; #ifdef CPUIDINFO_ARCH_X86 diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 1b492a3561396..b028596fe4e6d 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -13,7 +13,9 @@ #include "core/framework/kernel_registry_manager.h" #include "core/framework/kernel_registry.h" #include "core/graph/function.h" +#include "core/graph/function_utils.h" #include "core/graph/graph_viewer.h" +#include "core/graph/model.h" // uncomment this line to count non-CUDA ops in ONNX domain // #define COUNT_NON_CUDA_OPS @@ -129,6 +131,21 @@ struct GetCapabilityForEPParams { std::reference_wrapper debug_graph_fn; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) }; + +auto get_capabilities = [](const IExecutionProvider& ep, + const GraphViewer& graph_viewer, + 
const IExecutionProvider::IKernelLookup& kernel_lookup) { + auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup); + + // In theory an EP could return an empty capability. Remove those. + capabilities.erase(std::remove_if(capabilities.begin(), capabilities.end(), + [](const std::unique_ptr& capability) { + return !capability || !capability->sub_graph; + }), + capabilities.end()); + + return capabilities; +}; } // namespace static Status GetCapabilityForEP(const GetCapabilityForEPParams& params) { @@ -143,21 +160,6 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params) { } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) - auto get_capabilities = [](const IExecutionProvider& ep, - const GraphViewer& graph_viewer, - const IExecutionProvider::IKernelLookup& kernel_lookup) { - auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup); - - // In theory an EP could return an empty capability. Remove those. - capabilities.erase(std::remove_if(capabilities.begin(), capabilities.end(), - [](const std::unique_ptr& capability) { - return !capability || !capability->sub_graph; - }), - capabilities.end()); - - return capabilities; - }; - const auto& kernel_registry_mgr = params.kernel_registry_mgr.get(); const auto kernel_registries_for_ep = kernel_registry_mgr.GetKernelRegistriesByProviderType(ep_type); const KernelLookup kernel_lookup{ep_type, @@ -239,6 +241,26 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params) { } #if !defined(ORT_MINIMAL_BUILD) + +// This function queries the capabilities for a given EP, but it does not assign the nodes. +// It also does not perform layout transformation. This will be done during normal partitioning. +static Status GetCapabilityForEPForAotInlining(const GraphViewer& graph_viewer, + const KernelRegistryManager& kernel_registry_mgr, + const IExecutionProvider& current_ep, + std::vector>& capabilities) { + const auto& ep_type = current_ep.Type(); + + const auto kernel_registries_for_ep = kernel_registry_mgr.GetKernelRegistriesByProviderType(ep_type); + const KernelLookup kernel_lookup{ep_type, + kernel_registries_for_ep, + kernel_registry_mgr.GetKernelTypeStrResolver()}; + + // TODO: Provide EP with a capability to look inside the functions. + capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup); + + return Status::OK(); +} + /** * Check if a node can be placed on a specific provider. * Do nothing if the node is already assigned @@ -518,7 +540,7 @@ static Status InlineNodes(Graph& graph, bool& modified_graph) { // successfully inlined, we re-run the partitioner on the modified graph. // NOTE: Inlining the function will change the nodes in the Graph instance, so we can't do that while iterating // using graph.Nodes(). - std::vector nodes_to_inline; + InlinedVector nodes_to_inline; for (auto& node : graph.Nodes()) { if (node.GetExecutionProviderType().empty() && node.CanBeInlined()) { nodes_to_inline.push_back(&node); @@ -533,6 +555,85 @@ static Status InlineNodes(Graph& graph, bool& modified_graph) { return Status::OK(); } +static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_providers, + const KernelRegistryManager& kernel_registry_mgr, + Graph& graph, + InlinedHashSet& not_inlined, + size_t& inlined_count) { + // handle testing edge case where optimizers or constant lifting results in graph with no nodes. 
+ // doing it here saves all providers checking for this in GetCapability + if (graph.NumberOfNodes() == 0) { + return Status::OK(); + } + + for (auto& node : graph.Nodes()) { + for (auto& entry : node.GetAttributeNameToMutableSubgraphMap()) { + Graph* subgraph = entry.second; + // we pass through the FuncManager from the top level graph + ORT_RETURN_IF_ERROR(InlineFunctionsAOTImpl(execution_providers, + kernel_registry_mgr, + *subgraph, + not_inlined, + inlined_count)); + } + } + + // Gather the candidates + InlinedVector inline_candidates; + for (auto& node : graph.Nodes()) { + if (node.CanBeInlined()) { + inline_candidates.push_back(node.Index()); + } + } + + if (inline_candidates.empty()) { + return Status::OK(); + } + + // Find out all the nodes that are already taken + const GraphViewer graph_viewer(graph); + + InlinedHashSet claimed_by_ep; + for (const auto& ep : execution_providers) { + std::vector> capabilities; + ORT_RETURN_IF_ERROR(GetCapabilityForEPForAotInlining(graph_viewer, kernel_registry_mgr, *ep, capabilities)); + for (auto& capability : capabilities) { + const auto& nodes = capability->sub_graph->nodes; + if (nodes.size() == 1) { + // Single node capability. + ORT_IGNORE_RETURN_VALUE(claimed_by_ep.insert(nodes[0])); + } else { + // Make sure none is claimed by other EPs mirroring the logic in PartitionOnnxFormatModelImpl. + if (std::all_of(nodes.cbegin(), nodes.cend(), [&claimed_by_ep](NodeIndex node_index) { + return claimed_by_ep.count(node_index) == 0; + })) { + claimed_by_ep.insert(nodes.cbegin(), nodes.cend()); + } + } + } + } + + // TODO: Insert version check. We need to collect all the versions + // that imported by the model. If the version is not supported by + // the model, we can not inline it. + + for (auto node_index : inline_candidates) { + auto* node = graph.GetNode(node_index); + if (node != nullptr) { + if (claimed_by_ep.count(node_index) == 0) { + ORT_RETURN_IF_ERROR(graph.InlineFunction(*node)); + ++inlined_count; + } else { + // OpType is the same as function name. 
+ auto function_id = function_utils::GetFunctionIdentifier(node->Domain(), node->OpType()); + ORT_IGNORE_RETURN_VALUE(not_inlined.insert(std::move(function_id))); + } + } + } + + return Status::OK(); +} + static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, GraphPartitioner::Mode mode, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager) { @@ -693,6 +794,35 @@ static Status PartitionOrtFormatModel(const PartitionParams& partition_params, return Status::OK(); } +#ifndef ORT_MINIMAL_BUILD + +Status GraphPartitioner::InlineFunctionsAOT(Model& model, + const ExecutionProviders& execution_providers, + const KernelRegistryManager& kernel_registry_manager) const { + auto& graph = model.MainGraph(); + InlinedHashSet not_inlined; + do { + size_t inlined_count = 0; + ORT_RETURN_IF_ERROR(InlineFunctionsAOTImpl(execution_providers, + kernel_registry_manager, + graph, + not_inlined, + inlined_count)); + + if (inlined_count == 0) { + break; + } + + ORT_RETURN_IF_ERROR(graph.Resolve()); + } while (true); + + model.RemoveLocalFunctionsProtos(not_inlined); + + return Status::OK(); +} + +#endif + Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, const layout_transformation::TransformLayoutFunction& transform_layout_function, Mode mode, diff --git a/onnxruntime/core/framework/graph_partitioner.h b/onnxruntime/core/framework/graph_partitioner.h index 36a27e906c651..c1fa46de9145d 100644 --- a/onnxruntime/core/framework/graph_partitioner.h +++ b/onnxruntime/core/framework/graph_partitioner.h @@ -12,6 +12,7 @@ namespace onnxruntime { class ExecutionProviders; class KernelRegistryManager; +class Model; class GraphPartitioner { public: @@ -33,6 +34,26 @@ class GraphPartitioner { Mode mode = Mode::kNormal, const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const; +#ifndef ORT_MINIMAL_BUILD + /// + // Ahead of Time Function inlining. The main purpose of the function is to inline as many + // functions as possible and delete locally defined functions to reduce the size of the model. + // This would make other optimizations to be more effective. + // + // This function performs GetCapability on the graph and its subgraphs bottom up + // and inlines any functions that are not claimed by any of the execution providers. + // This function does not attempt to run layout transformation, and it does not assign EPs. + // The latter will be done by graph partitioning after Level1 optimizations are done. 
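Putting the pieces above together, `InlineFunctionsAOT` runs rounds of inlining, resolving the graph after each round, until a round inlines nothing; calls claimed by an EP are recorded in `not_inlined` and survive as model-local FunctionProtos. The toy below mirrors only that control flow — the graph, EP claims, and function ids are made up for illustration and none of the types are ORT APIs:

```cpp
// Toy fixed-point loop: inline unclaimed function calls, keep claimed ones.
#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

struct Call { std::string id; bool inlined; };

static std::size_t InlineRound(std::vector<Call>& calls,
                               const std::unordered_set<std::string>& claimed_by_ep,
                               std::unordered_set<std::string>& not_inlined) {
  std::size_t inlined_count = 0;
  for (Call& c : calls) {
    if (c.inlined) continue;
    if (claimed_by_ep.count(c.id) == 0) {
      c.inlined = true;          // Graph::InlineFunction(node) in the real pass
      ++inlined_count;
    } else {
      not_inlined.insert(c.id);  // keep this FunctionProto in the model
    }
  }
  return inlined_count;
}

int main() {
  std::vector<Call> calls = {{"custom.domain:GeluBlock", false},
                             {"custom.domain:EpFusedBlock", false}};
  const std::unordered_set<std::string> claimed_by_ep = {"custom.domain:EpFusedBlock"};
  std::unordered_set<std::string> not_inlined;
  while (InlineRound(calls, claimed_by_ep, not_inlined) > 0) {
    // Graph::Resolve() runs here in the real pass before the next round.
  }
  for (const auto& id : not_inlined) std::printf("retained local function: %s\n", id.c_str());
  return 0;
}
```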
+ /// + /// model instance + /// execution providers considered + /// registry manager + /// + Status InlineFunctionsAOT(Model& model, + const ExecutionProviders& execution_providers, + const KernelRegistryManager& kernel_registry_manager) const; +#endif + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphPartitioner); diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index 6912160c27702..354160322a3e6 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -1054,7 +1054,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA( OpSchema::Optional) .Input(4, "key_padding_mask", - "Key padding mask with shape (batch_size) or (3 * batch_size + 2) or (batch_size, kv_sequence_length)", + "Key padding mask with shape (batch_size), (3 * batch_size + 2), (batch_size, kv_sequence_length), (batch_size, total_sequence_length), " + "or (batch_size, sequence_length, total_sequence_length)", "M", OpSchema::Optional) .Input(5, @@ -1237,6 +1238,49 @@ ONNX_MS_OPERATOR_SET_SCHEMA( DecoderAttentionTypeAndShapeInference(ctx); })); +constexpr const char* RotaryEmbedding_ver1_doc = R"DOC( +RotaryEmbedding is the implementation of rotary positional embeddings (RoPE). The positions are represented as rotation matrices +that are multiplied to query and key before the inner product of query and key is taken. +)DOC"; +ONNX_MS_OPERATOR_SET_SCHEMA( + RotaryEmbedding, 1, + OpSchema() + .SetDoc(RotaryEmbedding_ver1_doc) + .Attr("scale", + "Custom scale will be used if specified. Default value is 1.0", + AttributeProto::FLOAT, + OPTIONAL_VALUE) + .Attr("interleaved", + "Rotate using interleaved pattern. Default value is 0 (False).", + AttributeProto::INT, + OPTIONAL_VALUE) + .Input(0, + "input", + "3D tensor with shape (batch_size, sequence_length, hidden_size)", + "T") + .Input(1, + "position_ids", + "1D tensor with shape (1) or 2D tensor with shape (batch_size, sequence_length)", + "M") + .Input(2, + "cos_cache", + "2D tensor with shape (max_sequence_length, head_size / 2).", + "T") + .Input(3, + "sin_cache", + "2D tensor with shape (max_sequence_length, head_size / 2).", + "T") + .Output(0, + "output", + "3D tensor with shape (batch_size, sequence_length, hidden_size)", + "T") + .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float tensors.") + .TypeConstraint("M", {"tensor(int64)"}, "Constrain input and output types to integer tensors") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + propagateShapeFromInputToOutput(ctx, 0, 0); + })); + constexpr const char* EmbedLayerNormalization_ver1_doc = R"DOC( EmbedLayerNormalization is the fusion of embedding layer in BERT model, with optional mask processing. The embedding layer takes input_ids (word IDs) and segment_ids (sentence IDs) to look up word_embedding, position_embedding, @@ -1608,4 +1652,4 @@ ONNX_MS_OPERATOR_SET_SCHEMA( })); } // namespace contrib -} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 5e5eee568fa21..681a728f823da 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3239,6 +3239,41 @@ Input zero_points is stored as uint8_t. 
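For reference, the rotation the new `RotaryEmbedding` schema above describes pairs up lanes of each head and rotates every pair by the cached angle for its position. A scalar sketch of the interleaved pairing follows; the non-interleaved variant pairs `x[i]` with `x[i + head_size/2]`, and the exact convention of the ORT kernels is not spelled out in this diff, so treat the code as illustrative:

```cpp
// One head of length head_size; cos_c/sin_c are the cos_cache/sin_cache rows
// for this position, each of length head_size / 2 (see the schema above).
#include <cstddef>
#include <vector>

void RotatePairsInterleaved(std::vector<float>& x,
                            const std::vector<float>& cos_c,
                            const std::vector<float>& sin_c) {
  for (std::size_t i = 0; i < x.size() / 2; ++i) {
    const float x0 = x[2 * i];
    const float x1 = x[2 * i + 1];
    x[2 * i]     = x0 * cos_c[i] - x1 * sin_c[i];
    x[2 * i + 1] = x0 * sin_c[i] + x1 * cos_c[i];
  }
}
```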
If bits <= 4, two zero points are stored MatmulWithQuantWeightShapeInference(ctx, in_features, out_features); }); + static const char* MatMulBnb4_ver1_doc = R"DOC( +MatMulBnb4 is a MatMul with weight quantized with 4 bits using either FP4 or NF4 data type (https://arxiv.org/pdf/2305.14314.pdf). It does Matrix Multiplication like MatMul (https://github.com/onnx/onnx/blob/main/docs/Operators.md#matmul) with differences: + 1. Input B is a 2D constant Matrix. Its input feature count and output feature count are specified by attribute 'K' and 'N'. + 2. Input B is quantized with 4 bits with quantization data type specified by attribute 'quant_type'. It is transposed, flattened and quantized blockwisely with block size specified by attribute 'block_size'. + And block_size is not an arbitrary number and must be a power of 2 and not smaller than 16, like 16, 32, 64, 128,.. + 3. Input B's quantization constants or scales are specified by input 'absmax'. + +Input B is stored as uint8_t with shape: [(N * K + 1) / 2]. +Input absmax is stored in same type as original type of B(float32, float16) with shape like: [(N * K + block_size - 1) / block_size]. + +)DOC"; + + ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulBnb4) + .SetDomain(kMSDomain) + .SinceVersion(1) + .SetDoc(MatMulBnb4_ver1_doc) + .Attr("K", "size of each input feature", AttributeProto::INT) + .Attr("N", "size of each output feature", AttributeProto::INT) + .Attr("block_size", "number of groupsize used for weight quantization. It needs to be a power of 2 and not smaller than 16.", AttributeProto::INT) + .Attr("quant_type", "quantization data type. 0 for FP4, 1 for NF4.", AttributeProto::INT) + .Input(0, "A", "The input tensor, not quantized", "T1") + .Input(1, "B", "1-dimensional quantized data for weight", "T2") + .Input(2, "absmax", "quantization constants", "T1") + .Output(0, "Y", "tensor. The output tensor has the same rank as the input. 
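The `MatMulBnb4` doc above fixes the packed-B and absmax shapes, and its CUDA entry point takes a 16-entry `quant_map` (the FP4 or NF4 table). A hedged scalar sketch of the block-wise dequantization this implies; the nibble packing order and the helper name are assumptions, not taken from the kernels:

```cpp
// Dequantize a [(N*K + 1) / 2] packed 4-bit buffer using per-block absmax scales.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> DequantizeBnb4(const std::vector<uint8_t>& b_packed,  // [(N*K + 1) / 2]
                                  const std::vector<float>& absmax,      // [(N*K + block_size - 1) / block_size]
                                  const std::vector<float>& quant_map,   // 16 entries (FP4 or NF4)
                                  std::size_t n, std::size_t k, std::size_t block_size) {
  const std::size_t total = n * k;
  std::vector<float> out(total);
  for (std::size_t i = 0; i < total; ++i) {
    const uint8_t byte = b_packed[i / 2];
    const uint8_t code = (i % 2 == 0) ? (byte >> 4) : (byte & 0x0F);  // nibble order assumed
    out[i] = quant_map[code] * absmax[i / block_size];                // scale by the block's absmax
  }
  return out;
}
```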
", "T1") + .TypeConstraint("T1", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float/half_float tensors.") + .TypeConstraint("T2", {"tensor(uint8)"}, "Constrain quantized weight types to uint8.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + // Type inference + propagateElemTypeFromInputToOutput(ctx, 0, 0); + // Shape inference + int64_t in_features = getAttribute(ctx, "K", -1); + int64_t out_features = getAttribute(ctx, "N", -1); + MatmulWithQuantWeightShapeInference(ctx, in_features, out_features); + }); + #ifdef ENABLE_ATEN ONNX_CONTRIB_OPERATOR_SCHEMA(ATen) .SetDomain(kPytorchAtenDomain) diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h index a6c6d7857706b..6ddf1c06751cf 100644 --- a/onnxruntime/core/graph/contrib_ops/ms_opset.h +++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h @@ -99,6 +99,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GatedRelativePositionBia class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, RemovePadding); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, RestorePadding); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Rfft); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, RotaryEmbedding); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SampleOp); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Sampling); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SkipLayerNormalization); @@ -208,6 +209,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); diff --git a/onnxruntime/core/graph/function_utils.cc b/onnxruntime/core/graph/function_utils.cc index 7477f48088a15..7b0a834a7ffc0 100644 --- a/onnxruntime/core/graph/function_utils.cc +++ b/onnxruntime/core/graph/function_utils.cc @@ -373,7 +373,8 @@ class Inliner { // Replace given name with a unique version of the name, and cache the // renaming-binding in current scope. 
void make_unique(std::string& name) { - auto new_name = prefix_ + name; + auto new_name{prefix_}; + new_name.append("_").append(name); auto& current_scope = rename_scopes_.back(); current_scope[name] = new_name; name = std::move(new_name); @@ -410,7 +411,7 @@ class Inliner { std::string rename_as = actuals.Get(i); if constexpr (isOutput) { if (rename_as.empty()) - rename_as.assign(prefix_).append(formal); + rename_as.assign(prefix_).append("_").append(formal); } current_scope[formal] = rename_as; if (!rename_as.empty()) @@ -420,7 +421,7 @@ class Inliner { std::string& formal = *formals.Mutable(i); std::string rename_as; if constexpr (isOutput) { - rename_as.assign(prefix_).append(formal); + rename_as.assign(prefix_).append("_").append(formal); } current_scope[formal] = rename_as; if (!rename_as.empty()) diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 383c1d689d3c3..cab9467501f55 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -860,18 +860,18 @@ Status Node::LoadEdgesFromOrtFormat(const onnxruntime::fbs::NodeEdge& fbs_node_e } #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) -void Node::Init(const std::string& name, - const std::string& op_type, - const std::string& description, - const std::vector& input_args, - const std::vector& output_args, +void Node::Init(std::string_view name, + std::string_view op_type, + std::string_view description, + gsl::span input_args, + gsl::span output_args, const NodeAttributes* attributes, - const std::string& domain) { + std::string_view domain) { name_ = name; op_type_ = op_type; description_ = description; - definitions_.input_defs = input_args; - definitions_.output_defs = output_args; + definitions_.input_defs.assign(input_args.begin(), input_args.end()); + definitions_.output_defs.assign(output_args.begin(), output_args.end()); domain_ = domain; can_be_saved_ = true; priority_ = 0; @@ -1145,7 +1145,8 @@ Graph::Graph(const Model& owning_model, IOnnxRuntimeOpSchemaCollectionPtr schema_registry, const logging::Logger& logger, bool strict_shape_type_inference) - : Graph(owning_model, graph_proto, domain_to_version, ir_version, schema_registry, nullptr, nullptr, logger, strict_shape_type_inference) {} + : Graph(owning_model, graph_proto, domain_to_version, ir_version, + schema_registry, nullptr, nullptr, logger, strict_shape_type_inference) {} Graph::Graph(const Model& owning_model, GraphProto* graph_proto, const std::unordered_map& domain_to_version, Version ir_version, @@ -3261,8 +3262,8 @@ Node& Graph::AddNode(const std::string& name, gsl::span output_args, const NodeAttributes* attributes, const std::string& domain) { - std::vector inputs; - std::vector outputs; + InlinedVector inputs; + InlinedVector outputs; inputs.resize(input_args.size()); outputs.resize(output_args.size()); int i = 0; @@ -4019,69 +4020,100 @@ Node& Graph::FuseSubGraph(const IndexedSubGraph& sub_graph, return fused_node; } +Status Graph::AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& node_proto, + std::optional new_name) { + const gsl::not_null tensor{graph_proto_->add_initializer()}; + ORT_RETURN_IF_ERROR(utils::ConstantNodeProtoToTensorProto(node_proto, ModelPath(), *tensor, node_proto.output(0))); + + if (new_name.has_value()) { + tensor->set_name(std::string(new_name.value())); + } + + auto insert_result = name_to_initial_tensor_.emplace(tensor->name(), tensor); + ORT_ENFORCE(insert_result.second, "Constant node name: ", 
tensor->name(), + " conflicts with graph initializer. Check that the node names have been made unique."); + if (GetNodeArg(tensor->name()) == nullptr) { + TypeProto t{TypeProtoFromTensorProto(*tensor)}; + ORT_IGNORE_RETURN_VALUE(GetOrCreateNodeArg(tensor->name(), &t)); + } + +#if !defined(DISABLE_SPARSE_TENSORS) + if (node_proto.attribute(0).type() == AttributeProto_AttributeType_SPARSE_TENSOR) { + ORT_IGNORE_RETURN_VALUE(sparse_tensor_names_.emplace(tensor->name())); + } +#endif + + return Status::OK(); +} + +Status Graph::InlineFunctionProto(const ONNX_NAMESPACE::FunctionProto& func_to_inline) { + auto to_node_arg = [this](const std::string& name) { + return &this->GetOrCreateNodeArg(name, nullptr); + }; + + // Process constant nodes first and create NodeArg for these as they become initializers + // It is important for the initializers to have NodeArg created, first they are needed + // if the initializer is unused and removed, second if the node depends on the initializer, + // we can have Type attached to it. + InlinedVector non_constant_nodes; + non_constant_nodes.reserve(func_to_inline.node_size()); + for (const auto& inlined_node : func_to_inline.node()) { + if (inlined_node.op_type() == kConstant) { + // Copy constant nodes _value to name_to_initial_tensor_ + ORT_RETURN_IF_ERROR(AddConstantProtoAsInitializer(inlined_node, std::nullopt)); + } else { + non_constant_nodes.push_back(&inlined_node); + } + } + + for (const auto* inlined_node : non_constant_nodes) { + InlinedVector inputs; + InlinedVector outputs; + + for (const auto& tensor_name : inlined_node->input()) + inputs.push_back(to_node_arg(tensor_name)); + + for (const auto& tensor_name : inlined_node->output()) + outputs.push_back(to_node_arg(tensor_name)); + + onnxruntime::NodeAttributes new_attr_map; + new_attr_map.reserve(inlined_node->attribute_size()); + for (const auto& node_attr : inlined_node->attribute()) { + new_attr_map.insert_or_assign(node_attr.name(), node_attr); + } + ORT_IGNORE_RETURN_VALUE(AddNode(inlined_node->name(), inlined_node->op_type(), + inlined_node->doc_string(), inputs, outputs, + &new_attr_map, inlined_node->domain())); + } + + return Status::OK(); +} + Status Graph::InlineFunction(Node& callnode) { - const auto& model_path = ModelPath(); - auto output_edges = callnode.GetRelationships().output_edges; + // Remove output edges. Requirement for RemoveNode() below. + auto output_edges = callnode.GetRelationships().output_edges; // copy so RemoveEdge doesn't invalidate iterator for (const auto& output_edge : output_edges) { RemoveEdge(callnode.Index(), output_edge.GetNode().Index(), output_edge.GetSrcArgIndex(), output_edge.GetDstArgIndex()); } // create a uniq_identifier to append to every node name and intermediate input\outputs // to make sure there are no unintended duplicates - std::stringstream ss; - ss << "_inline_" << callnode.OpType(); - auto uniq_identifier = GenerateNodeName(ss.str()); + std::string base_uniq_identifier{"_inlfunc_"}; + base_uniq_identifier.append(callnode.OpType()); + const auto uniq_identifier = GenerateNodeName(base_uniq_identifier); + // Replace a (function-call) node by an inlined graph. if (!callnode.GetFunctionBody()) { // This is the normal use-case: inlining a FunctionProto (representing // a model-local function or a schema-defined function). 
- FunctionProto inlined_fp; + ONNX_NAMESPACE::FunctionProto inlined_fp; ORT_ENFORCE(callnode.TryGetFunctionProto(inlined_fp), "Node has no function body and cannot be inlined."); - function_utils::Specialize(inlined_fp, callnode, uniq_identifier); - auto to_node_arg = [this](const std::string& name) { - return &this->GetOrCreateNodeArg(name, nullptr); - }; - - // Process constant nodes first and create NodeArg for these as they become initializers - // It is important for the initializers to have NodeArg created, first they are needed - // if the initializer is unused and removed, second if the node depends on the initializer, - // we can have Type attached to it. - for (const auto& inlined_node : inlined_fp.node()) { - if (inlined_node.op_type() == kConstant) { - // Copy constant nodes _value to name_to_initial_tensor_ - const gsl::not_null tensor{graph_proto_->add_initializer()}; - ORT_RETURN_IF_ERROR(utils::ConstantNodeProtoToTensorProto(inlined_node, model_path, *tensor, inlined_node.output(0))); - auto insert_result = name_to_initial_tensor_.emplace(tensor->name(), tensor); - ORT_ENFORCE(insert_result.second, "Constant node name: ", tensor->name(), " in inlined function: ", - inlined_fp.name(), " conflicts with graph initializer. Check Specializing code."); - TypeProto t{TypeProtoFromTensorProto(*tensor)}; - ORT_IGNORE_RETURN_VALUE(GetOrCreateNodeArg(tensor->name(), &t)); - } - } - - for (const auto& inlined_node : inlined_fp.node()) { - if (inlined_node.op_type() != kConstant) { - InlinedVector inputs; - InlinedVector outputs; - - for (const auto& tensor_name : inlined_node.input()) - inputs.push_back(to_node_arg(tensor_name)); - - for (const auto& tensor_name : inlined_node.output()) - outputs.push_back(to_node_arg(tensor_name)); - - onnxruntime::NodeAttributes new_attr_map; - new_attr_map.reserve(inlined_node.attribute_size()); - for (const auto& node_attr : inlined_node.attribute()) { - onnx::AttributeProto attr_copy = node_attr; - new_attr_map[node_attr.name()] = std::move(attr_copy); - } - AddNode(inlined_node.name(), inlined_node.op_type(), - inlined_node.doc_string(), inputs, outputs, &new_attr_map, inlined_node.domain()); - } - } + // Make all the names unique and resolve nested graphs inputs to the outer scope. + function_utils::Specialize(inlined_fp, callnode, uniq_identifier); + // In this case, global Resolve() will take care of everything. + ORT_RETURN_IF_ERROR(InlineFunctionProto(inlined_fp)); } else { // Uncommon scenario. Inlining a node representing a fused sub-graph. // TODO: Unclear that this feature is needed. Can this be removed? @@ -4115,15 +4147,7 @@ Status Graph::InlineFunction(Node& callnode) { // Copy constant nodes _value to name_to_initial_tensor_ ONNX_NAMESPACE::NodeProto subgraph_node_proto{}; subgraph_node.ToProto(subgraph_node_proto); - const gsl::not_null tensor{graph_proto_->add_initializer()}; - ORT_RETURN_IF_ERROR(utils::ConstantNodeProtoToTensorProto(subgraph_node_proto, model_path, *tensor, subgraph_node_proto.output(0))); - auto insert_result = name_to_initial_tensor_.emplace(tensor->name(), tensor); - ORT_ENFORCE(insert_result.second, "Constant node name: ", tensor->name(), " in inlined subgraph: ", - subgraph.Name(), " conflicts with graph initializer. 
Check Specializing code."); - if (GetNodeArg(tensor->name()) == nullptr) { - TypeProto t{TypeProtoFromTensorProto(*tensor)}; - ORT_IGNORE_RETURN_VALUE(GetOrCreateNodeArg(tensor->name(), &t)); - } + ORT_RETURN_IF_ERROR(AddConstantProtoAsInitializer(subgraph_node_proto, std::nullopt)); } } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index 05747a7e5124d..076332a65c8f2 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -41,6 +41,35 @@ namespace onnxruntime { #if !defined(ORT_MINIMAL_BUILD) +void Model::RemoveLocalFunctionsProtos(const InlinedHashSet& retained) { + auto* local_functions = model_proto_.mutable_functions(); + if (retained.empty()) { + model_local_function_templates_maps_.clear(); + model_local_functions_.clear(); + local_functions->erase(local_functions->begin(), local_functions->end()); + } else { + const auto retained_end = retained.cend(); + for (auto it = model_local_functions_.begin(); + it != model_local_functions_.end();) { + if (retained.find(it->first) == retained_end) { + model_local_function_templates_maps_.erase(it->first); + it = model_local_functions_.erase(it); + } else { + ++it; + } + } + + for (auto it = local_functions->begin(); it != local_functions->end();) { + const auto function_id = function_utils::GetFunctionIdentifier(it->domain(), it->name()); + if (retained.find(function_id) == retained_end) { + it = local_functions->erase(it); + } else { + ++it; + } + } + } +} + static constexpr int DEFAULT_PROTOBUF_BLOCK_SIZE = 4 * 1024 * 1024; Model::Model(const std::string& graph_name, @@ -95,10 +124,10 @@ Model::Model(const std::string& graph_name, for (auto& func : model_local_functions) { auto func_ptr = model_proto_.add_functions(); func_ptr->CopyFrom(func); - model_local_functions_.insert_or_assign(function_utils::GetFunctionIdentifier(func_ptr->domain(), func_ptr->name()), func_ptr); + model_local_functions_.insert_or_assign(function_utils::GetFunctionIdentifier(func_ptr->domain(), func_ptr->name()), + func_ptr); } - model_local_function_templates_.reserve(model_proto_.functions().size()); model_local_function_templates_maps_.reserve(model_proto_.functions().size()); for (auto& func : model_proto_.functions()) { auto func_schema_ptr = function_utils::CreateSchema(func.domain(), @@ -111,8 +140,8 @@ Model::Model(const std::string& graph_name, auto func_template_ptr = std::make_unique(); func_template_ptr->op_schema_ = std::move(func_schema_ptr); func_template_ptr->onnx_func_proto_ = &func; - model_local_function_templates_.push_back(std::move(func_template_ptr)); - model_local_function_templates_maps_[function_utils::GetFunctionIdentifier(func.domain(), func.name())] = model_local_function_templates_.back().get(); + model_local_function_templates_maps_.insert_or_assign(function_utils::GetFunctionIdentifier(func.domain(), func.name()), + std::move(func_template_ptr)); } // need to call private ctor so can't use make_shared @@ -220,7 +249,6 @@ Model::Model(ModelProto&& model_proto, const PathString& model_path, model_local_functions_.insert_or_assign(function_utils::GetFunctionIdentifier(func.domain(), func.name()), &func); } - model_local_function_templates_.reserve(model_proto_.functions().size()); model_local_function_templates_maps_.reserve(model_proto_.functions().size()); for (auto& func : model_proto_.functions()) { auto func_schema_ptr = function_utils::CreateSchema(func.domain(), @@ -233,9 +261,7 @@ Model::Model(ModelProto&& model_proto, const PathString& model_path, auto 
func_template_ptr = std::make_unique(); func_template_ptr->op_schema_ = std::move(func_schema_ptr); func_template_ptr->onnx_func_proto_ = &func; - model_local_function_templates_.push_back(std::move(func_template_ptr)); - model_local_function_templates_maps_[function_utils::GetFunctionIdentifier(func.domain(), func.name())] = - model_local_function_templates_.back().get(); + model_local_function_templates_maps_.insert_or_assign(function_utils::GetFunctionIdentifier(func.domain(), func.name()), std::move(func_template_ptr)); } // create instance. need to call private ctor so can't use make_unique @@ -244,7 +270,7 @@ Model::Model(ModelProto&& model_proto, const PathString& model_path, logger, options.strict_shape_type_inference)); } -const InlinedHashMap& Model::GetModelLocalFunctionTemplates() const { +const NodeHashMap>& Model::GetModelLocalFunctionTemplates() const { return model_local_function_templates_maps_; } @@ -332,7 +358,7 @@ const Graph& Model::MainGraph() const noexcept { } #if !defined(ORT_MINIMAL_BUILD) -ModelProto Model::ToProto() { +ModelProto Model::ToProto() const { // We want to return back the original proto // To that end invoke const overload of ToGraphProto() // that returns by value and, therefore, allows us to filter @@ -346,7 +372,7 @@ ModelProto Model::ToProto() { ModelProto Model::ToGraphProtoWithExternalInitializers(const std::string& external_file_name, const PathString& file_path, - size_t initializer_size_threshold) { + size_t initializer_size_threshold) const { ModelProto result(model_proto_); const auto& graph = *graph_; *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name, diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 6bdb68dd734f0..4ce6660b794bc 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -139,7 +139,7 @@ class Model { // Returns empty string if not specified. const std::string GraphDocString() const; - const InlinedHashMap& GetModelLocalFunctionTemplates() const; + const NodeHashMap>& GetModelLocalFunctionTemplates() const; #else // Get model's IR version. @@ -182,14 +182,14 @@ class Model { #if !defined(ORT_MINIMAL_BUILD) // Get model's serialization proto data. - ONNX_NAMESPACE::ModelProto ToProto(); + ONNX_NAMESPACE::ModelProto ToProto() const; // Get model's serialization proto data. // Save initializer larger than the given threshold (in bytes) into an external binary file // with the given name. This function is useful to avoid hitting the size limit of protobuf files. ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::string& external_file_name, const PathString& file_path, - size_t initializer_size_threshold); + size_t initializer_size_threshold) const; #ifdef _WIN32 static common::Status Save(Model& model, const std::wstring& file_path); @@ -291,6 +291,13 @@ class Model { common::Status SaveToOrtFormat(flatbuffers::FlatBufferBuilder& builder, flatbuffers::Offset& model) const; + /// + /// Frees local function definitions in the model, excluding those in the `retained` set. + /// Called from GraphPartitioner::InlineFunctionsAOT. + /// + /// contains function IDs that should not be removed. + void RemoveLocalFunctionsProtos(const InlinedHashSet& retained); + #endif // !defined(ORT_MINIMAL_BUILD) static common::Status LoadFromOrtFormat(const onnxruntime::fbs::Model& fbs_model, @@ -312,14 +319,12 @@ class Model { // this map will be used for the local functions' schema's type/shape inference. 
// This container is used by ONNX code and must be an std::unordered_map. std::unordered_map model_local_functions_; - // this is the container that host the generated schemas for model local functions. - // the generated schemare will be used for graph resolving and type/shape inference. - // those schemas' type/shape inference will reference to the model_local_functions_ as context, - // so need to keep them with same lifetime. - InlinedVector> model_local_function_templates_; // this is the map from function id to the local function template. // this map will be used by graph to instantiate the function body. - InlinedHashMap model_local_function_templates_maps_; + // Defined as a node based map so the memory is released when not all of the functions + // are inlined and removed. + NodeHashMap> model_local_function_templates_maps_; + #else // properties that would normally come from ModelProto std::string producer_version_; diff --git a/onnxruntime/core/mlas/lib/aarch64/QgemmS8S8KernelSmmla.S b/onnxruntime/core/mlas/lib/aarch64/QgemmS8S8KernelSmmla.S new file mode 100644 index 0000000000000..e18846c89030e --- /dev/null +++ b/onnxruntime/core/mlas/lib/aarch64/QgemmS8S8KernelSmmla.S @@ -0,0 +1,922 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. +Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the MIT License. + +Module Name: + + QgemmS8S8KernelSmmla.s + +Abstract: + + This module implements the kernels for the Int8 precision matrix/matrix + multiply operation (QGEMM). + +--*/ + +#include "asmmacro.h" + + .text + +// +// Stack frame layout for the smmla kernel. d8-d15, x19-x30 need save +// + .equ .LMlasQgemmKernel_backup_x19_x20, 0 + .equ .LMlasQgemmKernel_backup_x21_x22, 16 + .equ .LMlasQgemmKernel_backup_x23_x24, 32 + .equ .LMlasQgemmKernel_backup_x25_x26, 48 + .equ .LMlasQgemmKernel_backup_x27_x28, 64 + .equ .LMlasQgemmKernel_backup_d8_d9, 80 + .equ .LMlasQgemmKernel_backup_d10_d11, 96 + .equ .LMlasQgemmKernel_backup_d12_d13, 112 + .equ .LMlasQgemmKernel_backup_d14_d15, 128 + .equ .LMlasQgemmKernel_SavedRegisters, 144 + .equ .LMlasQgemmKernel_SavedRegisters_Neg, -144 + + +// +// Init Row Accumulators +// +// Generates the code to initialize the accumulators for a single row of the output +// block. +// +// +// Accumulators are initialized to ZeroPointB * RowSum + ColumnSum +// x7 for RowSumsBuffer pointer +// x10 for ColumnSumBuffer pointer +// x11 for ZeroPointB buffer pointer +// +// v12~v13 for RowSums values +// v14~v15 for ColumnSums values +// v0~v3 for ZeroPointB values +// + .macro InitRowAccumulators Columns, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, RowSumReg + + mul v7.4s, v\RowSumReg\().4s, v8.4s + mov v\Vec1Reg\().16b, v7.16b + add v\Vec1Reg\().4s, v\Vec1Reg\().4s, v0.4s +.if \Columns\() > 2 + mul v7.4s, v\RowSumReg\().4s, v9.4s + mov v\Vec2Reg\().16b, v7.16b + add v\Vec2Reg\().4s, v\Vec2Reg\().4s, v1.4s +.endif +.if \Columns\() > 4 + mul v7.4s, v\RowSumReg\().4s, v10.4s + mov v\Vec3Reg\().16b, v7.16b + add v\Vec3Reg\().4s, v\Vec3Reg\().4s, v2.4s +.endif +.if \Columns\() > 6 + mul v7.4s, v\RowSumReg\().4s, v11.4s + mov v\Vec4Reg\().16b, v7.16b + add v\Vec4Reg\().4s, v\Vec4Reg\().4s, v3.4s +.endif + + .endm + + +// +// InitBlockAccumulators +// +// Generates the code to initialize the accumulators for 8x8 output +// block. 
+// + .macro InitBlockAccumulators Mode, Columns, Rows + + ld1 {v14.4s},[x10],#16 // load ColumnSumBuffer[0] +.if \Columns\() > 4 + ld1 {v15.4s},[x10],#16 // load ColumnSumBuffer[4] +.endif + // v4~v7 will be set to matrixB after this, so, they can used now + dup v4.4s,v14.s[0] // broadcast column + dup v5.4s,v14.s[1] + dup v6.4s,v14.s[2] + dup v7.4s,v14.s[3] + + zip1 v0.4s, v4.4s, v5.4s + zip2 v1.4s, v6.4s, v7.4s +.if \Columns\() > 4 + dup v4.4s,v15.s[0] // broadcast column + dup v5.4s,v15.s[1] + dup v6.4s,v15.s[2] + dup v7.4s,v15.s[3] + + zip1 v2.4s, v4.4s, v5.4s + zip2 v3.4s, v6.4s, v7.4s +.endif + + // v8~v11 will anyway get set in MatrixA loading, so they are free to use now + movi v8.4s, #1 + movi v9.4s, #1 + movi v10.4s, #1 + movi v11.4s, #1 + + cbz x11,.L\Mode\().InitBlock\Columns\().x\Rows\().SkipScaleByZeroPointB + + ld1 {v4.4s},[x11],#16 // load ZeroPointB[0] + ld1 {v5.4s},[x11],#16 // load ZeroPointB[4] + + dup v6.4s, v4.s[0] + dup v7.4s, v4.s[1] + zip1 v8.4s, v6.4s, v7.4s + + dup v6.4s, v4.s[2] + dup v7.4s, v4.s[3] + zip1 v9.4s, v6.4s, v7.4s + + dup v6.4s, v5.s[0] + dup v7.4s, v5.s[1] + zip1 v10.4s, v6.4s, v7.4s + + dup v6.4s, v5.s[2] + dup v7.4s, v5.s[3] + zip1 v11.4s, v6.4s, v7.4s + +.L\Mode\().InitBlock\Columns\().x\Rows\().SkipScaleByZeroPointB: + dup v4.4s, v12.s[0] //boardcast RowSums + dup v5.4s, v12.s[1] + + uzp1 v6.2d, v4.2d, v5.2d + + InitRowAccumulators \Columns\(),16,17,18,19,6 +.if \Rows\() > 2 + dup v4.4s, v12.s[2] //boardcast RowSums + dup v5.4s, v12.s[3] + + uzp1 v6.2d, v4.2d, v5.2d + + InitRowAccumulators \Columns\(),20,21,22,23,6 +.endif +.if \Rows\() > 4 + dup v4.4s,v13.s[0] // broadcast row sums + dup v5.4s,v13.s[1] + + uzp1 v6.2d, v4.2d, v5.2d + + InitRowAccumulators \Columns\(),24,25,26,27,6 +.endif +.if \Rows\() > 6 + dup v4.4s,v13.s[2] // broadcast row sums + dup v5.4s,v13.s[3] + + uzp1 v6.2d, v4.2d, v5.2d + InitRowAccumulators \Columns\(),28,29,30,31,6 +.endif + + .endm + + +// LoadPackedMatrixABy16Elements +// +// Generates the code to load 16 elements from matrix A. +// + .macro LoadPackedMatrixABy16Elements Rows +.if \Rows\() == 1 + ldr q8,[x0],#8 +.else + ldr q8,[x0],#16 + +.if \Rows\() > 2 + ldr q9,[x0],#16 +.endif + +.if \Rows\() > 4 + ldr q10,[x0],#16 +.endif + +.if \Rows\() > 6 + ldr q11,[x0],#16 +.endif +.endif + .endm + + +// +// MultiplyAccumulateRow +// +// Generates the code to multiply and accumulate a single row of the output +// block. +// + + .macro MultiplyAccumulateRow Columns, MatrixAReg, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg + + smmla v\Vec1Reg\().4s, \MatrixAReg\().16b, v4.16b +.if \Columns\() > 2 + smmla v\Vec2Reg\().4s, \MatrixAReg\().16b, v5.16b +.endif +.if \Columns\() > 4 + smmla v\Vec3Reg\().4s, \MatrixAReg\().16b, v6.16b +.endif +.if \Columns\() > 6 + smmla v\Vec4Reg\().4s, \MatrixAReg\().16b, v7.16b +.endif + + .endm + +// +// MultiplyAccumulateBlock +// +// Generates the code to multiply and accumulate into the output block. +// + + .macro MultiplyAccumulateBlock Columns, Rows + + MultiplyAccumulateRow \Columns\(),v8,16,17,18,19 +.if \Rows\() > 2 + MultiplyAccumulateRow \Columns\(),v9,20,21,22,23 +.endif +.if \Rows\() > 4 + MultiplyAccumulateRow \Columns\(),v10,24,25,26,27 +.endif +.if \Rows\() > 6 + MultiplyAccumulateRow \Columns\(),v11,28,29,30,31 +.endif + + .endm + +// +// ComputeBlockLoop +// +// Generates the code to loop over K entries of the input matrices to produce +// the output block. 
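`MultiplyAccumulateRow` above issues one `smmla` per pair of output columns. Per the Armv8.6 I8MM definition, each instruction multiplies a 2x8 signed 8-bit tile by the transpose of another 2x8 tile and accumulates a 2x2 int32 block. A scalar model of a single step, for orientation only (the real operands live in NEON registers after MLAS packing):

```cpp
#include <cstdint>

// acc += a * transpose(b), with a and b each holding a 2x8 int8 tile.
void SmmlaReference(int32_t acc[2][2], const int8_t a[2][8], const int8_t b[2][8]) {
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 8; ++k)
        acc[i][j] += static_cast<int32_t>(a[i][k]) * static_cast<int32_t>(b[j][k]);
}
```

The ummla kernel added later in this diff is the unsigned counterpart of the same scheme.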
+// + + .macro ComputeBlockLoop Mode, Columns, Rows + + InitBlockAccumulators \Mode\(), \Columns\(),\Rows\() + + sub x9,x3,#1 // block count to process + tbnz x9,#63,.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks + +.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop: + + LoadPackedMatrixABy16Elements \Rows\() + ld1 {v4.16b - v7.16b}, [x1], #64 + MultiplyAccumulateBlock \Columns\(),\Rows\() + + sub x9,x9,#1 + tbz x9,#63,.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop +.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks: + add x9,x9,#1 // correct for over-subtract above + cbz x9,.L\Mode\().Output\Columns\().x\Rows\().Block + +.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4PaddedLoop: + LoadPackedMatrixABy16Elements \Rows\() + ld1 {v4.16b - v7.16b}, [x1], #64 + MultiplyAccumulateBlock \Columns\(),\Rows\() + +.L\Mode\().Output\Columns\().x\Rows\().Block: + + .endm + + +// +// OutputRow2Element +// OutputRow4Element +// OutputRow6Element +// OutputRow8Element +// OutputRow10Element +// OutputRow12Element +// OutputRow14Element +// OutputRow16Element +// +// Generates the code to store elements to the output block. +// + + .macro OutputRow2Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr s8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr s9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + mov v8.S[2], v9.S[0] + add v8.4s,v8.4s,v\Vec1Reg\().4s + + mov w27, v8.S[0] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov w27, v8.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + mov w27, v\Vec1Reg\().S[0] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov w27, v\Vec1Reg\().S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.endif + + .endm + + + .macro OutputRow4Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr d8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr d9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + + mov v8.D[1], v9.D[0] + + add v8.4s,v8.4s,v\Vec1Reg\().4s + + mov x27, v8.D[0] + mov x28, v8.D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.else + mov x27, v\Vec1Reg\().D[0] + mov x28, v\Vec1Reg\().D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.endif + + .endm + + + .macro OutputRow6Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr d8,[\AddrReg1\()],#8 + ldr w28,[\AddrReg1\()],#-8 + mov v8.S[2], w28 +.if \last_row\() == 0 + ldr d9,[\AddrReg2\()],#8 + ldr w27,[\AddrReg2\()],#-8 + mov v9.S[2], w27 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + mov x27, v8.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v8.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov x27, v9.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v9.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + mov x27, v4.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v4.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov x27, v5.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v5.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.endif + + .endm + + + .macro OutputRow8Element Mode, AddrReg1, 
AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + +.endif + + .endm + + + .macro OutputRow10Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr w28, [\AddrReg1\()],#-16 + +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr w27,[\AddrReg2\()],#-16 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + mov v8.S[0], w28 + mov v8.S[2], w27 + + add v8.4s,v8.4s,v\Vec3Reg\().4s + + mov w27, v8.S[0] + mov w28, v8.S[2] + + str w27, [\AddrReg1\()],#4 +.if \last_row\() == 0 + str w28, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + mov w27, v\Vec3Reg\().S[0] + mov w28, v\Vec3Reg\().S[2] + + str w27, [\AddrReg1\()],#4 +.if \last_row\() == 0 + str w28, [\AddrReg2\()],#4 +.endif +.endif + +.endm + + + .macro OutputRow12Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr d10,[\AddrReg1\()],#-16 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr d11,[\AddrReg2\()],#-16 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + mov v11.D[0],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + + mov v10.D[1], v11.D[0] + + add v10.4s,v10.4s,v\Vec3Reg\().4s + + mov x27, v10.D[0] + mov x28, v10.D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + mov x27, v\Vec3Reg\().D[0] + mov x28, v\Vec3Reg\().D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif +.endif + + .endm + + .macro OutputRow14Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr d10,[\AddrReg1\()],#8 + ldr w28, [\AddrReg1\()],#-24 + mov v10.S[2], w28 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr d11,[\AddrReg2\()],#8 + ldr w27,[\AddrReg2\()],#-24 + mov v11.S[2], w27 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + + mov v11.D[0],x27 + mov v11.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, 
v\Vec3Reg\().2d,v\Vec4Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + add v10.4s,v10.4s,v6.4s + add v11.4s,v11.4s,v7.4s + + str q8,[\AddrReg1\()],#16 + + mov x27, v10.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v10.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 + mov x27, v11.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v11.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + str q4,[\AddrReg1\()],#16 + mov x27, v6.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v6.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 + mov x27, v7.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v7.S[2] + str w27, [\AddrReg2\()],#4 +.endif +.endif + + .endm + + + .macro OutputRow16Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldp q8,q10,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldp q9,q11,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + + mov v11.D[0],x27 + mov v11.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + add v10.4s,v10.4s,v6.4s + add v11.4s,v11.4s,v7.4s + + stp q8,q10,[\AddrReg1\()],#32 +.if \last_row\() == 0 + stp q9,q11,[\AddrReg2\()],#32 +.endif +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + stp q4,q6,[\AddrReg1\()],#32 +.if \last_row\() == 0 + stp q5,q7,[\AddrReg2\()],#32 +.endif +.endif + + .endm + +// +// OutputBlock +// +// Generates the code to store the output block. +// + + .macro OutputBlock Mode, Columns, Rows + + OutputRow\Columns\()Element \Mode\(),x2,x13,16,17,18,19,(\Rows\() == 1) + +.if \Rows\() > 2 + OutputRow\Columns\()Element \Mode\(),x14,x15,20,21,22,23,(\Rows\() == 3) +.endif + +.if \Rows\() > 4 + OutputRow\Columns\()Element \Mode\(),x16,x17,24,25,26,27,(\Rows\() == 5) +.endif + +.if \Rows\() > 6 + OutputRow\Columns\()Element \Mode\(),x18,x19,28,29,30,31,(\Rows\() == 7) +.endif + + .endm +// +// ProcessRows +// +// Generates the code to process a compute and store the output block for a +// fixed number of rows. 
+// + + .macro ProcessRows Mode, Rows + mov x4,#\Rows\() // return number of rows handled + cmp x5,#6 + ble .L\Mode\().ProcessNextColumnLoop6x\Rows\() + +.L\Mode\().ProcessNextColumnLoop8x\Rows\(): + ComputeBlockLoop \Mode\(),8,\Rows\() + + sub x5,x5,#8 + cmp x5,#0 + blt .L\Mode\().Output14ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),16,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#6 + bgt .L\Mode\().ProcessNextColumnLoop8x\Rows\() + cbz x5,.L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop6x\Rows\(): + + cmp x5,#4 + ble .L\Mode\().ProcessNextColumnLoop4x\Rows\() + ComputeBlockLoop \Mode\(),6,\Rows\() + sub x5,x5,#6 + cmp x5,#0 + blt .L\Mode\().Output10ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),12,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#4 + bgt .L\Mode\().ProcessNextColumnLoop6x\Rows\() + b .L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop4x\Rows\(): + cmp x5,#2 + ble .L\Mode\().ProcessNextColumnLoop2x\Rows\() + ComputeBlockLoop \Mode\(),4,\Rows\() + sub x5,x5,#4 + cmp x5,#0 + blt .L\Mode\().Output6ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),8,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#2 + bgt .L\Mode\().ProcessNextColumnLoop4x\Rows\() + b .L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop2x\Rows\(): + ComputeBlockLoop \Mode\(),2,\Rows\() + sub x5,x5,#2 + cmp x5,#0 + blt .L\Mode\().Output2ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),4,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#2 + b .L\Mode\().ExitKernel + +.L\Mode\().Output14ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),14,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output10ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),10,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output6ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),6,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output2ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),2,\Rows\() + b .L\Mode\().ExitKernel + + .endm + + +/*++ + +Routine Description: + + This routine is an inner kernel to compute matrix multiplication for a + set of rows. + +Arguments: + + A (x0) - Supplies the address of matrix A. The matrix data has been packed + using MlasGemmQuantCopyPackA. + + B (x1) - Supplies the address of matrix B. The matrix data has been packed + using MlasGemmQuantCopyPackB. + + C (x2) - Supplies the address of matrix C. + + PackedCountK (x3) - Supplies the number of packed columns from matrix A and + the number of packed rows from matrix B to iterate over. + + CountM (x4) - Supplies the maximum number of rows that can be processed for + matrix A and matrix C. The actual number of rows handled for this + invocation depends on the kernel implementation. + + CountN (x5) - Supplies the number of columns from matrix B and matrix C to + iterate over. + + ldc (x6) - Supplies the first dimension of matrix C. + + RowSumBuffer (x7) - Supplies the sum of each row from matrix A. These values + have been pre-scaled by the zero point offset of matrix B if the offset + is per-tensor (ZeroPointB is nullptr). Otherwise, these values must be + scaled by the per-column zero point offsets of matrix B. These values are + accumulated into every row of matrix C. + + ColumnSumBuffer - Supplies the sum of each column from matrix B multiplied + by the zero point offset of matrix A. These values are accumulated into + every column of matrix C. + + ZeroPointB - Optionally supplies the per-column zero point offsets of matrix + B, else nullptr if the matrix B is using per-tensor quantization. 
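The `RowSumBuffer`/`ColumnSumBuffer` arguments described above come from the usual zero-point expansion of a quantized GEMM, which is why the accumulators can be seeded with `ZeroPointB * RowSum + ColumnSum` before any multiply-accumulate runs:

$$\sum_{k}(A_{mk}-z_A)(B_{kn}-z_B)=\sum_{k}A_{mk}B_{kn}\;-\;z_B\sum_{k}A_{mk}\;-\;z_A\sum_{k}B_{kn}\;+\;K\,z_A z_B$$

How the packed buffers fold the signs and the $K\,z_A z_B$ term into the row and column sums is handled by the MLAS packing routines and is not shown in this diff.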
+ +Return Value: + + Returns the number of rows handled. + +--*/ + + .macro QgemmS8S8KernelSmmlaFunction Mode + + FUNCTION_ENTRY MlasGemmS8S8KernelSmmla\Mode\() + + ldr x10,[sp, #0] + ldr x11,[sp,#8] + + stp x19, x20, [sp, #.LMlasQgemmKernel_SavedRegisters_Neg]! + stp x21, x22, [sp, #.LMlasQgemmKernel_backup_x21_x22] + stp x23, x24, [sp, #.LMlasQgemmKernel_backup_x23_x24] + stp x25, x26, [sp, #.LMlasQgemmKernel_backup_x25_x26] + stp x27, x28, [sp, #.LMlasQgemmKernel_backup_x27_x28] + stp d8, d9, [sp, #.LMlasQgemmKernel_backup_d8_d9] + stp d10, d11, [sp, #.LMlasQgemmKernel_backup_d10_d11] + stp d12, d13, [sp, #.LMlasQgemmKernel_backup_d12_d13] + stp d14, d15, [sp, #.LMlasQgemmKernel_backup_d14_d15] + + add x13,x2,x6,lsl #2 // compute matrix C plus 1 row + add x14,x13,x6,lsl #2 // compute matrix C plus 2 rows + add x15,x14,x6,lsl #2 // compute matrix C plus 3 rows + add x16,x15,x6,lsl #2 // compute matrix C plus 4 rows + add x17,x16,x6,lsl #2 // compute matrix C plus 5 rows + add x18,x17,x6,lsl #2 // compute matrix C plus 6 rows + add x19,x18,x6,lsl #2 // compute matrix C plus 7 rows + + mov x8,x0 // save matrix A + +// +// Process 8 rows of the matrices. +// + ld1 {v12.4s},[x7],#16 // load row sum 1 ~ 4 + cmp x4,#8 + blt .L\Mode\().ProcessCountMLessThan8 + ld1 {v13.4s},[x7],#16 // load row sum 5 ~ 8 + ProcessRows \Mode\(),8 + +// +// Restore non-volatile registers and return. +// + +.L\Mode\().ExitKernel: + mov x0,x4 + + ldp d14, d15, [sp, #.LMlasQgemmKernel_backup_d14_d15] + ldp d12, d13, [sp, #.LMlasQgemmKernel_backup_d12_d13] + ldp d10, d11, [sp, #.LMlasQgemmKernel_backup_d10_d11] + ldp d8, d9, [sp, #.LMlasQgemmKernel_backup_d8_d9] + ldp x27, x28, [sp, #.LMlasQgemmKernel_backup_x27_x28] + ldp x25, x26, [sp, #.LMlasQgemmKernel_backup_x25_x26] + ldp x23, x24, [sp, #.LMlasQgemmKernel_backup_x23_x24] + ldp x21, x22, [sp, #.LMlasQgemmKernel_backup_x21_x22] + ldp x19, x20, [sp], #.LMlasQgemmKernel_SavedRegisters + + ret + +// +// Process 4 rows of the matrix. +// + +.L\Mode\().ProcessCountMLessThan8: + cmp x4,#4 + blt .L\Mode\().ProcessCountMLessThan4 + ProcessRows \Mode\(),4 + b .L\Mode\().ExitKernel + +// +// Process 2 row of the matrix. +// + +.L\Mode\().ProcessCountMLessThan4: + cmp x4,#2 + blt .L\Mode\().ProcessCountMLessThan2 + + ProcessRows \Mode\(),2 + b .L\Mode\().ExitKernel + + +// +// Process the last row of the matrix. +// + +.L\Mode\().ProcessCountMLessThan2: + ProcessRows \Mode\(),1 + b .L\Mode\().ExitKernel + + + .endm + + QgemmS8S8KernelSmmlaFunction Zero + QgemmS8S8KernelSmmlaFunction Add + + .end diff --git a/onnxruntime/core/mlas/lib/aarch64/QgemmU8X8KernelUmmla.S b/onnxruntime/core/mlas/lib/aarch64/QgemmU8X8KernelUmmla.S new file mode 100644 index 0000000000000..baf6e21e6ff06 --- /dev/null +++ b/onnxruntime/core/mlas/lib/aarch64/QgemmU8X8KernelUmmla.S @@ -0,0 +1,922 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. +Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the MIT License. + +Module Name: + + QgemmU8X8KernelUmmla.s + +Abstract: + + This module implements the kernels for the Int8 precision matrix/matrix + multiply operation (QGEMM). + +--*/ + +#include "asmmacro.h" + + .text + +// +// Stack frame layout for the ummla kernel. 
d8-d15, x19-x30 need save +// + .equ .LMlasQgemmKernel_backup_x19_x20, 0 + .equ .LMlasQgemmKernel_backup_x21_x22, 16 + .equ .LMlasQgemmKernel_backup_x23_x24, 32 + .equ .LMlasQgemmKernel_backup_x25_x26, 48 + .equ .LMlasQgemmKernel_backup_x27_x28, 64 + .equ .LMlasQgemmKernel_backup_d8_d9, 80 + .equ .LMlasQgemmKernel_backup_d10_d11, 96 + .equ .LMlasQgemmKernel_backup_d12_d13, 112 + .equ .LMlasQgemmKernel_backup_d14_d15, 128 + .equ .LMlasQgemmKernel_SavedRegisters, 144 + .equ .LMlasQgemmKernel_SavedRegisters_Neg, -144 + + +// +// Init Row Accumulators +// +// Generates the code to initialize the accumulators for a single row of the output +// block. +// +// +// Accumulators are initialized to ZeroPointB * RowSum + ColumnSum +// x7 for RowSumsBuffer pointer +// x10 for ColumnSumBuffer pointer +// x11 for ZeroPointB buffer pointer +// +// v12~v13 for RowSums values +// v14~v15 for ColumnSums values +// v0~v3 for ZeroPointB values +// + .macro InitRowAccumulators Columns, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, RowSumReg + + mul v7.4s, v\RowSumReg\().4s, v8.4s + mov v\Vec1Reg\().16b, v7.16b + add v\Vec1Reg\().4s, v\Vec1Reg\().4s, v0.4s +.if \Columns\() > 2 + mul v7.4s, v\RowSumReg\().4s, v9.4s + mov v\Vec2Reg\().16b, v7.16b + add v\Vec2Reg\().4s, v\Vec2Reg\().4s, v1.4s +.endif +.if \Columns\() > 4 + mul v7.4s, v\RowSumReg\().4s, v10.4s + mov v\Vec3Reg\().16b, v7.16b + add v\Vec3Reg\().4s, v\Vec3Reg\().4s, v2.4s +.endif +.if \Columns\() > 6 + mul v7.4s, v\RowSumReg\().4s, v11.4s + mov v\Vec4Reg\().16b, v7.16b + add v\Vec4Reg\().4s, v\Vec4Reg\().4s, v3.4s +.endif + + .endm + + +// +// InitBlockAccumulators +// +// Generates the code to initialize the accumulators for 8x8 output +// block. +// + .macro InitBlockAccumulators Mode, Columns, Rows + + ld1 {v14.4s},[x10],#16 // load ColumnSumBuffer[0] +.if \Columns\() > 4 + ld1 {v15.4s},[x10],#16 // load ColumnSumBuffer[4] +.endif + // v4~v7 will be set to matrixB after this, so, they can used now + dup v4.4s,v14.s[0] // broadcast column + dup v5.4s,v14.s[1] + dup v6.4s,v14.s[2] + dup v7.4s,v14.s[3] + + zip1 v0.4s, v4.4s, v5.4s + zip2 v1.4s, v6.4s, v7.4s +.if \Columns\() > 4 + dup v4.4s,v15.s[0] // broadcast column + dup v5.4s,v15.s[1] + dup v6.4s,v15.s[2] + dup v7.4s,v15.s[3] + + zip1 v2.4s, v4.4s, v5.4s + zip2 v3.4s, v6.4s, v7.4s +.endif + + // v8~v11 will anyway get set in MatrixA loading, so they are free to use now + movi v8.4s, #1 + movi v9.4s, #1 + movi v10.4s, #1 + movi v11.4s, #1 + + cbz x11,.L\Mode\().InitBlock\Columns\().x\Rows\().SkipScaleByZeroPointB + + ld1 {v4.4s},[x11],#16 // load ZeroPointB[0] + ld1 {v5.4s},[x11],#16 // load ZeroPointB[4] + + dup v6.4s, v4.s[0] + dup v7.4s, v4.s[1] + zip1 v8.4s, v6.4s, v7.4s + + dup v6.4s, v4.s[2] + dup v7.4s, v4.s[3] + zip1 v9.4s, v6.4s, v7.4s + + dup v6.4s, v5.s[0] + dup v7.4s, v5.s[1] + zip1 v10.4s, v6.4s, v7.4s + + dup v6.4s, v5.s[2] + dup v7.4s, v5.s[3] + zip1 v11.4s, v6.4s, v7.4s + +.L\Mode\().InitBlock\Columns\().x\Rows\().SkipScaleByZeroPointB: + dup v4.4s, v12.s[0] //boardcast RowSums + dup v5.4s, v12.s[1] + + uzp1 v6.2d, v4.2d, v5.2d + + InitRowAccumulators \Columns\(),16,17,18,19,6 +.if \Rows\() > 2 + dup v4.4s, v12.s[2] //boardcast RowSums + dup v5.4s, v12.s[3] + + uzp1 v6.2d, v4.2d, v5.2d + + InitRowAccumulators \Columns\(),20,21,22,23,6 +.endif +.if \Rows\() > 4 + dup v4.4s,v13.s[0] // broadcast row sums + dup v5.4s,v13.s[1] + + uzp1 v6.2d, v4.2d, v5.2d + + InitRowAccumulators \Columns\(),24,25,26,27,6 +.endif +.if \Rows\() > 6 + dup v4.4s,v13.s[2] // broadcast row sums + dup v5.4s,v13.s[3] + + 
uzp1 v6.2d, v4.2d, v5.2d + InitRowAccumulators \Columns\(),28,29,30,31,6 +.endif + + .endm + + +// LoadPackedMatrixABy16Elements +// +// Generates the code to load 16 elements from matrix A. +// + .macro LoadPackedMatrixABy16Elements Rows +.if \Rows\() == 1 + ldr q8,[x0],#8 +.else + ldr q8,[x0],#16 + +.if \Rows\() > 2 + ldr q9,[x0],#16 +.endif + +.if \Rows\() > 4 + ldr q10,[x0],#16 +.endif + +.if \Rows\() > 6 + ldr q11,[x0],#16 +.endif +.endif + .endm + + +// +// MultiplyAccumulateRow +// +// Generates the code to multiply and accumulate a single row of the output +// block. +// + + .macro MultiplyAccumulateRow Columns, MatrixAReg, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg + + ummla v\Vec1Reg\().4s, \MatrixAReg\().16b, v4.16b +.if \Columns\() > 2 + ummla v\Vec2Reg\().4s, \MatrixAReg\().16b, v5.16b +.endif +.if \Columns\() > 4 + ummla v\Vec3Reg\().4s, \MatrixAReg\().16b, v6.16b +.endif +.if \Columns\() > 6 + ummla v\Vec4Reg\().4s, \MatrixAReg\().16b, v7.16b +.endif + + .endm + +// +// MultiplyAccumulateBlock +// +// Generates the code to multiply and accumulate into the output block. +// + + .macro MultiplyAccumulateBlock Columns, Rows + + MultiplyAccumulateRow \Columns\(),v8,16,17,18,19 +.if \Rows\() > 2 + MultiplyAccumulateRow \Columns\(),v9,20,21,22,23 +.endif +.if \Rows\() > 4 + MultiplyAccumulateRow \Columns\(),v10,24,25,26,27 +.endif +.if \Rows\() > 6 + MultiplyAccumulateRow \Columns\(),v11,28,29,30,31 +.endif + + .endm + +// +// ComputeBlockLoop +// +// Generates the code to loop over K entries of the input matrices to produce +// the output block. +// + + .macro ComputeBlockLoop Mode, Columns, Rows + + InitBlockAccumulators \Mode\(), \Columns\(),\Rows\() + + sub x9,x3,#1 // block count to process + tbnz x9,#63,.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks + +.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop: + + LoadPackedMatrixABy16Elements \Rows\() + ld1 {v4.16b - v7.16b}, [x1], #64 + MultiplyAccumulateBlock \Columns\(),\Rows\() + + sub x9,x9,#1 + tbz x9,#63,.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4Loop +.L\Mode\().ProcessRemaining\Columns\().x\Rows\().Blocks: + add x9,x9,#1 // correct for over-subtract above + cbz x9,.L\Mode\().Output\Columns\().x\Rows\().Block + +.L\Mode\().Compute\Columns\().x\Rows\().BlockBy4PaddedLoop: + LoadPackedMatrixABy16Elements \Rows\() + ld1 {v4.16b - v7.16b}, [x1], #64 + MultiplyAccumulateBlock \Columns\(),\Rows\() + +.L\Mode\().Output\Columns\().x\Rows\().Block: + + .endm + + +// +// OutputRow2Element +// OutputRow4Element +// OutputRow6Element +// OutputRow8Element +// OutputRow10Element +// OutputRow12Element +// OutputRow14Element +// OutputRow16Element +// +// Generates the code to store elements to the output block. 
+// + + .macro OutputRow2Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr s8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr s9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + mov v8.S[2], v9.S[0] + add v8.4s,v8.4s,v\Vec1Reg\().4s + + mov w27, v8.S[0] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov w27, v8.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + mov w27, v\Vec1Reg\().S[0] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov w27, v\Vec1Reg\().S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.endif + + .endm + + + .macro OutputRow4Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr d8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr d9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + + mov v8.D[1], v9.D[0] + + add v8.4s,v8.4s,v\Vec1Reg\().4s + + mov x27, v8.D[0] + mov x28, v8.D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.else + mov x27, v\Vec1Reg\().D[0] + mov x28, v\Vec1Reg\().D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.endif + + .endm + + + .macro OutputRow6Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr d8,[\AddrReg1\()],#8 + ldr w28,[\AddrReg1\()],#-8 + mov v8.S[2], w28 +.if \last_row\() == 0 + ldr d9,[\AddrReg2\()],#8 + ldr w27,[\AddrReg2\()],#-8 + mov v9.S[2], w27 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + mov x27, v8.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v8.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov x27, v9.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v9.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + mov x27, v4.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v4.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + mov x27, v5.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v5.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.endif + + .endm + + + .macro OutputRow8Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#0 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + +.endif + + .endm + + + .macro OutputRow10Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr w28, [\AddrReg1\()],#-16 + +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr w27,[\AddrReg2\()],#-16 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + str 
q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + mov v8.S[0], w28 + mov v8.S[2], w27 + + add v8.4s,v8.4s,v\Vec3Reg\().4s + + mov w27, v8.S[0] + mov w28, v8.S[2] + + str w27, [\AddrReg1\()],#4 +.if \last_row\() == 0 + str w28, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + mov w27, v\Vec3Reg\().S[0] + mov w28, v\Vec3Reg\().S[2] + + str w27, [\AddrReg1\()],#4 +.if \last_row\() == 0 + str w28, [\AddrReg2\()],#4 +.endif +.endif + +.endm + + + .macro OutputRow12Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr d10,[\AddrReg1\()],#-16 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr d11,[\AddrReg2\()],#-16 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + mov v11.D[0],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + + str q8,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 +.endif + + mov v10.D[1], v11.D[0] + + add v10.4s,v10.4s,v\Vec3Reg\().4s + + mov x27, v10.D[0] + mov x28, v10.D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + + str q4,[\AddrReg1\()],#16 +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 +.endif + mov x27, v\Vec3Reg\().D[0] + mov x28, v\Vec3Reg\().D[1] + + str x27, [\AddrReg1\()],#8 +.if \last_row\() == 0 + str x28, [\AddrReg2\()],#8 +.endif +.endif + + .endm + + .macro OutputRow14Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldr q8,[\AddrReg1\()],#16 + ldr d10,[\AddrReg1\()],#8 + ldr w28, [\AddrReg1\()],#-24 + mov v10.S[2], w28 +.if \last_row\() == 0 + ldr q9,[\AddrReg2\()],#16 + ldr d11,[\AddrReg2\()],#8 + ldr w27,[\AddrReg2\()],#-24 + mov v11.S[2], w27 +.else + mov x27,#0 + mov v9.D[0],x27 + mov v9.D[1],x27 + + mov v11.D[0],x27 + mov v11.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + add v10.4s,v10.4s,v6.4s + add v11.4s,v11.4s,v7.4s + + str q8,[\AddrReg1\()],#16 + + mov x27, v10.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v10.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + str q9,[\AddrReg2\()],#16 + mov x27, v11.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v11.S[2] + str w27, [\AddrReg2\()],#4 +.endif + +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + str q4,[\AddrReg1\()],#16 + mov x27, v6.D[0] + str x27, [\AddrReg1\()],#8 + mov w27, v6.S[2] + str w27, [\AddrReg1\()],#4 + +.if \last_row\() == 0 + str q5,[\AddrReg2\()],#16 + mov x27, v7.D[0] + str x27, [\AddrReg2\()],#8 + mov w27, v7.S[2] + str w27, [\AddrReg2\()],#4 +.endif +.endif + + .endm + + + .macro OutputRow16Element Mode, AddrReg1, AddrReg2, Vec1Reg, Vec2Reg, Vec3Reg, Vec4Reg, last_row + +.ifeqs "\Mode\()","Add" + ldp q8,q10,[\AddrReg1\()],#0 +.if \last_row\() == 0 + ldp q9,q11,[\AddrReg2\()],#0 +.else + mov x27,#0 + 
mov v9.D[0],x27 + mov v9.D[1],x27 + + mov v11.D[0],x27 + mov v11.D[1],x27 +.endif + uzp1 v4.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d,v\Vec1Reg\().2d,v\Vec2Reg\().2d + + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + add v8.4s,v8.4s,v4.4s + add v9.4s,v9.4s,v5.4s + add v10.4s,v10.4s,v6.4s + add v11.4s,v11.4s,v7.4s + + stp q8,q10,[\AddrReg1\()],#32 +.if \last_row\() == 0 + stp q9,q11,[\AddrReg2\()],#32 +.endif +.else + uzp1 v4.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp2 v5.2d, v\Vec1Reg\().2d,v\Vec2Reg\().2d + uzp1 v6.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + uzp2 v7.2d, v\Vec3Reg\().2d,v\Vec4Reg\().2d + + stp q4,q6,[\AddrReg1\()],#32 +.if \last_row\() == 0 + stp q5,q7,[\AddrReg2\()],#32 +.endif +.endif + + .endm + +// +// OutputBlock +// +// Generates the code to store the output block. +// + + .macro OutputBlock Mode, Columns, Rows + + OutputRow\Columns\()Element \Mode\(),x2,x13,16,17,18,19,(\Rows\() == 1) + +.if \Rows\() > 2 + OutputRow\Columns\()Element \Mode\(),x14,x15,20,21,22,23,(\Rows\() == 3) +.endif + +.if \Rows\() > 4 + OutputRow\Columns\()Element \Mode\(),x16,x17,24,25,26,27,(\Rows\() == 5) +.endif + +.if \Rows\() > 6 + OutputRow\Columns\()Element \Mode\(),x18,x19,28,29,30,31,(\Rows\() == 7) +.endif + + .endm +// +// ProcessRows +// +// Generates the code to process a compute and store the output block for a +// fixed number of rows. +// + + .macro ProcessRows Mode, Rows + mov x4,#\Rows\() // return number of rows handled + cmp x5,#6 + ble .L\Mode\().ProcessNextColumnLoop6x\Rows\() + +.L\Mode\().ProcessNextColumnLoop8x\Rows\(): + ComputeBlockLoop \Mode\(),8,\Rows\() + + sub x5,x5,#8 + cmp x5,#0 + blt .L\Mode\().Output14ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),16,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#6 + bgt .L\Mode\().ProcessNextColumnLoop8x\Rows\() + cbz x5,.L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop6x\Rows\(): + + cmp x5,#4 + ble .L\Mode\().ProcessNextColumnLoop4x\Rows\() + ComputeBlockLoop \Mode\(),6,\Rows\() + sub x5,x5,#6 + cmp x5,#0 + blt .L\Mode\().Output10ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),12,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#4 + bgt .L\Mode\().ProcessNextColumnLoop6x\Rows\() + b .L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop4x\Rows\(): + cmp x5,#2 + ble .L\Mode\().ProcessNextColumnLoop2x\Rows\() + ComputeBlockLoop \Mode\(),4,\Rows\() + sub x5,x5,#4 + cmp x5,#0 + blt .L\Mode\().Output6ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),8,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#2 + bgt .L\Mode\().ProcessNextColumnLoop4x\Rows\() + b .L\Mode\().ExitKernel + +.L\Mode\().ProcessNextColumnLoop2x\Rows\(): + ComputeBlockLoop \Mode\(),2,\Rows\() + sub x5,x5,#2 + cmp x5,#0 + blt .L\Mode\().Output2ElementsOnlyFor\Rows\() + OutputBlock \Mode\(),4,\Rows\() + mov x0,x8 // reload matrix A + cmp x5,#2 + b .L\Mode\().ExitKernel + +.L\Mode\().Output14ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),14,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output10ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),10,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output6ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),6,\Rows\() + b .L\Mode\().ExitKernel + + +.L\Mode\().Output2ElementsOnlyFor\Rows\(): + OutputBlock \Mode\(),2,\Rows\() + b .L\Mode\().ExitKernel + + .endm + + +/*++ + +Routine Description: + + This routine is an inner kernel to compute matrix multiplication for a + set of rows. + +Arguments: + + A (x0) - Supplies the address of matrix A. 
The matrix data has been packed + using MlasGemmQuantCopyPackA. + + B (x1) - Supplies the address of matrix B. The matrix data has been packed + using MlasGemmQuantCopyPackB. + + C (x2) - Supplies the address of matrix C. + + PackedCountK (x3) - Supplies the number of packed columns from matrix A and + the number of packed rows from matrix B to iterate over. + + CountM (x4) - Supplies the maximum number of rows that can be processed for + matrix A and matrix C. The actual number of rows handled for this + invocation depends on the kernel implementation. + + CountN (x5) - Supplies the number of columns from matrix B and matrix C to + iterate over. + + ldc (x6) - Supplies the first dimension of matrix C. + + RowSumBuffer (x7) - Supplies the sum of each row from matrix A. These values + have been pre-scaled by the zero point offset of matrix B if the offset + is per-tensor (ZeroPointB is nullptr). Otherwise, these values must be + scaled by the per-column zero point offsets of matrix B. These values are + accumulated into every row of matrix C. + + ColumnSumBuffer - Supplies the sum of each column from matrix B multiplied + by the zero point offset of matrix A. These values are accumulated into + every column of matrix C. + + ZeroPointB - Optionally supplies the per-column zero point offsets of matrix + B, else nullptr if the matrix B is using per-tensor quantization. + +Return Value: + + Returns the number of rows handled. + +--*/ + + .macro QgemmU8X8KernelUmmlaFunction Mode + + FUNCTION_ENTRY MlasGemmU8X8KernelUmmla\Mode\() + + ldr x10,[sp, #0] + ldr x11,[sp,#8] + + stp x19, x20, [sp, #.LMlasQgemmKernel_SavedRegisters_Neg]! + stp x21, x22, [sp, #.LMlasQgemmKernel_backup_x21_x22] + stp x23, x24, [sp, #.LMlasQgemmKernel_backup_x23_x24] + stp x25, x26, [sp, #.LMlasQgemmKernel_backup_x25_x26] + stp x27, x28, [sp, #.LMlasQgemmKernel_backup_x27_x28] + stp d8, d9, [sp, #.LMlasQgemmKernel_backup_d8_d9] + stp d10, d11, [sp, #.LMlasQgemmKernel_backup_d10_d11] + stp d12, d13, [sp, #.LMlasQgemmKernel_backup_d12_d13] + stp d14, d15, [sp, #.LMlasQgemmKernel_backup_d14_d15] + + add x13,x2,x6,lsl #2 // compute matrix C plus 1 row + add x14,x13,x6,lsl #2 // compute matrix C plus 2 rows + add x15,x14,x6,lsl #2 // compute matrix C plus 3 rows + add x16,x15,x6,lsl #2 // compute matrix C plus 4 rows + add x17,x16,x6,lsl #2 // compute matrix C plus 5 rows + add x18,x17,x6,lsl #2 // compute matrix C plus 6 rows + add x19,x18,x6,lsl #2 // compute matrix C plus 7 rows + + mov x8,x0 // save matrix A + +// +// Process 8 rows of the matrices. +// + ld1 {v12.4s},[x7],#16 // load row sum 1 ~ 4 + cmp x4,#8 + blt .L\Mode\().ProcessCountMLessThan8 + ld1 {v13.4s},[x7],#16 // load row sum 5 ~ 8 + ProcessRows \Mode\(),8 + +// +// Restore non-volatile registers and return. +// + +.L\Mode\().ExitKernel: + mov x0,x4 + + ldp d14, d15, [sp, #.LMlasQgemmKernel_backup_d14_d15] + ldp d12, d13, [sp, #.LMlasQgemmKernel_backup_d12_d13] + ldp d10, d11, [sp, #.LMlasQgemmKernel_backup_d10_d11] + ldp d8, d9, [sp, #.LMlasQgemmKernel_backup_d8_d9] + ldp x27, x28, [sp, #.LMlasQgemmKernel_backup_x27_x28] + ldp x25, x26, [sp, #.LMlasQgemmKernel_backup_x25_x26] + ldp x23, x24, [sp, #.LMlasQgemmKernel_backup_x23_x24] + ldp x21, x22, [sp, #.LMlasQgemmKernel_backup_x21_x22] + ldp x19, x20, [sp], #.LMlasQgemmKernel_SavedRegisters + + ret + +// +// Process 4 rows of the matrix. 
+// + +.L\Mode\().ProcessCountMLessThan8: + cmp x4,#4 + blt .L\Mode\().ProcessCountMLessThan4 + ProcessRows \Mode\(),4 + b .L\Mode\().ExitKernel + +// +// Process 2 row of the matrix. +// + +.L\Mode\().ProcessCountMLessThan4: + cmp x4,#2 + blt .L\Mode\().ProcessCountMLessThan2 + + ProcessRows \Mode\(),2 + b .L\Mode\().ExitKernel + + +// +// Process the last row of the matrix. +// + +.L\Mode\().ProcessCountMLessThan2: + ProcessRows \Mode\(),1 + b .L\Mode\().ExitKernel + + + .endm + + QgemmU8X8KernelUmmlaFunction Zero + QgemmU8X8KernelUmmlaFunction Add + + .end diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index b6ac4a1ca1d6c..2fdb0dda5d25c 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -184,11 +184,17 @@ class MLASCPUIDInfo bool IsCurrentCoreArmv8NarrowLd() const { return false; } + bool HasArmNeon_I8MM() const { return has_arm_neon_i8mm_; } + + bool HasArmSVE_I8MM() const { return has_arm_sve_i8mm_; } + private: MLASCPUIDInfo(); bool has_arm_neon_dot_{false}; bool has_fp16_{false}; + bool has_arm_neon_i8mm_{false}; + bool has_arm_sve_i8mm_{false}; }; using MLAS_CPUIDINFO = MLASCPUIDInfo; @@ -856,6 +862,8 @@ extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchNeon; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmX8S8DispatchNeon; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchUdot; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSdot; +extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchUmmla; +extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSmmla; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmSimd; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmQuantDispatchDefault; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemm8X8DispatchPOWER10; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 3c0f82408179b..39586282e00ad 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -52,6 +52,14 @@ MLASCPUIDInfo::MLASCPUIDInfo() #define HWCAP_ASIMDDP (1 << 20) #endif +#ifndef HWCAP2_I8MM +#define HWCAP2_I8MM (1 << 13) +#endif + +#ifndef HWCAP2_SVEI8MM +#define HWCAP2_SVEI8MM (1 << 9) +#endif + #if defined(BUILD_MLAS_NO_ONNXRUNTIME) MLASCPUIDInfo::MLASCPUIDInfo() { @@ -59,6 +67,9 @@ MLASCPUIDInfo::MLASCPUIDInfo() // raw hack! Need CPUIDInfo implementation for more precise detection has_fp16_ = has_arm_neon_dot_; + + has_arm_neon_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0); + has_arm_sve_i8mm_ = ((getauxval(AT_HWCAP2) & HWCAP2_SVEI8MM) != 0); } #endif @@ -480,6 +491,17 @@ Return Value: this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot; } +#if defined(__linux__) + // + // Check if the processor supports ASIMD I8MM instructions. + // + if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) { + this->GemmU8U8Dispatch = &MlasGemmU8X8DispatchUmmla; + this->GemmU8S8Dispatch = &MlasGemmU8X8DispatchUmmla; + this->GemmS8S8Dispatch = &MlasGemmS8S8DispatchSmmla; + } +#endif + #endif // MLAS_TARGET_ARM64 #if defined(MLAS_TARGET_POWER) this->GemmFloatKernel = MlasSgemmKernel; diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_smmla.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_smmla.cpp new file mode 100644 index 0000000000000..c41f43ca22d18 --- /dev/null +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_smmla.cpp @@ -0,0 +1,964 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. +Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ +Licensed under the MIT License. + +Module Name: + + qgemm_kernel_smmla.cpp + +Abstract: + + This module implements smmla QGEMM kernel. + +--*/ + +#include "mlasi.h" +#include "qgemm.h" + +// +// Define the prototypes of the NEON SMMLA routines written in assembly. +// + +extern "C" { + +size_t MLASCALL +MlasGemmS8S8KernelSmmlaZero(const uint8_t* A, + const uint8_t* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumVector, + const int32_t* ColumnSumVector, + const int32_t* ZeroPointB); + +size_t MLASCALL +MlasGemmS8S8KernelSmmlaAdd(const uint8_t* A, + const uint8_t* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumVector, + const int32_t* ColumnSumVector, + const int32_t* ZeroPointB); +} + +struct MLAS_GEMM_S8S8_KERNEL_SMMLA { + typedef uint8_t PackedAType; + typedef uint8_t PackedBType; + typedef int8_t OffsetAType; + typedef int8_t OffsetBType; + + static constexpr size_t PackedK = 8; + static constexpr MLAS_GEMM_QUANT_STRIDES Strides{24, 128, 256}; + static constexpr MLAS_GEMM_QUANT_STRIDES PackedStrides{24, 128, 384}; +}; + +constexpr size_t MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedK; +constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_S8S8_KERNEL_SMMLA::Strides; +constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedStrides; + +template <> +MLAS_FORCEINLINE int32_t +MlasGemmQuantFixupZeroPointB(int32_t ZeroPointB, bool BIsSigned) +{ + MLAS_UNREFERENCED_PARAMETER(BIsSigned); + return ZeroPointB; +} + +template <> +void +MlasGemmQuantCopyPackA( + MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedAType* D_uint8_t, + const uint8_t* A, + size_t lda, + size_t CountM, + size_t CountK, + int32_t* RowSumBuffer, + bool AIsSigned) +{ + int8_t* D = reinterpret_cast(D_uint8_t); + MLAS_UNREFERENCED_PARAMETER(AIsSigned); + int8_t PaddedMatrixAData[64]; + + // + // Process 8 rows of matrix A. + // + // MMLA kernels load 8x8 block of A with four vector registers. So A is packed + // a series of 64 byte vectors where eight rows are interleaved with the + // following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // [ B0 B1 B2 B3 B4 B5 B6 B7 ] + // [ C0 C1 C2 C3 C4 C5 C6 C7 ] + // [ D0 D1 D2 D3 D4 D5 D6 D7 ] + // [ E0 E1 E2 E3 E4 E5 E6 E7 ] + // [ F0 F1 F2 F3 F4 F5 F6 F7 ] + // [ G0 G1 G2 G3 G4 G5 G6 G7 ] + // [ H0 H1 H2 H3 H4 H5 H6 H7 ] + // + // ... + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of eight, then the vector is padded + // with zeroes. 
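The NEON zip/store sequence in the loop that follows implements this interleaved layout. As a reading aid, here is a hedged scalar reference for the element order it produces; pack_a_8rows is a hypothetical helper used only for illustration, and the zero padding matches the K-tail handling described above.

    #include <cstddef>
    #include <cstdint>

    // Rows are taken in pairs, and each row of a pair contributes 8 contiguous
    // K bytes, so every 16 packed bytes form one 2x8 tile ready for an MMLA
    // instruction. The pattern repeats for each 8-deep K slice.
    void pack_a_8rows(int8_t* D, const int8_t* A, size_t lda, size_t CountK)
    {
        for (size_t k0 = 0; k0 < CountK; k0 += 8) {
            for (size_t pair = 0; pair < 4; ++pair) {      // rows (0,1),(2,3),(4,5),(6,7)
                for (size_t r = 0; r < 2; ++r) {
                    for (size_t kk = 0; kk < 8; ++kk) {
                        size_t row = pair * 2 + r;
                        size_t k = k0 + kk;
                        *D++ = (k < CountK) ? A[row * lda + k] : 0;  // zero pad the K tail
                    }
                }
            }
        }
    }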
+ // + + while (CountM >= 8) { + const int8_t* a0 = reinterpret_cast(A); + const int8_t* a1 = a0 + lda; + const int8_t* a2 = a0 + lda * 2; + const int8_t* a3 = a0 + lda * 3; + const int8_t* a4 = a0 + lda * 4; + const int8_t* a5 = a0 + lda * 5; + const int8_t* a6 = a0 + lda * 6; + const int8_t* a7 = a0 + lda * 7; + + size_t k = CountK; + int32x4_t RowSums0 = vmovq_n_s32(0); + int32x4_t RowSums1 = vmovq_n_s32(0); + + while (k >= 16) { + int64x2_t v0 = vld1q_s64(reinterpret_cast(a0)); + a0 += 16; + int64x2_t v1 = vld1q_s64(reinterpret_cast(a1)); + a1 += 16; + int64x2_t v2 = vld1q_s64(reinterpret_cast(a2)); + a2 += 16; + int64x2_t v3 = vld1q_s64(reinterpret_cast(a3)); + a3 += 16; + int64x2_t v4 = vld1q_s64(reinterpret_cast(a4)); + a4 += 16; + int64x2_t v5 = vld1q_s64(reinterpret_cast(a5)); + a5 += 16; + int64x2_t v6 = vld1q_s64(reinterpret_cast(a6)); + a6 += 16; + int64x2_t v7 = vld1q_s64(reinterpret_cast(a7)); + a7 += 16; + + int64x2_t z0 = vzip1q_s64(v0, v1); + int64x2_t z1 = vzip2q_s64(v0, v1); + int64x2_t z2 = vzip1q_s64(v2, v3); + int64x2_t z3 = vzip2q_s64(v2, v3); + + int64x2_t z4 = vzip1q_s64(v4, v5); + int64x2_t z5 = vzip2q_s64(v4, v5); + int64x2_t z6 = vzip1q_s64(v6, v7); + int64x2_t z7 = vzip2q_s64(v6, v7); + + vst1q_s8(&D[0], vreinterpretq_s8_s64(z0)); + vst1q_s8(&D[16], vreinterpretq_s8_s64(z2)); + vst1q_s8(&D[32], vreinterpretq_s8_s64(z4)); + vst1q_s8(&D[48], vreinterpretq_s8_s64(z6)); + vst1q_s8(&D[64], vreinterpretq_s8_s64(z1)); + vst1q_s8(&D[80], vreinterpretq_s8_s64(z3)); + vst1q_s8(&D[96], vreinterpretq_s8_s64(z5)); + vst1q_s8(&D[112], vreinterpretq_s8_s64(z7)); + + int32x4_t RowSums0L_pada = vmovq_n_s32(0); + RowSums0L_pada = vpadalq_s16(RowSums0L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z0))); + RowSums0L_pada = vpadalq_s16(RowSums0L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z1))); + + int32x4_t RowSums0L_ext = vextq_s32(RowSums0L_pada, RowSums0L_pada, 1); + int32x4_t RowSums0L_add = vaddq_s32(RowSums0L_pada, RowSums0L_ext); + int32x2_t RowSums0L = {vdups_laneq_s32(RowSums0L_add, 0), + vdups_laneq_s32(RowSums0L_add, 2)}; + + int32x4_t RowSums0H_pada = vmovq_n_s32(0); + RowSums0H_pada = vpadalq_s16(RowSums0H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z2))); + RowSums0H_pada = vpadalq_s16(RowSums0H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z3))); + + int32x4_t RowSums0H_ext = vextq_s32(RowSums0H_pada, RowSums0H_pada, 1); + int32x4_t RowSums0H_add = vaddq_s32(RowSums0H_pada, RowSums0H_ext); + int32x2_t RowSums0H = {vdups_laneq_s32(RowSums0H_add, 0), + vdups_laneq_s32(RowSums0H_add, 2)}; + + RowSums0 = vaddq_s32(RowSums0, vcombine_s32(RowSums0L, RowSums0H)); + + int32x4_t RowSums1L_pada = vmovq_n_s32(0); + RowSums1L_pada = vpadalq_s16(RowSums1L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z4))); + RowSums1L_pada = vpadalq_s16(RowSums1L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z5))); + + int32x4_t RowSums1L_ext = vextq_s32(RowSums1L_pada, RowSums1L_pada, 1); + int32x4_t RowSums1L_add = vaddq_s32(RowSums1L_pada, RowSums1L_ext); + int32x2_t RowSums1L = {vdups_laneq_s32(RowSums1L_add, 0), + vdups_laneq_s32(RowSums1L_add, 2)}; + + int32x4_t RowSums1H_pada = vmovq_n_s32(0); + RowSums1H_pada = vpadalq_s16(RowSums1H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z6))); + RowSums1H_pada = vpadalq_s16(RowSums1H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z7))); + + int32x4_t RowSums1H_ext = vextq_s32(RowSums1H_pada, RowSums1H_pada, 1); + int32x4_t RowSums1H_add = vaddq_s32(RowSums1H_pada, RowSums1H_ext); + int32x2_t RowSums1H = {vdups_laneq_s32(RowSums1H_add, 0), + vdups_laneq_s32(RowSums1H_add, 2)}; + + RowSums1 = 
vaddq_s32(RowSums1, vcombine_s32(RowSums1L, RowSums1H)); + + D += 128; + k -= 16; + } + + while (k >= 8) { + int64x1_t v0 = *reinterpret_cast(a0); + a0 += 8; + int64x1_t v1 = *reinterpret_cast(a1); + a1 += 8; + int64x1_t v2 = *reinterpret_cast(a2); + a2 += 8; + int64x1_t v3 = *reinterpret_cast(a3); + a3 += 8; + int64x1_t v4 = *reinterpret_cast(a4); + a4 += 8; + int64x1_t v5 = *reinterpret_cast(a5); + a5 += 8; + int64x1_t v6 = *reinterpret_cast(a6); + a6 += 8; + int64x1_t v7 = *reinterpret_cast(a7); + a7 += 8; + + *reinterpret_cast(&D[0]) = v0; + *reinterpret_cast(&D[8]) = v1; + *reinterpret_cast(&D[16]) = v2; + *reinterpret_cast(&D[24]) = v3; + *reinterpret_cast(&D[32]) = v4; + *reinterpret_cast(&D[40]) = v5; + *reinterpret_cast(&D[48]) = v6; + *reinterpret_cast(&D[56]) = v7; + + int64x2_t z01 = vcombine_s64(v0, v1); + int64x2_t z23 = vcombine_s64(v2, v3); + int64x2_t z45 = vcombine_s64(v4, v5); + int64x2_t z67 = vcombine_s64(v6, v7); + + int32x4_t RowSums0L_pada = vmovq_n_s32(0); + RowSums0L_pada = vpadalq_s16(RowSums0L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z01))); + + int32x4_t RowSums0L_ext = vextq_s32(RowSums0L_pada, RowSums0L_pada, 1); + int32x4_t RowSums0L_add = vaddq_s32(RowSums0L_pada, RowSums0L_ext); + int32x2_t RowSums0L = {vdups_laneq_s32(RowSums0L_add, 0), + vdups_laneq_s32(RowSums0L_add, 2)}; + + int32x4_t RowSums0H_pada = vmovq_n_s32(0); + RowSums0H_pada = vpadalq_s16(RowSums0H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z23))); + + int32x4_t RowSums0H_ext = vextq_s32(RowSums0H_pada, RowSums0H_pada, 1); + int32x4_t RowSums0H_add = vaddq_s32(RowSums0H_pada, RowSums0H_ext); + int32x2_t RowSums0H = {vdups_laneq_s32(RowSums0H_add, 0), + vdups_laneq_s32(RowSums0H_add, 2)}; + + RowSums0 = vaddq_s32(RowSums0, vcombine_s32(RowSums0L, RowSums0H)); + + int32x4_t RowSums1L_pada = vmovq_n_s32(0); + RowSums1L_pada = vpadalq_s16(RowSums1L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z45))); + + int32x4_t RowSums1L_ext = vextq_s32(RowSums1L_pada, RowSums1L_pada, 1); + int32x4_t RowSums1L_add = vaddq_s32(RowSums1L_pada, RowSums1L_ext); + int32x2_t RowSums1L = {vdups_laneq_s32(RowSums1L_add, 0), + vdups_laneq_s32(RowSums1L_add, 2)}; + + int32x4_t RowSums1H_pada = vmovq_n_s32(0); + RowSums1H_pada = vpadalq_s16(RowSums1H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z67))); + + int32x4_t RowSums1H_ext = vextq_s32(RowSums1H_pada, RowSums1H_pada, 1); + int32x4_t RowSums1H_add = vaddq_s32(RowSums1H_pada, RowSums1H_ext); + int32x2_t RowSums1H = {vdups_laneq_s32(RowSums1H_add, 0), + vdups_laneq_s32(RowSums1H_add, 2)}; + + RowSums1 = vaddq_s32(RowSums1, vcombine_s32(RowSums1L, RowSums1H)); + + D += 64; + k -= 8; + } + + if (k > 0) { + // + // zero pad the remaining columns to 8 + // + int8_t* d = D; + + vst1q_s8(d, vmovq_n_s8(0)); + vst1q_s8(&d[16], vmovq_n_s8(0)); + vst1q_s8(&d[32], vmovq_n_s8(0)); + vst1q_s8(&d[48], vmovq_n_s8(0)); + + while (k > 0) { + d[0] = *a0++; + d[8] = *a1++; + d[16] = *a2++; + d[24] = *a3++; + d[32] = *a4++; + d[40] = *a5++; + d[48] = *a6++; + d[56] = *a7++; + d += 1; + k -= 1; + } + d = D; + int64x1_t v0 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v1 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v2 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v3 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v4 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v5 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v6 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v7 = *reinterpret_cast(d); + d = d + 8; + + int64x2_t z01 = vcombine_s64(v0, v1); + int64x2_t z23 = vcombine_s64(v2, v3); + int64x2_t z45 
= vcombine_s64(v4, v5); + int64x2_t z67 = vcombine_s64(v6, v7); + + int32x4_t RowSums0L_pada = vmovq_n_s32(0); + RowSums0L_pada = vpadalq_s16(RowSums0L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z01))); + + int32x4_t RowSums0L_ext = vextq_s32(RowSums0L_pada, RowSums0L_pada, 1); + int32x4_t RowSums0L_add = vaddq_s32(RowSums0L_pada, RowSums0L_ext); + int32x2_t RowSums0L = {vdups_laneq_s32(RowSums0L_add, 0), + vdups_laneq_s32(RowSums0L_add, 2)}; + + int32x4_t RowSums0H_pada = vmovq_n_s32(0); + RowSums0H_pada = vpadalq_s16(RowSums0H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z23))); + + int32x4_t RowSums0H_ext = vextq_s32(RowSums0H_pada, RowSums0H_pada, 1); + int32x4_t RowSums0H_add = vaddq_s32(RowSums0H_pada, RowSums0H_ext); + int32x2_t RowSums0H = {vdups_laneq_s32(RowSums0H_add, 0), + vdups_laneq_s32(RowSums0H_add, 2)}; + + RowSums0 = vaddq_s32(RowSums0, vcombine_s32(RowSums0L, RowSums0H)); + + int32x4_t RowSums1L_pada = vmovq_n_s32(0); + RowSums1L_pada = vpadalq_s16(RowSums1L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z45))); + + int32x4_t RowSums1L_ext = vextq_s32(RowSums1L_pada, RowSums1L_pada, 1); + int32x4_t RowSums1L_add = vaddq_s32(RowSums1L_pada, RowSums1L_ext); + int32x2_t RowSums1L = {vdups_laneq_s32(RowSums1L_add, 0), + vdups_laneq_s32(RowSums1L_add, 2)}; + + int32x4_t RowSums1H_pada = vmovq_n_s32(0); + RowSums1H_pada = vpadalq_s16(RowSums1H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z67))); + + int32x4_t RowSums1H_ext = vextq_s32(RowSums1H_pada, RowSums1H_pada, 1); + int32x4_t RowSums1H_add = vaddq_s32(RowSums1H_pada, RowSums1H_ext); + int32x2_t RowSums1H = {vdups_laneq_s32(RowSums1H_add, 0), + vdups_laneq_s32(RowSums1H_add, 2)}; + + RowSums1 = vaddq_s32(RowSums1, vcombine_s32(RowSums1L, RowSums1H)); + + D += 64; + } + + vst1q_s32(RowSumBuffer, RowSums0); + vst1q_s32(&RowSumBuffer[4], RowSums1); + + RowSumBuffer += 8; + + A = A + lda * 8; + CountM -= 8; + } + + // + // Process four rows of matrix A. + // + // The buffer is packed as a series of 32 byte vectors where four rows are + // interleaved with the following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // [ B0 B1 B2 B3 B4 B5 B6 B7 ] + // [ C0 C1 C2 C3 C4 C5 C6 C7 ] + // [ D0 D1 D2 D3 D4 D5 D6 D7 ] + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of eight, then the vector is padded + // with zeroes. 
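The vpaddlq_s8 / vpadalq_s16 / vextq_s32 sequence that recurs throughout these packing loops is just a per-row byte sum computed over the interleaved pair layout: every packed 16-byte vector carries 8 bytes of row r followed by 8 bytes of row r+1, and the widen-and-rotate dance collapses them into one int32 per row without leaving the vector registers. A hedged scalar equivalent (hypothetical helper name, accumulation semantics only) is:

    #include <cstdint>

    // Accumulates the byte sums of the two rows stored in one packed vector.
    // RowSumBuffer[m] ends up holding the sum of row m of A over all of K;
    // the zero padding added for the K tail contributes nothing.
    void accumulate_row_pair_sums(const int8_t packed[16],
                                  int32_t& sum_row_r, int32_t& sum_row_r1)
    {
        for (int i = 0; i < 8; ++i) {
            sum_row_r  += packed[i];       // bytes 0..7  belong to row r
            sum_row_r1 += packed[8 + i];   // bytes 8..15 belong to row r + 1
        }
    }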
+ // + + if (CountM >= 4) { + const int8_t* a0 = reinterpret_cast(A); + const int8_t* a1 = a0 + lda; + const int8_t* a2 = a1 + lda; + const int8_t* a3 = a2 + lda; + + size_t k = CountK; + int32x4_t RowSums = vmovq_n_s32(0); + + while (k >= 16) { + int64x2_t v0 = vld1q_s64(reinterpret_cast(a0)); + a0 += 16; + int64x2_t v1 = vld1q_s64(reinterpret_cast(a1)); + a1 += 16; + int64x2_t v2 = vld1q_s64(reinterpret_cast(a2)); + a2 += 16; + int64x2_t v3 = vld1q_s64(reinterpret_cast(a3)); + a3 += 16; + + int64x2_t z0 = vzip1q_s64(v0, v1); + int64x2_t z1 = vzip2q_s64(v0, v1); + int64x2_t z2 = vzip1q_s64(v2, v3); + int64x2_t z3 = vzip2q_s64(v2, v3); + + vst1q_s8(&D[0], vreinterpretq_s8_s64(z0)); + vst1q_s8(&D[16], vreinterpretq_s8_s64(z2)); + vst1q_s8(&D[32], vreinterpretq_s8_s64(z1)); + vst1q_s8(&D[48], vreinterpretq_s8_s64(z3)); + + int32x4_t RowSumsL_pada = vmovq_n_s32(0); + RowSumsL_pada = vpadalq_s16(RowSumsL_pada, vpaddlq_s8(vreinterpretq_s8_s64(z0))); + RowSumsL_pada = vpadalq_s16(RowSumsL_pada, vpaddlq_s8(vreinterpretq_s8_s64(z1))); + + int32x4_t RowSumsL_ext = vextq_s32(RowSumsL_pada, RowSumsL_pada, 1); + int32x4_t RowSumsL_add = vaddq_s32(RowSumsL_pada, RowSumsL_ext); + int32x2_t RowSumsL = {vdups_laneq_s32(RowSumsL_add, 0), + vdups_laneq_s32(RowSumsL_add, 2)}; + + int32x4_t RowSumsH_pada = vmovq_n_s32(0); + RowSumsH_pada = vpadalq_s16(RowSumsH_pada, vpaddlq_s8(vreinterpretq_s8_s64(z2))); + RowSumsH_pada = vpadalq_s16(RowSumsH_pada, vpaddlq_s8(vreinterpretq_s8_s64(z3))); + + int32x4_t RowSumsH_ext = vextq_s32(RowSumsH_pada, RowSumsH_pada, 1); + int32x4_t RowSumsH_add = vaddq_s32(RowSumsH_pada, RowSumsH_ext); + int32x2_t RowSumsH = {vdups_laneq_s32(RowSumsH_add, 0), + vdups_laneq_s32(RowSumsH_add, 2)}; + + RowSums = vaddq_s32(RowSums, vcombine_s32(RowSumsL, RowSumsH)); + + D += 64; + k -= 16; + } + + while (k >= 8) { + int64x1_t v0 = *reinterpret_cast(a0); + a0 += 8; + int64x1_t v1 = *reinterpret_cast(a1); + a1 += 8; + int64x1_t v2 = *reinterpret_cast(a2); + a2 += 8; + int64x1_t v3 = *reinterpret_cast(a3); + a3 += 8; + + *reinterpret_cast(&D[0]) = v0; + *reinterpret_cast(&D[8]) = v1; + *reinterpret_cast(&D[16]) = v2; + *reinterpret_cast(&D[24]) = v3; + + int64x2_t z01 = vcombine_s64(v0, v1); + int64x2_t z23 = vcombine_s64(v2, v3); + + int32x4_t RowSumsL_pada = vmovq_n_s32(0); + RowSumsL_pada = vpadalq_s16(RowSumsL_pada, vpaddlq_s8(vreinterpretq_s8_s64(z01))); + + int32x4_t RowSumsL_ext = vextq_s32(RowSumsL_pada, RowSumsL_pada, 1); + int32x4_t RowSumsL_add = vaddq_s32(RowSumsL_pada, RowSumsL_ext); + int32x2_t RowSumsL = {vdups_laneq_s32(RowSumsL_add, 0), + vdups_laneq_s32(RowSumsL_add, 2)}; + + int32x4_t RowSumsH_pada = vmovq_n_s32(0); + RowSumsH_pada = vpadalq_s16(RowSumsH_pada, vpaddlq_s8(vreinterpretq_s8_s64(z23))); + + int32x4_t RowSumsH_ext = vextq_s32(RowSumsH_pada, RowSumsH_pada, 1); + int32x4_t RowSumsH_add = vaddq_s32(RowSumsH_pada, RowSumsH_ext); + int32x2_t RowSumsH = {vdups_laneq_s32(RowSumsH_add, 0), + vdups_laneq_s32(RowSumsH_add, 2)}; + + RowSums = vaddq_s32(RowSums, vcombine_s32(RowSumsL, RowSumsH)); + + D += 32; + k -= 8; + } + + if (k > 0) { + // + // Copy the remaining bytes with zero padding. 
+ // + int8_t* d = D; + + vst1q_s8(d, vmovq_n_s8(0)); + vst1q_s8(&d[16], vmovq_n_s8(0)); + + while (k > 0) { + d[0] = *a0++; + d[8] = *a1++; + d[16] = *a2++; + d[24] = *a3++; + d += 1; + k -= 1; + } + + d = D; + int64x1_t v0 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v1 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v2 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v3 = *reinterpret_cast(d); + d = d + 8; + + int64x2_t z01 = vcombine_s64(v0, v1); + int64x2_t z23 = vcombine_s64(v2, v3); + + int32x4_t RowSums0L_pada = vmovq_n_s32(0); + RowSums0L_pada = vpadalq_s16(RowSums0L_pada, vpaddlq_s8(vreinterpretq_s8_s64(z01))); + + int32x4_t RowSums0L_ext = vextq_s32(RowSums0L_pada, RowSums0L_pada, 1); + int32x4_t RowSums0L_add = vaddq_s32(RowSums0L_pada, RowSums0L_ext); + int32x2_t RowSums0L = {vdups_laneq_s32(RowSums0L_add, 0), + vdups_laneq_s32(RowSums0L_add, 2)}; + + int32x4_t RowSums0H_pada = vmovq_n_s32(0); + RowSums0H_pada = vpadalq_s16(RowSums0H_pada, vpaddlq_s8(vreinterpretq_s8_s64(z23))); + + int32x4_t RowSums0H_ext = vextq_s32(RowSums0H_pada, RowSums0H_pada, 1); + int32x4_t RowSums0H_add = vaddq_s32(RowSums0H_pada, RowSums0H_ext); + int32x2_t RowSums0H = {vdups_laneq_s32(RowSums0H_add, 0), + vdups_laneq_s32(RowSums0H_add, 2)}; + + RowSums = vaddq_s32(RowSums, vcombine_s32(RowSums0L, RowSums0H)); + + D += 32; + } + + vst1q_s32(RowSumBuffer, RowSums); + RowSumBuffer += 4; + + A = A + lda * 4; + CountM -= 4; + } + + // + // Process two rows of matrix A. + // + // The buffer is packed as a series of 16 byte vectors where two rows are + // interleaved with the following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // [ B0 B1 B2 B3 B4 B5 B6 B7 ] + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of eight, then the vector is padded + // with zeroes. 
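Stepping back from the packing loops: the row sums collected here exist so the kernel can apply the usual zero-point correction for quantized GEMM. The sketch below shows the underlying algebraic identity only; the exact sign and pre-scaling conventions MLAS uses when filling RowSumBuffer, ColumnSumBuffer, and ZeroPointB are the ones spelled out in the kernel's routine description, and the assembly's "ZeroPointB * RowSum + ColumnSum" accumulator initialization consumes those pre-combined terms.

    #include <cstdint>
    #include <cstdio>

    // For zero points za (matrix A) and zb[n] (per column of B):
    //   sum_k (A[m][k]-za)*(B[k][n]-zb[n])
    //     = sum_k A[m][k]*B[k][n] - zb[n]*rowsum(A,m) - za*colsum(B,n) + K*za*zb[n]
    // The kernel computes only the raw sum_k A*B; the other terms come from the
    // row/column sum buffers. The values below are arbitrary and for checking only.
    int main()
    {
        const int M = 2, N = 3, K = 5;
        const int32_t za = 3, zb[N] = {1, 2, 4};
        const uint8_t A[M][K] = {{1, 2, 3, 4, 5}, {9, 8, 7, 6, 5}};
        const uint8_t B[K][N] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {2, 4, 6}, {1, 3, 5}};

        for (int m = 0; m < M; ++m) {
            for (int n = 0; n < N; ++n) {
                int32_t direct = 0, raw = 0, rowsum = 0, colsum = 0;
                for (int k = 0; k < K; ++k) {
                    direct += (A[m][k] - za) * (B[k][n] - zb[n]);
                    raw += A[m][k] * B[k][n];
                    rowsum += A[m][k];
                    colsum += B[k][n];
                }
                int32_t corrected = raw - zb[n] * rowsum - za * colsum + K * za * zb[n];
                std::printf("C[%d][%d]: direct=%d corrected=%d\n", m, n, direct, corrected);
            }
        }
        return 0;
    }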
+ // + + if (CountM >= 2) { + const int8_t* a0 = reinterpret_cast(A); + const int8_t* a1 = a0 + lda; + + size_t k = CountK; + int32x2_t RowSums = vmov_n_s32(0); + + while (k >= 16) { + int64x2_t v0 = vld1q_s64(reinterpret_cast(a0)); + a0 += 16; + int64x2_t v1 = vld1q_s64(reinterpret_cast(a1)); + a1 += 16; + + int64x2_t z0 = vzip1q_s64(v0, v1); + int64x2_t z1 = vzip2q_s64(v0, v1); + + vst1q_s8(&D[0], vreinterpretq_s8_s64(z0)); + vst1q_s8(&D[16], vreinterpretq_s8_s64(z1)); + + int32x4_t RowSumsL_pada = vmovq_n_s32(0); + RowSumsL_pada = vpadalq_s16(RowSumsL_pada, vpaddlq_s8(vreinterpretq_s8_s64(z0))); + RowSumsL_pada = vpadalq_s16(RowSumsL_pada, vpaddlq_s8(vreinterpretq_s8_s64(z1))); + + int32x4_t RowSumsL_ext = vextq_s32(RowSumsL_pada, RowSumsL_pada, 1); + int32x4_t RowSumsL_add = vaddq_s32(RowSumsL_pada, RowSumsL_ext); + int32x2_t RowSumsL = {vdups_laneq_s32(RowSumsL_add, 0), + vdups_laneq_s32(RowSumsL_add, 2)}; + + RowSums = vadd_s32(RowSums, RowSumsL); + + D += 32; + k -= 16; + } + + while (k >= 8) { + int64x1_t v0 = *reinterpret_cast(a0); + a0 += 8; + int64x1_t v1 = *reinterpret_cast(a1); + a1 += 8; + + *reinterpret_cast(&D[0]) = v0; + *reinterpret_cast(&D[8]) = v1; + + int64x2_t z01 = vcombine_s64(v0, v1); + int32x4_t RowSumsL_pada = vmovq_n_s32(0); + RowSumsL_pada = vpadalq_s16(RowSumsL_pada, vpaddlq_s8(vreinterpretq_s8_s64(z01))); + + int32x4_t RowSumsL_ext = vextq_s32(RowSumsL_pada, RowSumsL_pada, 1); + int32x4_t RowSumsL_add = vaddq_s32(RowSumsL_pada, RowSumsL_ext); + int32x2_t RowSumsL = {vdups_laneq_s32(RowSumsL_add, 0), + vdups_laneq_s32(RowSumsL_add, 2)}; + + RowSums = vadd_s32(RowSums, RowSumsL); + + D += 16; + k -= 8; + } + + if (k > 0) { + // + // Zero pad the remaining elements to make 8 columns. + // + + int8_t* d = PaddedMatrixAData; + vst1q_s8(PaddedMatrixAData, vmovq_n_s8(0)); + + while (k > 0) { + d[0] = *a0++; + d[8] = *a1++; + + d += 1; + k -= 1; + } + + d = PaddedMatrixAData; + int64x1_t v0 = *reinterpret_cast(d); + d = d + 8; + int64x1_t v1 = *reinterpret_cast(d); + d = d + 8; + + int64x2_t z01 = vcombine_s64(v0, v1); + int32x4_t RowSumsL_pada = vmovq_n_s32(0); + RowSumsL_pada = vpadalq_s16(RowSumsL_pada, vpaddlq_s8(vreinterpretq_s8_s64(z01))); + + int32x4_t RowSumsL_ext = vextq_s32(RowSumsL_pada, RowSumsL_pada, 1); + int32x4_t RowSumsL_add = vaddq_s32(RowSumsL_pada, RowSumsL_ext); + int32x2_t RowSumsL = {vdups_laneq_s32(RowSumsL_add, 0), + vdups_laneq_s32(RowSumsL_add, 2)}; + + RowSums = vadd_s32(RowSums, RowSumsL); + + int8x16_t PackedVector = vld1q_s8(PaddedMatrixAData); + vst1q_s8(D, PackedVector); + + D += 16; + } + + vst1_s32(RowSumBuffer, RowSums); + RowSumBuffer += 2; + + A = A + lda * 2; + CountM -= 2; + } + + // + // Process one row of matrix A. + // + // The buffer is packed as a series of 8 byte with the following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of 8, then the vector is padded + // with zeroes. + // + + if (CountM > 0) { + // No need to pad the rows to 2, the .S takes care of zero pdding + const int8_t* a = reinterpret_cast(A); + size_t k = CountK; + int32x4_t RowSums = vmovq_n_s32(0); + + while (k >= 16) { + int8x16_t v = vld1q_s8(a); + a += 16; + + vst1q_s8(D, v); + + RowSums = vpadalq_s16(RowSums, vpaddlq_s8(v)); + + D += 16; + k -= 16; + } + + if (k > 0) { + // + // Copy the remaining bytes to the zero padded stack buffer. 
+ // + + vst1q_s8(PaddedMatrixAData, vmovq_n_s8(0)); + + for (size_t kk = 0; kk < k; kk++) { + PaddedMatrixAData[kk] = a[kk]; + } + + int8x16_t v = vld1q_s8(PaddedMatrixAData); + vst1q_s8(D, v); + + RowSums = vpadalq_s16(RowSums, vpaddlq_s8(v)); + } + + *RowSumBuffer = int32_t(vaddvq_s32(RowSums)); + } +} + +MLAS_FORCEINLINE +void +MlasGemmS8S8CopyPackBProcessSmmla(int8_t* D, int8x8_t BytesRow[8], int32x4_t ColumnSums[2]) +{ + int8x16_t v02 = vcombine_s8(BytesRow[0], BytesRow[2]); + int8x16_t v13 = vcombine_s8(BytesRow[1], BytesRow[3]); + + int8x16_t v46 = vcombine_s8(BytesRow[4], BytesRow[6]); + int8x16_t v57 = vcombine_s8(BytesRow[5], BytesRow[7]); + + int8x16x2_t zw1 = vzipq_s8(v02, v13); + int16x8x2_t zd1 = vzipq_s16(vreinterpretq_s16_s8(zw1.val[0]), vreinterpretq_s16_s8(zw1.val[1])); + + int8x16x2_t zw2 = vzipq_s8(v46, v57); + int16x8x2_t zd2 = vzipq_s16(vreinterpretq_s16_s8(zw2.val[0]), vreinterpretq_s16_s8(zw2.val[1])); + + int32x4x2_t zd3 = + vzipq_s32(vreinterpretq_s32_s16(zd1.val[0]), vreinterpretq_s32_s16(zd2.val[0])); + int32x4x2_t zd4 = + vzipq_s32(vreinterpretq_s32_s16(zd1.val[1]), vreinterpretq_s32_s16(zd2.val[1])); + + vst1q_s8(&D[0], vreinterpretq_s8_s32(zd3.val[0])); + vst1q_s8(&D[16], vreinterpretq_s8_s32(zd3.val[1])); + vst1q_s8(&D[32], vreinterpretq_s8_s32(zd4.val[0])); + vst1q_s8(&D[48], vreinterpretq_s8_s32(zd4.val[1])); + + int32x4_t ColSums0L_pada = vmovq_n_s32(0); + ColSums0L_pada = vpadalq_s16(ColSums0L_pada, vpaddlq_s8(vreinterpretq_s8_s32(zd3.val[0]))); + int32x4_t ColSums0L_ext = vextq_s32(ColSums0L_pada, ColSums0L_pada, 1); + int32x4_t ColSums0L_add = vaddq_s32(ColSums0L_pada, ColSums0L_ext); + int32x2_t ColSums0L = {vdups_laneq_s32(ColSums0L_add, 0), vdups_laneq_s32(ColSums0L_add, 2)}; + + int32x4_t ColSums0H_pada = vmovq_n_s32(0); + ColSums0H_pada = vpadalq_s16(ColSums0H_pada, vpaddlq_s8(vreinterpretq_s8_s32(zd3.val[1]))); + int32x4_t ColSums0H_ext = vextq_s32(ColSums0H_pada, ColSums0H_pada, 1); + int32x4_t ColSums0H_add = vaddq_s32(ColSums0H_pada, ColSums0H_ext); + int32x2_t ColSums0H = {vdups_laneq_s32(ColSums0H_add, 0), vdups_laneq_s32(ColSums0H_add, 2)}; + + ColumnSums[0] = vaddq_s32(ColumnSums[0], vcombine_s32(ColSums0L, ColSums0H)); + + int32x4_t ColSums1L_pada = vmovq_n_s32(0); + ColSums1L_pada = vpadalq_s16(ColSums1L_pada, vpaddlq_s8(vreinterpretq_s8_s32(zd4.val[0]))); + int32x4_t ColSums1L_ext = vextq_s32(ColSums1L_pada, ColSums1L_pada, 1); + int32x4_t ColSums1L_add = vaddq_s32(ColSums1L_pada, ColSums1L_ext); + int32x2_t ColSums1L = {vdups_laneq_s32(ColSums1L_add, 0), vdups_laneq_s32(ColSums1L_add, 2)}; + + int32x4_t ColSums1H_pada = vmovq_n_s32(0); + ColSums1H_pada = vpadalq_s16(ColSums1H_pada, vpaddlq_s8(vreinterpretq_s8_s32(zd4.val[1]))); + int32x4_t ColSums1H_ext = vextq_s32(ColSums1H_pada, ColSums1H_pada, 1); + int32x4_t ColSums1H_add = vaddq_s32(ColSums1H_pada, ColSums1H_ext); + int32x2_t ColSums1H = {vdups_laneq_s32(ColSums1H_add, 0), vdups_laneq_s32(ColSums1H_add, 2)}; + + ColumnSums[1] = vaddq_s32(ColumnSums[1], vcombine_s32(ColSums1L, ColSums1H)); +} + +template <> +void +MlasGemmQuantCopyPackB(MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedBType* Dst, + const uint8_t* B, + size_t ldb, + size_t CountN, + size_t CountK, + int32_t* ColumnSumBuffer, + bool BIsSigned) +{ + MLAS_UNREFERENCED_PARAMETER(BIsSigned); + int8_t* D = reinterpret_cast(Dst); + const int8x16_t ZeroVector = vmovq_n_s8(0); + int8x8_t BytesRow[8]; + + // + // Copy data from matrix B into the destination buffer 8x2 blocks at a + // time. 
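The zip cascade in MlasGemmS8S8CopyPackBProcessSmmla above transposes each 8x8 block so that every column of B contributes 8 contiguous K bytes; a 16-byte packed vector therefore holds two columns, which is exactly the 2x8 right-hand tile an MMLA instruction consumes against the 2x8 A tile. A hedged scalar reference for the element order of one block (hypothetical helper name; the CountK/CountN guards stand in for the two zero-padding paths in the loops below) is:

    #include <cstddef>
    #include <cstdint>

    // Packs one 8-deep K slice of an 8-column panel of B, column by column.
    // ColumnSumBuffer[n] separately accumulates the plain byte sum of column n
    // over all of K.
    void pack_b_8x8(int8_t* D, const int8_t* B, size_t ldb,
                    size_t k0, size_t CountK, size_t n0, size_t CountN)
    {
        for (size_t c = 0; c < 8; ++c) {
            for (size_t kk = 0; kk < 8; ++kk) {
                size_t k = k0 + kk;
                size_t n = n0 + c;
                *D++ = (k < CountK && n < CountN) ? B[k * ldb + n] : 0;  // zero pad tails
            }
        }
    }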
+ // + // + while (CountN >= 8) { + const int8_t* b = reinterpret_cast(B); + size_t k = CountK; + int32x4_t ColumnSums[2]; + + ColumnSums[0] = vmovq_n_s32(0); + ColumnSums[1] = vmovq_n_s32(0); + + while (k >= 8) { + BytesRow[0] = vld1_s8(&b[ldb * 0]); + BytesRow[1] = vld1_s8(&b[ldb * 1]); + BytesRow[2] = vld1_s8(&b[ldb * 2]); + BytesRow[3] = vld1_s8(&b[ldb * 3]); + BytesRow[4] = vld1_s8(&b[ldb * 4]); + BytesRow[5] = vld1_s8(&b[ldb * 5]); + BytesRow[6] = vld1_s8(&b[ldb * 6]); + BytesRow[7] = vld1_s8(&b[ldb * 7]); + + MlasGemmS8S8CopyPackBProcessSmmla(D, BytesRow, ColumnSums); + + D += 64; + b += ldb * 8; + k -= 8; + } + + if (k > 0) { + // Pad k to 8 + + BytesRow[0] = vld1_s8(&b[ldb * 0]); + BytesRow[1] = (k >= 2) ? vld1_s8(&b[ldb * 1]) : vget_low_s8(ZeroVector); + BytesRow[2] = (k >= 3) ? vld1_s8(&b[ldb * 2]) : vget_low_s8(ZeroVector); + BytesRow[3] = (k >= 4) ? vld1_s8(&b[ldb * 3]) : vget_low_s8(ZeroVector); + BytesRow[4] = (k >= 5) ? vld1_s8(&b[ldb * 4]) : vget_low_s8(ZeroVector); + BytesRow[5] = (k >= 6) ? vld1_s8(&b[ldb * 5]) : vget_low_s8(ZeroVector); + BytesRow[6] = (k >= 7) ? vld1_s8(&b[ldb * 6]) : vget_low_s8(ZeroVector); + BytesRow[7] = vget_low_s8(ZeroVector); + + MlasGemmS8S8CopyPackBProcessSmmla(D, BytesRow, ColumnSums); + + D += 64; + } + + // Zero pad the output buffer to a multiple of PackedK if the above + // processed an odd number of four row bundles. + // + vst1q_s32(&ColumnSumBuffer[0], ColumnSums[0]); + vst1q_s32(&ColumnSumBuffer[4], ColumnSums[1]); + + ColumnSumBuffer += 8; + + B += 8; + CountN -= 8; + } + + // + // Process the remaining columns of matrix B. + // + + if (CountN > 0) { + const int8_t* b = reinterpret_cast(B); + size_t k = CountK; + int8_t PaddedMatrixBData[64]; + int32x4_t ColumnSums[2]; + + vst1q_s8(&PaddedMatrixBData[0], ZeroVector); + vst1q_s8(&PaddedMatrixBData[16], ZeroVector); + vst1q_s8(&PaddedMatrixBData[32], ZeroVector); + vst1q_s8(&PaddedMatrixBData[48], ZeroVector); + + ColumnSums[0] = vmovq_n_s32(0); + ColumnSums[1] = vmovq_n_s32(0); + + // + // Interleave rows of matrix B using an intermediate zero padded stack + // buffer and write to the packed buffer. + // + + while (k > 0) { + const int8_t* bcopy0 = &b[ldb * 0]; + const int8_t* bcopy1 = &b[ldb * 1]; + const int8_t* bcopy2 = &b[ldb * 2]; + const int8_t* bcopy3 = &b[ldb * 3]; + const int8_t* bcopy4 = &b[ldb * 4]; + const int8_t* bcopy5 = &b[ldb * 5]; + const int8_t* bcopy6 = &b[ldb * 6]; + const int8_t* bcopy7 = &b[ldb * 7]; + + if (k >= 8) { + b += ldb * 8; + k -= 8; + + } else { + vst1q_s8(&PaddedMatrixBData[0], ZeroVector); + vst1q_s8(&PaddedMatrixBData[16], ZeroVector); + vst1q_s8(&PaddedMatrixBData[32], ZeroVector); + vst1q_s8(&PaddedMatrixBData[48], ZeroVector); + + bcopy1 = (k >= 2) ? bcopy1 : &PaddedMatrixBData[56]; + bcopy2 = (k >= 3) ? bcopy2 : &PaddedMatrixBData[56]; + bcopy3 = (k >= 4) ? bcopy3 : &PaddedMatrixBData[56]; + bcopy4 = (k >= 5) ? bcopy4 : &PaddedMatrixBData[56]; + bcopy5 = (k >= 6) ? bcopy5 : &PaddedMatrixBData[56]; + bcopy6 = (k >= 7) ? 
bcopy6 : &PaddedMatrixBData[56]; + bcopy7 = &PaddedMatrixBData[56]; + + k = 0; + } + + int8_t* padded = PaddedMatrixBData; + int8_t* padded_end = padded + CountN; + do { + padded[0] = *bcopy0++; + padded[8] = *bcopy1++; + padded[16] = *bcopy2++; + padded[24] = *bcopy3++; + padded[32] = *bcopy4++; + padded[40] = *bcopy5++; + padded[48] = *bcopy6++; + padded[56] = *bcopy7++; + + } while (++padded < padded_end); + + BytesRow[0] = vld1_s8(&PaddedMatrixBData[0]); + BytesRow[1] = vld1_s8(&PaddedMatrixBData[8]); + BytesRow[2] = vld1_s8(&PaddedMatrixBData[16]); + BytesRow[3] = vld1_s8(&PaddedMatrixBData[24]); + BytesRow[4] = vld1_s8(&PaddedMatrixBData[32]); + BytesRow[5] = vld1_s8(&PaddedMatrixBData[40]); + BytesRow[6] = vld1_s8(&PaddedMatrixBData[48]); + BytesRow[7] = vld1_s8(&PaddedMatrixBData[56]); + + MlasGemmS8S8CopyPackBProcessSmmla(D, BytesRow, ColumnSums); + + D += 64; + } + + vst1q_s32(&ColumnSumBuffer[0], ColumnSums[0]); + vst1q_s32(&ColumnSumBuffer[4], ColumnSums[1]); + } +} + +template <> +MLAS_FORCEINLINE size_t +MlasGemmQuantKernel(const MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedAType* A, + const MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedBType* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumBuffer, + const int32_t* ColumnSumBuffer, + const int32_t* ZeroPointB, + bool ZeroMode) +{ + size_t RowsHandled; + + if (ZeroMode) { + RowsHandled = MlasGemmS8S8KernelSmmlaZero(A, B, C, PackedCountK, CountM, CountN, ldc, + RowSumBuffer, ColumnSumBuffer, ZeroPointB); + } else { + RowsHandled = MlasGemmS8S8KernelSmmlaAdd(A, B, C, PackedCountK, CountM, CountN, ldc, + RowSumBuffer, ColumnSumBuffer, ZeroPointB); + } + + return RowsHandled; +} + +const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSmmla = { + MlasGemmQuantOperation, + MlasGemmQuantPackedOperation, + MlasGemmQuantCopyPackB, + MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedK, + MLAS_GEMM_S8S8_KERNEL_SMMLA::PackedStrides.K, + 8}; diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_ummla.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_ummla.cpp new file mode 100644 index 0000000000000..3936154432ac7 --- /dev/null +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_ummla.cpp @@ -0,0 +1,967 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. +Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +Licensed under the MIT License. + +Module Name: + + qgemm_kernel_ummla.cpp + +Abstract: + + This module implements ummla QGEMM kernel. + +--*/ + +#include "mlasi.h" +#include "qgemm.h" + +// +// Define the prototypes of the NEON UMMLA routines written in assembly. 
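As with the SMMLA file above, these UMMLA entry points are only reachable when platform.cpp (earlier in this diff) installs the MMLA dispatch tables after detecting I8MM support at startup. The following is a minimal, Linux-only standalone sketch of that detection, mirroring the getauxval check and the HWCAP2_I8MM fallback definition used in platform.cpp; it is illustrative, not part of the library.

    #include <sys/auxv.h>
    #include <cstdio>

    #ifndef HWCAP2_I8MM
    #define HWCAP2_I8MM (1 << 13)   // same fallback value as in platform.cpp
    #endif

    int main()
    {
        // On AArch64 Linux, HWCAP2 bits advertise the I8MM extension.
        bool has_i8mm = (getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0;
        std::printf("i8mm: %s\n",
                    has_i8mm ? "yes (UMMLA/SMMLA kernels eligible)" : "no");
        return 0;
    }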
+// + +extern "C" { + +size_t MLASCALL +MlasGemmU8X8KernelUmmlaZero(const uint8_t* A, + const uint8_t* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumVector, + const int32_t* ColumnSumVector, + const int32_t* ZeroPointB); + +size_t MLASCALL +MlasGemmU8X8KernelUmmlaAdd(const uint8_t* A, + const uint8_t* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumVector, + const int32_t* ColumnSumVector, + const int32_t* ZeroPointB); +} + +struct MLAS_GEMM_U8X8_KERNEL_UMMLA { + typedef uint8_t PackedAType; + typedef uint8_t PackedBType; + typedef uint8_t OffsetAType; + typedef uint8_t OffsetBType; + + static constexpr size_t PackedK = 8; + static constexpr MLAS_GEMM_QUANT_STRIDES Strides{24, 128, 256}; + static constexpr MLAS_GEMM_QUANT_STRIDES PackedStrides{24, 128, 384}; +}; + +constexpr size_t MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedK; +constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_U8X8_KERNEL_UMMLA::Strides; +constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedStrides; + +template <> +MLAS_FORCEINLINE int32_t +MlasGemmQuantFixupZeroPointB(int32_t ZeroPointB, bool BIsSigned) +{ + if (BIsSigned) { + ZeroPointB = MLAS_GEMM_U8X8_KERNEL_UMMLA::OffsetBType(ZeroPointB ^ 0x80); + } + + return ZeroPointB; +} + +template <> +void +MlasGemmQuantCopyPackA(MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedAType* D, + const uint8_t* A, + size_t lda, + size_t CountM, + size_t CountK, + int32_t* RowSumBuffer, + bool AIsSigned) +{ + MLAS_UNREFERENCED_PARAMETER(AIsSigned); + uint8_t PaddedMatrixAData[64]; + + // + // Process 8 rows of matrix A. + // + // MMLA kernels load 8x8 block of A with four vector registers. So A is packed + // a series of 64 byte vectors where eight rows are interleaved with the + // following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // [ B0 B1 B2 B3 B4 B5 B6 B7 ] + // [ C0 C1 C2 C3 C4 C5 C6 C7 ] + // [ D0 D1 D2 D3 D4 D5 D6 D7 ] + // [ E0 E1 E2 E3 E4 E5 E6 E7 ] + // [ F0 F1 F2 F3 F4 F5 F6 F7 ] + // [ G0 G1 G2 G3 G4 G5 G6 G7 ] + // [ H0 H1 H2 H3 H4 H5 H6 H7 ] + // + // ... + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of eight, then the vector is padded + // with zeroes. 
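A note on the MlasGemmQuantFixupZeroPointB specialization a little above: when B is signed but this unsigned UMMLA path is used, XOR-ing the sign bit (value ^ 0x80) shifts every int8 value by +128 into uint8 range. Applying the same flip to the zero point keeps every (value - zero_point) difference unchanged, provided the B packing path flips the data the same way, which is the expectation here. A hedged, self-contained check:

    #include <cassert>
    #include <cstdint>

    // Flipping the sign bit of an int8 is the same as adding 128 modulo 256.
    static uint8_t flip_sign_bit(int8_t v)
    {
        return static_cast<uint8_t>(static_cast<uint8_t>(v) ^ 0x80);
    }

    int main()
    {
        int8_t b = -37, zero_point = -5;             // arbitrary example values
        int before = int(b) - int(zero_point);       // signed difference
        int after  = int(flip_sign_bit(b)) - int(flip_sign_bit(zero_point));
        assert(before == after);                     // both are -32
        return 0;
    }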
+ // + + while (CountM >= 8) { + const uint8_t* a0 = A; + const uint8_t* a1 = a0 + lda; + const uint8_t* a2 = a0 + lda * 2; + const uint8_t* a3 = a0 + lda * 3; + const uint8_t* a4 = a0 + lda * 4; + const uint8_t* a5 = a0 + lda * 5; + const uint8_t* a6 = a0 + lda * 6; + const uint8_t* a7 = a0 + lda * 7; + + size_t k = CountK; + uint32x4_t RowSums0 = vmovq_n_u32(0); + uint32x4_t RowSums1 = vmovq_n_u32(0); + + while (k >= 16) { + uint64x2_t v0 = vld1q_u64(reinterpret_cast(a0)); + a0 += 16; + uint64x2_t v1 = vld1q_u64(reinterpret_cast(a1)); + a1 += 16; + uint64x2_t v2 = vld1q_u64(reinterpret_cast(a2)); + a2 += 16; + uint64x2_t v3 = vld1q_u64(reinterpret_cast(a3)); + a3 += 16; + uint64x2_t v4 = vld1q_u64(reinterpret_cast(a4)); + a4 += 16; + uint64x2_t v5 = vld1q_u64(reinterpret_cast(a5)); + a5 += 16; + uint64x2_t v6 = vld1q_u64(reinterpret_cast(a6)); + a6 += 16; + uint64x2_t v7 = vld1q_u64(reinterpret_cast(a7)); + a7 += 16; + + uint64x2_t z0 = vzip1q_u64(v0, v1); + uint64x2_t z1 = vzip2q_u64(v0, v1); + uint64x2_t z2 = vzip1q_u64(v2, v3); + uint64x2_t z3 = vzip2q_u64(v2, v3); + + uint64x2_t z4 = vzip1q_u64(v4, v5); + uint64x2_t z5 = vzip2q_u64(v4, v5); + uint64x2_t z6 = vzip1q_u64(v6, v7); + uint64x2_t z7 = vzip2q_u64(v6, v7); + + vst1q_u8(&D[0], vreinterpretq_u8_u64(z0)); + vst1q_u8(&D[16], vreinterpretq_u8_u64(z2)); + vst1q_u8(&D[32], vreinterpretq_u8_u64(z4)); + vst1q_u8(&D[48], vreinterpretq_u8_u64(z6)); + vst1q_u8(&D[64], vreinterpretq_u8_u64(z1)); + vst1q_u8(&D[80], vreinterpretq_u8_u64(z3)); + vst1q_u8(&D[96], vreinterpretq_u8_u64(z5)); + vst1q_u8(&D[112], vreinterpretq_u8_u64(z7)); + + uint32x4_t RowSums0L_pada = vmovq_n_u32(0); + RowSums0L_pada = vpadalq_u16(RowSums0L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z0))); + RowSums0L_pada = vpadalq_u16(RowSums0L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z1))); + + uint32x4_t RowSums0L_ext = vextq_u32(RowSums0L_pada, RowSums0L_pada, 1); + uint32x4_t RowSums0L_add = vaddq_u32(RowSums0L_pada, RowSums0L_ext); + uint32x2_t RowSums0L = {vdups_laneq_u32(RowSums0L_add, 0), + vdups_laneq_u32(RowSums0L_add, 2)}; + + uint32x4_t RowSums0H_pada = vmovq_n_u32(0); + RowSums0H_pada = vpadalq_u16(RowSums0H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z2))); + RowSums0H_pada = vpadalq_u16(RowSums0H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z3))); + + uint32x4_t RowSums0H_ext = vextq_u32(RowSums0H_pada, RowSums0H_pada, 1); + uint32x4_t RowSums0H_add = vaddq_u32(RowSums0H_pada, RowSums0H_ext); + uint32x2_t RowSums0H = {vdups_laneq_u32(RowSums0H_add, 0), + vdups_laneq_u32(RowSums0H_add, 2)}; + + RowSums0 = vaddq_u32(RowSums0, vcombine_u32(RowSums0L, RowSums0H)); + + uint32x4_t RowSums1L_pada = vmovq_n_u32(0); + RowSums1L_pada = vpadalq_u16(RowSums1L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z4))); + RowSums1L_pada = vpadalq_u16(RowSums1L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z5))); + + uint32x4_t RowSums1L_ext = vextq_u32(RowSums1L_pada, RowSums1L_pada, 1); + uint32x4_t RowSums1L_add = vaddq_u32(RowSums1L_pada, RowSums1L_ext); + uint32x2_t RowSums1L = {vdups_laneq_u32(RowSums1L_add, 0), + vdups_laneq_u32(RowSums1L_add, 2)}; + + uint32x4_t RowSums1H_pada = vmovq_n_u32(0); + RowSums1H_pada = vpadalq_u16(RowSums1H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z6))); + RowSums1H_pada = vpadalq_u16(RowSums1H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z7))); + + uint32x4_t RowSums1H_ext = vextq_u32(RowSums1H_pada, RowSums1H_pada, 1); + uint32x4_t RowSums1H_add = vaddq_u32(RowSums1H_pada, RowSums1H_ext); + uint32x2_t RowSums1H = {vdups_laneq_u32(RowSums1H_add, 0), + vdups_laneq_u32(RowSums1H_add, 2)}; + + 
RowSums1 = vaddq_u32(RowSums1, vcombine_u32(RowSums1L, RowSums1H)); + + D += 128; + k -= 16; + } + + while (k >= 8) { + uint64x1_t v0 = *reinterpret_cast(a0); + a0 += 8; + uint64x1_t v1 = *reinterpret_cast(a1); + a1 += 8; + uint64x1_t v2 = *reinterpret_cast(a2); + a2 += 8; + uint64x1_t v3 = *reinterpret_cast(a3); + a3 += 8; + uint64x1_t v4 = *reinterpret_cast(a4); + a4 += 8; + uint64x1_t v5 = *reinterpret_cast(a5); + a5 += 8; + uint64x1_t v6 = *reinterpret_cast(a6); + a6 += 8; + uint64x1_t v7 = *reinterpret_cast(a7); + a7 += 8; + + *reinterpret_cast(&D[0]) = v0; + *reinterpret_cast(&D[8]) = v1; + *reinterpret_cast(&D[16]) = v2; + *reinterpret_cast(&D[24]) = v3; + *reinterpret_cast(&D[32]) = v4; + *reinterpret_cast(&D[40]) = v5; + *reinterpret_cast(&D[48]) = v6; + *reinterpret_cast(&D[56]) = v7; + + uint64x2_t z01 = vcombine_u64(v0, v1); + uint64x2_t z23 = vcombine_u64(v2, v3); + uint64x2_t z45 = vcombine_u64(v4, v5); + uint64x2_t z67 = vcombine_u64(v6, v7); + + uint32x4_t RowSums0L_pada = vmovq_n_u32(0); + RowSums0L_pada = vpadalq_u16(RowSums0L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z01))); + + uint32x4_t RowSums0L_ext = vextq_u32(RowSums0L_pada, RowSums0L_pada, 1); + uint32x4_t RowSums0L_add = vaddq_u32(RowSums0L_pada, RowSums0L_ext); + uint32x2_t RowSums0L = {vdups_laneq_u32(RowSums0L_add, 0), + vdups_laneq_u32(RowSums0L_add, 2)}; + + uint32x4_t RowSums0H_pada = vmovq_n_u32(0); + RowSums0H_pada = vpadalq_u16(RowSums0H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z23))); + + uint32x4_t RowSums0H_ext = vextq_u32(RowSums0H_pada, RowSums0H_pada, 1); + uint32x4_t RowSums0H_add = vaddq_u32(RowSums0H_pada, RowSums0H_ext); + uint32x2_t RowSums0H = {vdups_laneq_u32(RowSums0H_add, 0), + vdups_laneq_u32(RowSums0H_add, 2)}; + + RowSums0 = vaddq_u32(RowSums0, vcombine_u32(RowSums0L, RowSums0H)); + + uint32x4_t RowSums1L_pada = vmovq_n_u32(0); + RowSums1L_pada = vpadalq_u16(RowSums1L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z45))); + + uint32x4_t RowSums1L_ext = vextq_u32(RowSums1L_pada, RowSums1L_pada, 1); + uint32x4_t RowSums1L_add = vaddq_u32(RowSums1L_pada, RowSums1L_ext); + uint32x2_t RowSums1L = {vdups_laneq_u32(RowSums1L_add, 0), + vdups_laneq_u32(RowSums1L_add, 2)}; + + uint32x4_t RowSums1H_pada = vmovq_n_u32(0); + RowSums1H_pada = vpadalq_u16(RowSums1H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z67))); + + uint32x4_t RowSums1H_ext = vextq_u32(RowSums1H_pada, RowSums1H_pada, 1); + uint32x4_t RowSums1H_add = vaddq_u32(RowSums1H_pada, RowSums1H_ext); + uint32x2_t RowSums1H = {vdups_laneq_u32(RowSums1H_add, 0), + vdups_laneq_u32(RowSums1H_add, 2)}; + + RowSums1 = vaddq_u32(RowSums1, vcombine_u32(RowSums1L, RowSums1H)); + + D += 64; + k -= 8; + } + + if (k > 0) { + // + // zero pad the remaining columns to 8 + // + uint8_t* d = D; + + vst1q_u8(d, vmovq_n_u8(0)); + vst1q_u8(&d[16], vmovq_n_u8(0)); + vst1q_u8(&d[32], vmovq_n_u8(0)); + vst1q_u8(&d[48], vmovq_n_u8(0)); + + while (k > 0) { + d[0] = *a0++; + d[8] = *a1++; + d[16] = *a2++; + d[24] = *a3++; + d[32] = *a4++; + d[40] = *a5++; + d[48] = *a6++; + d[56] = *a7++; + d += 1; + k -= 1; + } + d = D; + uint64x1_t v0 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v1 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v2 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v3 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v4 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v5 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v6 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v7 = *reinterpret_cast(d); + d = d + 8; + + uint64x2_t z01 = vcombine_u64(v0, v1); + 
uint64x2_t z23 = vcombine_u64(v2, v3); + uint64x2_t z45 = vcombine_u64(v4, v5); + uint64x2_t z67 = vcombine_u64(v6, v7); + + uint32x4_t RowSums0L_pada = vmovq_n_u32(0); + RowSums0L_pada = vpadalq_u16(RowSums0L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z01))); + + uint32x4_t RowSums0L_ext = vextq_u32(RowSums0L_pada, RowSums0L_pada, 1); + uint32x4_t RowSums0L_add = vaddq_u32(RowSums0L_pada, RowSums0L_ext); + uint32x2_t RowSums0L = {vdups_laneq_u32(RowSums0L_add, 0), + vdups_laneq_u32(RowSums0L_add, 2)}; + + uint32x4_t RowSums0H_pada = vmovq_n_u32(0); + RowSums0H_pada = vpadalq_u16(RowSums0H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z23))); + + uint32x4_t RowSums0H_ext = vextq_u32(RowSums0H_pada, RowSums0H_pada, 1); + uint32x4_t RowSums0H_add = vaddq_u32(RowSums0H_pada, RowSums0H_ext); + uint32x2_t RowSums0H = {vdups_laneq_u32(RowSums0H_add, 0), + vdups_laneq_u32(RowSums0H_add, 2)}; + + RowSums0 = vaddq_u32(RowSums0, vcombine_u32(RowSums0L, RowSums0H)); + + uint32x4_t RowSums1L_pada = vmovq_n_u32(0); + RowSums1L_pada = vpadalq_u16(RowSums1L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z45))); + + uint32x4_t RowSums1L_ext = vextq_u32(RowSums1L_pada, RowSums1L_pada, 1); + uint32x4_t RowSums1L_add = vaddq_u32(RowSums1L_pada, RowSums1L_ext); + uint32x2_t RowSums1L = {vdups_laneq_u32(RowSums1L_add, 0), + vdups_laneq_u32(RowSums1L_add, 2)}; + + uint32x4_t RowSums1H_pada = vmovq_n_u32(0); + RowSums1H_pada = vpadalq_u16(RowSums1H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z67))); + + uint32x4_t RowSums1H_ext = vextq_u32(RowSums1H_pada, RowSums1H_pada, 1); + uint32x4_t RowSums1H_add = vaddq_u32(RowSums1H_pada, RowSums1H_ext); + uint32x2_t RowSums1H = {vdups_laneq_u32(RowSums1H_add, 0), + vdups_laneq_u32(RowSums1H_add, 2)}; + + RowSums1 = vaddq_u32(RowSums1, vcombine_u32(RowSums1L, RowSums1H)); + + D += 64; + } + + vst1q_s32(RowSumBuffer, vreinterpretq_s32_u32(RowSums0)); + vst1q_s32(&RowSumBuffer[4], vreinterpretq_s32_u32(RowSums1)); + + RowSumBuffer += 8; + + A = A + lda * 8; + CountM -= 8; + } + + // + // Process four rows of matrix A. + // + // The buffer is packed as a series of 32 byte vectors where four rows are + // interleaved with the following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // [ B0 B1 B2 B3 B4 B5 B6 B7 ] + // [ C0 C1 C2 C3 C4 C5 C6 C7 ] + // [ D0 D1 D2 D3 D4 D5 D6 D7 ] + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of eight, then the vector is padded + // with zeroes. 
+ // + + if (CountM >= 4) { + const uint8_t* a0 = A; + const uint8_t* a1 = a0 + lda; + const uint8_t* a2 = a1 + lda; + const uint8_t* a3 = a2 + lda; + + size_t k = CountK; + uint32x4_t RowSums = vmovq_n_u32(0); + + while (k >= 16) { + uint64x2_t v0 = vld1q_u64(reinterpret_cast(a0)); + a0 += 16; + uint64x2_t v1 = vld1q_u64(reinterpret_cast(a1)); + a1 += 16; + uint64x2_t v2 = vld1q_u64(reinterpret_cast(a2)); + a2 += 16; + uint64x2_t v3 = vld1q_u64(reinterpret_cast(a3)); + a3 += 16; + + uint64x2_t z0 = vzip1q_u64(v0, v1); + uint64x2_t z1 = vzip2q_u64(v0, v1); + uint64x2_t z2 = vzip1q_u64(v2, v3); + uint64x2_t z3 = vzip2q_u64(v2, v3); + + vst1q_u8(&D[0], vreinterpretq_u8_u64(z0)); + vst1q_u8(&D[16], vreinterpretq_u8_u64(z2)); + vst1q_u8(&D[32], vreinterpretq_u8_u64(z1)); + vst1q_u8(&D[48], vreinterpretq_u8_u64(z3)); + + uint32x4_t RowSumsL_pada = vmovq_n_u32(0); + RowSumsL_pada = vpadalq_u16(RowSumsL_pada, vpaddlq_u8(vreinterpretq_u8_u64(z0))); + RowSumsL_pada = vpadalq_u16(RowSumsL_pada, vpaddlq_u8(vreinterpretq_u8_u64(z1))); + + uint32x4_t RowSumsL_ext = vextq_u32(RowSumsL_pada, RowSumsL_pada, 1); + uint32x4_t RowSumsL_add = vaddq_u32(RowSumsL_pada, RowSumsL_ext); + uint32x2_t RowSumsL = {vdups_laneq_u32(RowSumsL_add, 0), + vdups_laneq_u32(RowSumsL_add, 2)}; + + uint32x4_t RowSumsH_pada = vmovq_n_u32(0); + RowSumsH_pada = vpadalq_u16(RowSumsH_pada, vpaddlq_u8(vreinterpretq_u8_u64(z2))); + RowSumsH_pada = vpadalq_u16(RowSumsH_pada, vpaddlq_u8(vreinterpretq_u8_u64(z3))); + + uint32x4_t RowSumsH_ext = vextq_u32(RowSumsH_pada, RowSumsH_pada, 1); + uint32x4_t RowSumsH_add = vaddq_u32(RowSumsH_pada, RowSumsH_ext); + uint32x2_t RowSumsH = {vdups_laneq_u32(RowSumsH_add, 0), + vdups_laneq_u32(RowSumsH_add, 2)}; + + RowSums = vaddq_u32(RowSums, vcombine_u32(RowSumsL, RowSumsH)); + + D += 64; + k -= 16; + } + + while (k >= 8) { + uint64x1_t v0 = *reinterpret_cast(a0); + a0 += 8; + uint64x1_t v1 = *reinterpret_cast(a1); + a1 += 8; + uint64x1_t v2 = *reinterpret_cast(a2); + a2 += 8; + uint64x1_t v3 = *reinterpret_cast(a3); + a3 += 8; + + *reinterpret_cast(&D[0]) = v0; + *reinterpret_cast(&D[8]) = v1; + *reinterpret_cast(&D[16]) = v2; + *reinterpret_cast(&D[24]) = v3; + + uint64x2_t z01 = vcombine_u64(v0, v1); + uint64x2_t z23 = vcombine_u64(v2, v3); + + uint32x4_t RowSumsL_pada = vmovq_n_u32(0); + RowSumsL_pada = vpadalq_u16(RowSumsL_pada, vpaddlq_u8(vreinterpretq_u8_u64(z01))); + + uint32x4_t RowSumsL_ext = vextq_u32(RowSumsL_pada, RowSumsL_pada, 1); + uint32x4_t RowSumsL_add = vaddq_u32(RowSumsL_pada, RowSumsL_ext); + uint32x2_t RowSumsL = {vdups_laneq_u32(RowSumsL_add, 0), + vdups_laneq_u32(RowSumsL_add, 2)}; + + uint32x4_t RowSumsH_pada = vmovq_n_u32(0); + RowSumsH_pada = vpadalq_u16(RowSumsH_pada, vpaddlq_u8(vreinterpretq_u8_u64(z23))); + + uint32x4_t RowSumsH_ext = vextq_u32(RowSumsH_pada, RowSumsH_pada, 1); + uint32x4_t RowSumsH_add = vaddq_u32(RowSumsH_pada, RowSumsH_ext); + uint32x2_t RowSumsH = {vdups_laneq_u32(RowSumsH_add, 0), + vdups_laneq_u32(RowSumsH_add, 2)}; + + RowSums = vaddq_u32(RowSums, vcombine_u32(RowSumsL, RowSumsH)); + + D += 32; + k -= 8; + } + + if (k > 0) { + // + // Copy the remaining bytes with zero padding. 
+ // + uint8_t* d = D; + + vst1q_u8(d, vmovq_n_u8(0)); + vst1q_u8(&d[16], vmovq_n_u8(0)); + + while (k > 0) { + d[0] = *a0++; + d[8] = *a1++; + d[16] = *a2++; + d[24] = *a3++; + d += 1; + k -= 1; + } + + d = D; + uint64x1_t v0 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v1 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v2 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v3 = *reinterpret_cast(d); + d = d + 8; + + uint64x2_t z01 = vcombine_u64(v0, v1); + uint64x2_t z23 = vcombine_u64(v2, v3); + + uint32x4_t RowSums0L_pada = vmovq_n_u32(0); + RowSums0L_pada = vpadalq_u16(RowSums0L_pada, vpaddlq_u8(vreinterpretq_u8_u64(z01))); + + uint32x4_t RowSums0L_ext = vextq_u32(RowSums0L_pada, RowSums0L_pada, 1); + uint32x4_t RowSums0L_add = vaddq_u32(RowSums0L_pada, RowSums0L_ext); + uint32x2_t RowSums0L = {vdups_laneq_u32(RowSums0L_add, 0), + vdups_laneq_u32(RowSums0L_add, 2)}; + + uint32x4_t RowSums0H_pada = vmovq_n_u32(0); + RowSums0H_pada = vpadalq_u16(RowSums0H_pada, vpaddlq_u8(vreinterpretq_u8_u64(z23))); + + uint32x4_t RowSums0H_ext = vextq_u32(RowSums0H_pada, RowSums0H_pada, 1); + uint32x4_t RowSums0H_add = vaddq_u32(RowSums0H_pada, RowSums0H_ext); + uint32x2_t RowSums0H = {vdups_laneq_u32(RowSums0H_add, 0), + vdups_laneq_u32(RowSums0H_add, 2)}; + + RowSums = vaddq_u32(RowSums, vcombine_u32(RowSums0L, RowSums0H)); + + D += 32; + } + + vst1q_s32(RowSumBuffer, vreinterpretq_s32_u32(RowSums)); + RowSumBuffer += 4; + + A = A + lda * 4; + CountM -= 4; + } + + // + // Process two rows of matrix A. + // + // The buffer is packed as a series of 16 byte vectors where two rows are + // interleaved with the following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // [ B0 B1 B2 B3 B4 B5 B6 B7 ] + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of eight, then the vector is padded + // with zeroes. 
+ // + + if (CountM >= 2) { + const uint8_t* a0 = A; + const uint8_t* a1 = a0 + lda; + + size_t k = CountK; + uint32x2_t RowSums = vmov_n_u32(0); + + while (k >= 16) { + uint64x2_t v0 = vld1q_u64(reinterpret_cast(a0)); + a0 += 16; + uint64x2_t v1 = vld1q_u64(reinterpret_cast(a1)); + a1 += 16; + + uint64x2_t z0 = vzip1q_u64(v0, v1); + uint64x2_t z1 = vzip2q_u64(v0, v1); + + vst1q_u8(&D[0], vreinterpretq_u8_u64(z0)); + vst1q_u8(&D[16], vreinterpretq_u8_u64(z1)); + + uint32x4_t RowSumsL_pada = vmovq_n_u32(0); + RowSumsL_pada = vpadalq_u16(RowSumsL_pada, vpaddlq_u8(vreinterpretq_u8_u64(z0))); + RowSumsL_pada = vpadalq_u16(RowSumsL_pada, vpaddlq_u8(vreinterpretq_u8_u64(z1))); + + uint32x4_t RowSumsL_ext = vextq_u32(RowSumsL_pada, RowSumsL_pada, 1); + uint32x4_t RowSumsL_add = vaddq_u32(RowSumsL_pada, RowSumsL_ext); + uint32x2_t RowSumsL = {vdups_laneq_u32(RowSumsL_add, 0), + vdups_laneq_u32(RowSumsL_add, 2)}; + + RowSums = vadd_u32(RowSums, RowSumsL); + + D += 32; + k -= 16; + } + + while (k >= 8) { + uint64x1_t v0 = *reinterpret_cast(a0); + a0 += 8; + uint64x1_t v1 = *reinterpret_cast(a1); + a1 += 8; + + *reinterpret_cast(&D[0]) = v0; + *reinterpret_cast(&D[8]) = v1; + + uint64x2_t z01 = vcombine_u64(v0, v1); + uint32x4_t RowSumsL_pada = vmovq_n_u32(0); + RowSumsL_pada = vpadalq_u16(RowSumsL_pada, vpaddlq_u8(vreinterpretq_u8_u64(z01))); + + uint32x4_t RowSumsL_ext = vextq_u32(RowSumsL_pada, RowSumsL_pada, 1); + uint32x4_t RowSumsL_add = vaddq_u32(RowSumsL_pada, RowSumsL_ext); + uint32x2_t RowSumsL = {vdups_laneq_u32(RowSumsL_add, 0), + vdups_laneq_u32(RowSumsL_add, 2)}; + + RowSums = vadd_u32(RowSums, RowSumsL); + + D += 16; + k -= 8; + } + + if (k > 0) { + // + // Zero pad the remaining elements to make 8 columns. + // + + uint8_t* d = PaddedMatrixAData; + vst1q_u8(PaddedMatrixAData, vmovq_n_u8(0)); + + while (k > 0) { + d[0] = *a0++; + d[8] = *a1++; + + d += 1; + k -= 1; + } + + d = PaddedMatrixAData; + uint64x1_t v0 = *reinterpret_cast(d); + d = d + 8; + uint64x1_t v1 = *reinterpret_cast(d); + d = d + 8; + + uint64x2_t z01 = vcombine_u64(v0, v1); + uint32x4_t RowSumsL_pada = vmovq_n_u32(0); + RowSumsL_pada = vpadalq_u16(RowSumsL_pada, vpaddlq_u8(vreinterpretq_u8_u64(z01))); + + uint32x4_t RowSumsL_ext = vextq_u32(RowSumsL_pada, RowSumsL_pada, 1); + uint32x4_t RowSumsL_add = vaddq_u32(RowSumsL_pada, RowSumsL_ext); + uint32x2_t RowSumsL = {vdups_laneq_u32(RowSumsL_add, 0), + vdups_laneq_u32(RowSumsL_add, 2)}; + + RowSums = vadd_u32(RowSums, RowSumsL); + + uint8x16_t PackedVector = vld1q_u8(PaddedMatrixAData); + vst1q_u8(D, PackedVector); + + D += 16; + } + + vst1_s32(RowSumBuffer, vreinterpret_s32_u32(RowSums)); + RowSumBuffer += 2; + + A = A + lda * 2; + CountM -= 2; + } + + // + // Process one row of matrix A. + // + // The buffer is packed as a series of 8 byte with the following pattern: + // + // [ A0 A1 A2 A3 A4 A5 A6 A7 ] + // + // This pattern is repeated (CountK / 8) times. + // + // If CountK is not aligned to a multiple of 8, then the vector is padded + // with zeroes. + // + + if (CountM > 0) { + // No need to pad the rows to 2, the .S takes care of zero pdding + const uint8_t* a = A; + size_t k = CountK; + uint32x4_t RowSums = vmovq_n_u32(0); + + while (k >= 16) { + uint8x16_t v = vld1q_u8(a); + a += 16; + + vst1q_u8(D, v); + + RowSums = vpadalq_u16(RowSums, vpaddlq_u8(v)); + + D += 16; + k -= 16; + } + + if (k > 0) { + // + // Copy the remaining bytes to the zero padded stack buffer. 
+ // + + vst1q_u8(PaddedMatrixAData, vmovq_n_u8(0)); + + for (size_t kk = 0; kk < k; kk++) { + PaddedMatrixAData[kk] = a[kk]; + } + + uint8x16_t v = vld1q_u8(PaddedMatrixAData); + vst1q_u8(D, v); + + RowSums = vpadalq_u16(RowSums, vpaddlq_u8(v)); + } + + *RowSumBuffer = int32_t(vaddvq_u32(RowSums)); + } +} + +MLAS_FORCEINLINE +void +MlasGemmU8X8CopyPackBProcessUmmla(MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedBType* D, + uint8x8_t BytesRow[8], + uint8x16_t BitFlipVector, + uint32x4_t ColumnSums[2]) +{ + uint8x16_t v02 = veorq_u8(vcombine_u8(BytesRow[0], BytesRow[2]), BitFlipVector); + uint8x16_t v13 = veorq_u8(vcombine_u8(BytesRow[1], BytesRow[3]), BitFlipVector); + + uint8x16_t v46 = veorq_u8(vcombine_u8(BytesRow[4], BytesRow[6]), BitFlipVector); + uint8x16_t v57 = veorq_u8(vcombine_u8(BytesRow[5], BytesRow[7]), BitFlipVector); + + uint8x16x2_t zw1 = vzipq_u8(v02, v13); + uint16x8x2_t zd1 = + vzipq_u16(vreinterpretq_u16_u8(zw1.val[0]), vreinterpretq_u16_u8(zw1.val[1])); + + uint8x16x2_t zw2 = vzipq_u8(v46, v57); + uint16x8x2_t zd2 = + vzipq_u16(vreinterpretq_u16_u8(zw2.val[0]), vreinterpretq_u16_u8(zw2.val[1])); + + uint32x4x2_t zd3 = + vzipq_u32(vreinterpretq_u32_u16(zd1.val[0]), vreinterpretq_u32_u16(zd2.val[0])); + uint32x4x2_t zd4 = + vzipq_u32(vreinterpretq_u32_u16(zd1.val[1]), vreinterpretq_u32_u16(zd2.val[1])); + + vst1q_u8(&D[0], vreinterpretq_u8_u32(zd3.val[0])); + vst1q_u8(&D[16], vreinterpretq_u8_u32(zd3.val[1])); + vst1q_u8(&D[32], vreinterpretq_u8_u32(zd4.val[0])); + vst1q_u8(&D[48], vreinterpretq_u8_u32(zd4.val[1])); + + uint32x4_t ColSums0L_pada = vmovq_n_u32(0); + ColSums0L_pada = vpadalq_u16(ColSums0L_pada, vpaddlq_u8(vreinterpretq_u8_u32(zd3.val[0]))); + uint32x4_t ColSums0L_ext = vextq_u32(ColSums0L_pada, ColSums0L_pada, 1); + uint32x4_t ColSums0L_add = vaddq_u32(ColSums0L_pada, ColSums0L_ext); + uint32x2_t ColSums0L = {vdups_laneq_u32(ColSums0L_add, 0), vdups_laneq_u32(ColSums0L_add, 2)}; + + uint32x4_t ColSums0H_pada = vmovq_n_u32(0); + ColSums0H_pada = vpadalq_u16(ColSums0H_pada, vpaddlq_u8(vreinterpretq_u8_u32(zd3.val[1]))); + uint32x4_t ColSums0H_ext = vextq_u32(ColSums0H_pada, ColSums0H_pada, 1); + uint32x4_t ColSums0H_add = vaddq_u32(ColSums0H_pada, ColSums0H_ext); + uint32x2_t ColSums0H = {vdups_laneq_u32(ColSums0H_add, 0), vdups_laneq_u32(ColSums0H_add, 2)}; + + ColumnSums[0] = vaddq_u32(ColumnSums[0], vcombine_u32(ColSums0L, ColSums0H)); + + uint32x4_t ColSums1L_pada = vmovq_n_u32(0); + ColSums1L_pada = vpadalq_u16(ColSums1L_pada, vpaddlq_u8(vreinterpretq_u8_u32(zd4.val[0]))); + uint32x4_t ColSums1L_ext = vextq_u32(ColSums1L_pada, ColSums1L_pada, 1); + uint32x4_t ColSums1L_add = vaddq_u32(ColSums1L_pada, ColSums1L_ext); + uint32x2_t ColSums1L = {vdups_laneq_u32(ColSums1L_add, 0), vdups_laneq_u32(ColSums1L_add, 2)}; + + uint32x4_t ColSums1H_pada = vmovq_n_u32(0); + ColSums1H_pada = vpadalq_u16(ColSums1H_pada, vpaddlq_u8(vreinterpretq_u8_u32(zd4.val[1]))); + uint32x4_t ColSums1H_ext = vextq_u32(ColSums1H_pada, ColSums1H_pada, 1); + uint32x4_t ColSums1H_add = vaddq_u32(ColSums1H_pada, ColSums1H_ext); + uint32x2_t ColSums1H = {vdups_laneq_u32(ColSums1H_add, 0), vdups_laneq_u32(ColSums1H_add, 2)}; + + ColumnSums[1] = vaddq_u32(ColumnSums[1], vcombine_u32(ColSums1L, ColSums1H)); +} + +template <> +void +MlasGemmQuantCopyPackB(MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedBType* D, + const uint8_t* B, + size_t ldb, + size_t CountN, + size_t CountK, + int32_t* ColumnSumBuffer, + bool BIsSigned) +{ + const uint8x16_t BitFlipVector = vdupq_n_u8(BIsSigned ? 
0x80 : 0); + uint8x8_t BytesRow[8]; + + // + // Copy data from matrix B into the destination buffer 8x2 blocks at a + // time. + // + // + while (CountN >= 8) { + const uint8_t* b = B; + size_t k = CountK; + uint32x4_t ColumnSums[2]; + ColumnSums[0] = vmovq_n_u32(0); + ColumnSums[1] = vmovq_n_u32(0); + + while (k >= 8) { + BytesRow[0] = vld1_u8(&b[ldb * 0]); + BytesRow[1] = vld1_u8(&b[ldb * 1]); + BytesRow[2] = vld1_u8(&b[ldb * 2]); + BytesRow[3] = vld1_u8(&b[ldb * 3]); + BytesRow[4] = vld1_u8(&b[ldb * 4]); + BytesRow[5] = vld1_u8(&b[ldb * 5]); + BytesRow[6] = vld1_u8(&b[ldb * 6]); + BytesRow[7] = vld1_u8(&b[ldb * 7]); + + MlasGemmU8X8CopyPackBProcessUmmla(D, BytesRow, BitFlipVector, ColumnSums); + + D += 64; + b += ldb * 8; + k -= 8; + } + + if (k > 0) { + // Pad k to 8 + + BytesRow[0] = vld1_u8(&b[ldb * 0]); + BytesRow[1] = (k >= 2) ? vld1_u8(&b[ldb * 1]) : vget_low_u8(BitFlipVector); + BytesRow[2] = (k >= 3) ? vld1_u8(&b[ldb * 2]) : vget_low_u8(BitFlipVector); + BytesRow[3] = (k >= 4) ? vld1_u8(&b[ldb * 3]) : vget_low_u8(BitFlipVector); + BytesRow[4] = (k >= 5) ? vld1_u8(&b[ldb * 4]) : vget_low_u8(BitFlipVector); + BytesRow[5] = (k >= 6) ? vld1_u8(&b[ldb * 5]) : vget_low_u8(BitFlipVector); + BytesRow[6] = (k >= 7) ? vld1_u8(&b[ldb * 6]) : vget_low_u8(BitFlipVector); + BytesRow[7] = vget_low_u8(BitFlipVector); + + MlasGemmU8X8CopyPackBProcessUmmla(D, BytesRow, BitFlipVector, ColumnSums); + + D += 64; + } + + // Zero pad the output buffer to a multiple of PackedK if the above + // processed an odd number of four row bundles. + // + vst1q_s32(&ColumnSumBuffer[0], vreinterpretq_s32_u32(ColumnSums[0])); + vst1q_s32(&ColumnSumBuffer[4], vreinterpretq_s32_u32(ColumnSums[1])); + + ColumnSumBuffer += 8; + + B += 8; + CountN -= 8; + } + + // + // Process the remaining columns of matrix B. + // + + if (CountN > 0) { + const uint8_t* b = B; + size_t k = CountK; + uint8_t PaddedMatrixBData[64]; + uint32x4_t ColumnSums[2]; + + vst1q_u8(&PaddedMatrixBData[0], BitFlipVector); + vst1q_u8(&PaddedMatrixBData[16], BitFlipVector); + vst1q_u8(&PaddedMatrixBData[32], BitFlipVector); + vst1q_u8(&PaddedMatrixBData[48], BitFlipVector); + + ColumnSums[0] = vmovq_n_u32(0); + ColumnSums[1] = vmovq_n_u32(0); + + // + // Interleave rows of matrix B using an intermediate zero padded stack + // buffer and write to the packed buffer. + // + + while (k > 0) { + const uint8_t* bcopy0 = &b[ldb * 0]; + const uint8_t* bcopy1 = &b[ldb * 1]; + const uint8_t* bcopy2 = &b[ldb * 2]; + const uint8_t* bcopy3 = &b[ldb * 3]; + const uint8_t* bcopy4 = &b[ldb * 4]; + const uint8_t* bcopy5 = &b[ldb * 5]; + const uint8_t* bcopy6 = &b[ldb * 6]; + const uint8_t* bcopy7 = &b[ldb * 7]; + + if (k >= 8) { + b += ldb * 8; + k -= 8; + + } else { + vst1q_u8(&PaddedMatrixBData[0], BitFlipVector); + vst1q_u8(&PaddedMatrixBData[16], BitFlipVector); + vst1q_u8(&PaddedMatrixBData[32], BitFlipVector); + vst1q_u8(&PaddedMatrixBData[48], BitFlipVector); + + bcopy1 = (k >= 2) ? bcopy1 : &PaddedMatrixBData[56]; + bcopy2 = (k >= 3) ? bcopy2 : &PaddedMatrixBData[56]; + bcopy3 = (k >= 4) ? bcopy3 : &PaddedMatrixBData[56]; + bcopy4 = (k >= 5) ? bcopy4 : &PaddedMatrixBData[56]; + bcopy5 = (k >= 6) ? bcopy5 : &PaddedMatrixBData[56]; + bcopy6 = (k >= 7) ? 
bcopy6 : &PaddedMatrixBData[56]; + bcopy7 = &PaddedMatrixBData[56]; + + k = 0; + } + + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; + do { + padded[0] = *bcopy0++; + padded[8] = *bcopy1++; + padded[16] = *bcopy2++; + padded[24] = *bcopy3++; + padded[32] = *bcopy4++; + padded[40] = *bcopy5++; + padded[48] = *bcopy6++; + padded[56] = *bcopy7++; + + } while (++padded < padded_end); + + BytesRow[0] = vld1_u8(&PaddedMatrixBData[0]); + BytesRow[1] = vld1_u8(&PaddedMatrixBData[8]); + BytesRow[2] = vld1_u8(&PaddedMatrixBData[16]); + BytesRow[3] = vld1_u8(&PaddedMatrixBData[24]); + BytesRow[4] = vld1_u8(&PaddedMatrixBData[32]); + BytesRow[5] = vld1_u8(&PaddedMatrixBData[40]); + BytesRow[6] = vld1_u8(&PaddedMatrixBData[48]); + BytesRow[7] = vld1_u8(&PaddedMatrixBData[56]); + + MlasGemmU8X8CopyPackBProcessUmmla(D, BytesRow, BitFlipVector, ColumnSums); + + D += 64; + } + + vst1q_s32(&ColumnSumBuffer[0], vreinterpretq_s32_u32(ColumnSums[0])); + vst1q_s32(&ColumnSumBuffer[4], vreinterpretq_s32_u32(ColumnSums[1])); + } +} + +template <> +MLAS_FORCEINLINE size_t +MlasGemmQuantKernel(const MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedAType* A, + const MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedBType* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumBuffer, + const int32_t* ColumnSumBuffer, + const int32_t* ZeroPointB, + bool ZeroMode) +{ + size_t RowsHandled; + + if (ZeroMode) { + RowsHandled = MlasGemmU8X8KernelUmmlaZero(A, B, C, PackedCountK, CountM, CountN, ldc, + RowSumBuffer, ColumnSumBuffer, ZeroPointB); + } else { + RowsHandled = MlasGemmU8X8KernelUmmlaAdd(A, B, C, PackedCountK, CountM, CountN, ldc, + RowSumBuffer, ColumnSumBuffer, ZeroPointB); + } + + return RowsHandled; +} + +const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchUmmla = { + MlasGemmQuantOperation, + MlasGemmQuantPackedOperation, + MlasGemmQuantCopyPackB, + MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedK, + MLAS_GEMM_U8X8_KERNEL_UMMLA::PackedStrides.K, + 8}; diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index c4416068e2457..5a441b1d1701e 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -50,6 +50,7 @@ #include "core/optimizer/matmul_integer_to_float.h" #include "core/optimizer/matmul_scale_fusion.h" #include "core/optimizer/matmul_transpose_fusion.h" +#include "core/optimizer/matmul_bn_fusion.h" #include "core/optimizer/nchwc_transformer.h" #include "core/optimizer/noop_elimination.h" #include "core/optimizer/not_where_fusion.h" @@ -127,6 +128,7 @@ InlinedVector> GenerateRewriteRules( rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); + rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); break; diff --git a/onnxruntime/core/optimizer/initializer.cc b/onnxruntime/core/optimizer/initializer.cc index c8da15f65a6d7..9e807ddc7be59 100644 --- a/onnxruntime/core/optimizer/initializer.cc +++ b/onnxruntime/core/optimizer/initializer.cc @@ -291,7 +291,11 @@ Initializer& Initializer::sqrt() { namespace { template struct ScaleByAxis { - void operator()(Tensor& data, const Tensor& scalers, const size_t block_size, const size_t num_blocks) const { + void operator()(Tensor& data, + const Tensor& scalers, + const size_t block_size, + const size_t num_blocks, + const bool column_major) const { ToNumeric to_numeric; 
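// Annotation for the new column_major parameter: when it is false (the original
// behaviour), `scalers` carries one value per block (num_blocks entries) and every
// element of block i is multiplied by scalers[i]; when it is true, `scalers` carries
// one value per position within a block (block_size entries) and element j of every
// block is multiplied by scalers[j]. The MatMul+BatchNormalization fusion below uses
// the column_major form to scale each column of the MatMul B initializer independently.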
const auto scaler_size = scalers.Shape().Size(); T* dst = data.MutableData(); @@ -303,24 +307,32 @@ struct ScaleByAxis { } } else { for (size_t block_offset = 0, i = 0; i < num_blocks; i++) { - const auto numeric_scaler = to_numeric(scalers_data[i]); - for (size_t j = 0; j < block_size; ++j, ++block_offset) { - dst[block_offset] = T(to_numeric(dst[block_offset]) * numeric_scaler); + if (column_major) { + for (size_t j = 0; j < block_size; ++j, ++block_offset) { + const auto numeric_scaler = to_numeric(scalers_data[j]); + dst[block_offset] = T(to_numeric(dst[block_offset]) * numeric_scaler); + } + } else { + const auto numeric_scaler = to_numeric(scalers_data[i]); + for (size_t j = 0; j < block_size; ++j, ++block_offset) { + dst[block_offset] = T(to_numeric(dst[block_offset]) * numeric_scaler); + } } } } } }; - } // namespace -void Initializer::scale_by_axis(const Initializer& scalers, int axis) { +void Initializer::scale_by_axis(const Initializer& scalers, int axis, bool column_major) { ORT_ENFORCE(axis >= 0, "Axis must be non-negative"); const size_t block_size = narrow(data_.Shape().SizeFromDimension(gsl::narrow_cast(axis))); const size_t num_blocks = size() / block_size; - ORT_ENFORCE(scalers.size() == 1 || scalers.size() == num_blocks, "Invalid other(scalers) size"); + ORT_ENFORCE(scalers.size() == 1 || + (column_major ? scalers.size() == block_size : scalers.size() == num_blocks), + "Invalid other(scalers) size"); utils::MLTypeCallDispatcher t_disp(data_.GetElementType()); - t_disp.Invoke(data_, scalers.data_, block_size, num_blocks); + t_disp.Invoke(data_, scalers.data_, block_size, num_blocks, column_major); } #endif // ORT_EXTENDED_MINIMAL_BUILD } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/initializer.h b/onnxruntime/core/optimizer/initializer.h index dfe054ba1aced..78e3fd6a3d24e 100644 --- a/onnxruntime/core/optimizer/initializer.h +++ b/onnxruntime/core/optimizer/initializer.h @@ -86,7 +86,7 @@ class Initializer final { Initializer& sqrt(); - void scale_by_axis(const Initializer& other, int axis); + void scale_by_axis(const Initializer& other, int axis, bool column_major = false); #endif // ORT_EXTENDED_MINIMAL_BUILD private: std::string name_; diff --git a/onnxruntime/core/optimizer/matmul_bn_fusion.cc b/onnxruntime/core/optimizer/matmul_bn_fusion.cc new file mode 100644 index 0000000000000..e944522c9c338 --- /dev/null +++ b/onnxruntime/core/optimizer/matmul_bn_fusion.cc @@ -0,0 +1,230 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/optimizer/matmul_bn_fusion.h" +#include "core/graph/graph_utils.h" +#include "core/optimizer/initializer.h" +#include "core/optimizer/utils.h" + +namespace onnxruntime { + +namespace { +const std::vector>> ignorable_nodes{ + {"Reshape", {1, 5, 13, 14, 19}}, + {"Transpose", {1, 13}}}; +const std::pair> dest = {"BatchNormalization", {1, 6, 7, 9, 14, 15}}; +} // namespace + +bool NodeIsIgnorable(const Graph& graph, const Node& root_node, NodeIndex curr_node_index) { + const Node* curr_node = graph.GetNode(curr_node_index); + + // curr_node has different execution provider then it's parent or + // has output edge != 1 (this condition will handle the case when ignorable node + // is graph output i.e. a graph like this "MatMul->Transpose") + if (curr_node->GetExecutionProviderType() != root_node.GetExecutionProviderType() || + curr_node->GetOutputEdgesCount() != 1) { + return false; + } + + // curr_node can be any of the ignorable_nodes. 
+ for (size_t index = 0; index < ignorable_nodes.size(); index++) { + if (graph_utils::IsSupportedOptypeVersionAndDomain(*curr_node, ignorable_nodes[index].first, ignorable_nodes[index].second)) { + return true; + } + } + + return false; +} + +std::optional MatchPath(const Graph& graph, const Node& root_node, NodeIndex curr_node_index) { + while (NodeIsIgnorable(graph, root_node, curr_node_index)) { + curr_node_index = graph.GetNode(curr_node_index)->OutputNodesBegin()->Index(); + } + + // curr_node is neither ignorable nor dest + const Node* curr_node = graph.GetNode(curr_node_index); + if (curr_node->OpType() != dest.first) { + return std::nullopt; + } + + if (curr_node->GetExecutionProviderType() == root_node.GetExecutionProviderType() && + graph_utils::IsSupportedOptypeVersionAndDomain(*curr_node, dest.first, dest.second)) { + return curr_node_index; + } + + // either curr_node has different execution provider or + // has invalid opset. + return std::nullopt; +} + +/* + * Given a MatMul node, it will verify the following pattern. + * MatMul GEMM + * | | + * Reshape ^ ---> Reshape ^ + * | | + * Transpose ^ Transpose ^ + * | + * BatchNormalization + * Note: ^ means there can be 0 or any occurrences of that node. + * Few example fusable pattern: + * - MatMul -> Reshape -> Transpose -> BatchNormalization ---> GEMM -> Reshape -> Transpose + * - MatMul -> Reshape -> BatchNormalization ---> GEMM -> Reshape + * - MatMul -> Transpose -> BatchNormalization ---> GEMM -> Transpose + * - MatMul -> Reshape -> Reshape -> BatchNormalization ---> GEMM -> Reshape -> Reshape + * - MatMul -> Reshape -> Transpose -> Reshape -> BatchNormalization ---> GEMM -> Reshape -> Transpose -> Reshape + * - MatMul -> BatchNormalization ---> GEMM + * Other Conditions: + * - B tensor of MatMul should be constant. + * - scale, B, mean, var tensors of BatchNormalization should be constant. + * - Every node in the path, except the BatchNormalization, should have only 1 output edge. + */ +bool MatmulBNFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger&) const { + if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "MatMul", {1, 9, 13}) || + node.GetOutputEdgesCount() != 1) { + return false; + } + + if (graph.NodeProducesGraphOutput(node)) { + return false; + } + + // because is not producing graph output, it means it will have a child node + NodeIndex child_node_index = node.OutputNodesBegin()->Index(); + std::optional batch_norm_index = MatchPath(graph, node, child_node_index); + if (!batch_norm_index.has_value()) { + return false; + } + + const Node* batch_norm_node = graph.GetNode(*batch_norm_index); + + // Check that the appropriate inputs to the Matmul and BN nodes are constants. + if (!graph_utils::NodeArgIsConstant(graph, *node.InputDefs()[1]) || + !graph_utils::NodeArgIsConstant(graph, *batch_norm_node->InputDefs()[1]) || + !graph_utils::NodeArgIsConstant(graph, *batch_norm_node->InputDefs()[2]) || + !graph_utils::NodeArgIsConstant(graph, *batch_norm_node->InputDefs()[3]) || + !graph_utils::NodeArgIsConstant(graph, *batch_norm_node->InputDefs()[4])) { + return false; + } + + // First output from BN is required. Others are optional. If any optional outputs exist we can't fuse. 
+ const auto& output_defs = batch_norm_node->OutputDefs(); + if (output_defs.size() > 1) { + for (size_t i = 1, end = output_defs.size(); i < end; ++i) { + if (output_defs[i] != nullptr && output_defs[i]->Exists()) { + return false; + } + } + } + + return true; +} + +/* + * BatchNormalization: [https://learn.microsoft.com/en-us/windows/win32/api/directml/ns-directml-dml_batch_normalization_operator_desc] + * Scale * ((Input - Mean) / sqrt(Variance + Epsilon)) + Bias // ignore the FusedActivation in the above definition, that's very specific to DML + * Expanding out the terms: + * Output = (Scale / sqrt(Variance + Epsilon)) * Input + (Scale / sqrt(Variance + Epsilon)) * -Mean + Bias + * Here, + * [Scale/sqrt(Variance + Epsilon)] is constant, and let's call it `alpha` + * [(Scale / sqrt(Variance + Epsilon)) * -Mean + Bias] is also constant, and let's call it `beta` + * Output = alpha * Input + beta, Input = B tensor of MatMul. + * + */ +Status MatmulBNFusion::Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& rule_effect, const logging::Logger&) const { + NodeIndex child_node_index = matmul_node.OutputNodesBegin()->Index(); + NodeIndex batch_norm_node_index = MatchPath(graph, matmul_node, child_node_index).value(); + + Node& batch_norm_node = *graph.GetNode(batch_norm_node_index); // need mutable node, that's why extracting node from graph + + // only perform fusion if epsilon is present and is of float_32 type + auto epsilon_attribute = batch_norm_node.GetAttributes().find("epsilon"); + if (epsilon_attribute == batch_norm_node.GetAttributes().end() || + epsilon_attribute->second.type() != ONNX_NAMESPACE::AttributeProto_AttributeType_FLOAT) { + return Status::OK(); + } + const float epsilon = epsilon_attribute->second.f(); + + const onnx::TensorProto* scale_tensor = graph_utils::GetConstantInitializer(graph, batch_norm_node.InputDefs()[1]->Name()); + ORT_ENFORCE(scale_tensor); + const onnx::TensorProto* bias_tensor = graph_utils::GetConstantInitializer(graph, batch_norm_node.InputDefs()[2]->Name()); + ORT_ENFORCE(bias_tensor); + const onnx::TensorProto* mean_tensor = graph_utils::GetConstantInitializer(graph, batch_norm_node.InputDefs()[3]->Name()); + ORT_ENFORCE(mean_tensor); + const onnx::TensorProto* var_tensor = graph_utils::GetConstantInitializer(graph, batch_norm_node.InputDefs()[4]->Name()); + ORT_ENFORCE(var_tensor); + const onnx::TensorProto* matmul_b_tensor = graph_utils::GetConstantInitializer(graph, matmul_node.InputDefs()[1]->Name()); + ORT_ENFORCE(matmul_b_tensor); + + if (!optimizer_utils::IsFloatingPointDataType(*matmul_b_tensor) || + !optimizer_utils::IsFloatingPointDataType(*scale_tensor) || + !optimizer_utils::IsFloatingPointDataType(*bias_tensor) || + !optimizer_utils::IsFloatingPointDataType(*mean_tensor) || + !optimizer_utils::IsFloatingPointDataType(*var_tensor) || + scale_tensor->dims_size() != 1 || + bias_tensor->dims_size() != 1 || + mean_tensor->dims_size() != 1 || + var_tensor->dims_size() != 1 || + scale_tensor->dims(0) != matmul_b_tensor->dims(1) || + bias_tensor->dims(0) != matmul_b_tensor->dims(1) || + mean_tensor->dims(0) != matmul_b_tensor->dims(1) || + var_tensor->dims(0) != matmul_b_tensor->dims(1)) { + return Status::OK(); + } + + /* + * temp = scale / sqrt(var + epsilon) + * output = (temp * Input) - ((temp * mean) + bias) + */ + Initializer scale(*scale_tensor, graph.ModelPath()); + Initializer bias(*bias_tensor, graph.ModelPath()); + Initializer mean(*mean_tensor, graph.ModelPath()); + Initializer var(*var_tensor, graph.ModelPath()); + 
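// Worked example of the folding performed below (numbers chosen only for
// illustration, not taken from any model): for one output channel with
// scale = 2, var = 3, epsilon = 1, mean = 5, bias = 7:
//   alpha = scale / sqrt(var + epsilon) = 2 / sqrt(4) = 1
//   beta  = bias - mean * alpha         = 7 - 5 * 1   = 2
// BatchNormalization computes alpha * x + beta = x + 2 for that channel. After the
// fusion, `scale` holds alpha, `matmul_b` is scaled column-wise by alpha via
// scale_by_axis(scale, 1, true), and `bias` holds beta, so Gemm(A, B * alpha, beta)
// reproduces the same output.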
Initializer matmul_b(*matmul_b_tensor, graph.ModelPath()); + + var.add(epsilon); + var.sqrt(); + scale.div(var); // this is the temp + matmul_b.scale_by_axis(scale, 1, true); + + mean.mul(scale); + bias.sub(mean); + + // create B tensorProto for new Gemm node from initializer. + ONNX_NAMESPACE::TensorProto new_gemm_b_tensor(*matmul_b_tensor); + matmul_b.ToProto(new_gemm_b_tensor); + const std::string new_gemm_b_name = graph.GenerateNodeArgName("MatMulBnFusion_GemmB_" + matmul_b_tensor->name()); + new_gemm_b_tensor.set_name(new_gemm_b_name); + NodeArg& new_gemm_b_node_arg = graph_utils::AddInitializer(graph, new_gemm_b_tensor); + + // create bias tensorProto for new Gemm node from initializer. + ONNX_NAMESPACE::TensorProto new_gemm_bias_tensor(*bias_tensor); + bias.ToProto(new_gemm_bias_tensor); + const std::string new_gemm_bias_name = graph.GenerateNodeArgName("MatMulBnFusion_GemmBias"); + new_gemm_bias_tensor.set_name(new_gemm_bias_name); + NodeArg& new_gemm_bias_node_arg = graph_utils::AddInitializer(graph, new_gemm_bias_tensor); + + Node& gemm_node = graph.AddNode( + graph.GenerateNodeArgName("MatMulBnFusion_Gemm"), + "Gemm", + "Generated from Matmul BatchNormalization fusion", + {matmul_node.MutableInputDefs()[0], &new_gemm_b_node_arg, &new_gemm_bias_node_arg}, + matmul_node.MutableOutputDefs(), + nullptr, + kOnnxDomain); + + // Remove MatMul node. + Node* node = graph.GetNode(matmul_node.Index()); + graph_utils::RemoveNodeOutputEdges(graph, *node); + graph.RemoveNode(matmul_node.Index()); + + // Delete optional empty output defs. + // Delete BatchNormalization node and update the input of the child of BatchNormalization + batch_norm_node.MutableOutputDefs().resize(1); + NodeIndex batch_norm_parent_index = graph.GetNode(child_node_index)->OpType() == "BatchNormalization" ? gemm_node.Index() : batch_norm_node.InputNodesBegin()->Index(); + graph_utils::FinalizeNodeFusion(graph, *graph.GetNode(batch_norm_parent_index), batch_norm_node); + + rule_effect = RewriteRuleEffect::kRemovedCurrentNode; + return Status::OK(); +} +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/optimizer/matmul_bn_fusion.h b/onnxruntime/core/optimizer/matmul_bn_fusion.h new file mode 100644 index 0000000000000..7a43483cf37d4 --- /dev/null +++ b/onnxruntime/core/optimizer/matmul_bn_fusion.h @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/optimizer/rewrite_rule.h" + +namespace onnxruntime { +/* + * This fusion submerges a BatchNormalization operator to it's super + * precedding MatMul operator, if and only if MatmulBNFusion::SatisfyCondition() + * is true. + */ +class MatmulBNFusion : public RewriteRule { + public: + MatmulBNFusion() : RewriteRule("MatMul_BatchNormalization_Fusion") {} + + std::vector TargetOpTypes() const noexcept override { + return {"MatMul"}; + } + + private: + bool SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger& logger) const override; + + Status Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& rule_effect, const logging::Logger& logger) const override; +}; +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/optimizer/transformer_memcpy.cc b/onnxruntime/core/optimizer/transformer_memcpy.cc index 07f391f2ae430..ed3e35706b688 100644 --- a/onnxruntime/core/optimizer/transformer_memcpy.cc +++ b/onnxruntime/core/optimizer/transformer_memcpy.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
#include "transformer_memcpy.h" +#include "core/common/logging/logging.h" #include "core/framework/kernel_registry_manager.h" #include "core/framework/execution_providers.h" #include "core/framework/utils.h" @@ -16,12 +17,12 @@ class TransformerMemcpyImpl { TransformerMemcpyImpl(onnxruntime::Graph& graph, const std::string& provider) : graph_(graph), provider_(provider) {} - bool ModifyGraph(const KernelRegistryManager& schema_registries); + bool ModifyGraph(const KernelRegistryManager& schema_registries, const logging::Logger& logger, int& copy_node_counter); private: void ProcessDefs(onnxruntime::Node& node, const KernelRegistryManager& kernel_registries, InitializedTensorSet& initializers_consumed); void BuildDefsMapping(const onnxruntime::NodeArg* arg, const KernelRegistryManager& kernel_registries); - void AddCopyNode(onnxruntime::NodeArg* arg, bool is_input); + void AddCopyNode(onnxruntime::NodeArg* arg, bool is_input, const logging::Logger& logger); bool ProcessInitializers(const KernelRegistryManager& kernel_registries, const InitializedTensorSet& initializers_consumed); private: @@ -61,11 +62,21 @@ static const onnx::TensorProto* GetInitializer(const Graph& graph, const std::st // very simple GraphTransformer that uses TransformerMemcpyImpl for each graph // and mainly provides the subgraph recursion functionality -common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { +common::Status MemcpyTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level, + const logging::Logger& logger) const { for (auto& provider : provider_types_) { if (!utils::ProviderIsCpuBased(provider)) { TransformerMemcpyImpl copy_impl(graph, provider); - auto current_modified = copy_impl.ModifyGraph(registry_manager_); + + int copy_node_counter = 0; + auto current_modified = copy_impl.ModifyGraph(registry_manager_, logger, copy_node_counter); + if (copy_node_counter > 0 && provider == kCudaExecutionProvider) { + LOGS(logger, WARNING) << copy_node_counter << " Memcpy nodes are added to the graph " << graph.Name() + << " for " << provider + << ". It might have negative impact on performance (including unable to run CUDA graph). " + << "Set session_options.log_severity_level=1 to see the detail logs before this message."; + } + modified = modified || current_modified; break; } @@ -111,7 +122,9 @@ This transformer does not currently optimize copies between, e.g., two different */ -bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_registries) { +bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_registries, + const logging::Logger& logger, + int& copy_node_counter) { bool modified = false; InitializedTensorSet initializers_consumed; // find defs that require copy @@ -137,19 +150,22 @@ bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_regi // For inputs we need to create a copy node only when the input is connected to both provider // and non-provider nodes. Otherwise utils::CopyInputsAcrossDevices() will do the job. 
if (provider_input_defs_.count(arg) && non_provider_input_defs_.count(arg)) { - AddCopyNode(const_cast(arg), true); + AddCopyNode(const_cast(arg), true, logger); + copy_node_counter++; modified = true; } for (auto arg : non_provider_output_defs_) if (provider_input_defs_.count(arg)) { - AddCopyNode(arg, true); + AddCopyNode(arg, true, logger); + copy_node_counter++; modified = true; } for (auto arg : provider_output_defs_) if (non_provider_input_defs_.count(arg)) { - AddCopyNode(arg, false); + AddCopyNode(arg, false, logger); + copy_node_counter++; modified = true; } @@ -176,7 +192,8 @@ bool TransformerMemcpyImpl::ModifyGraph(const KernelRegistryManager& kernel_regi // (the name will be the same as the parent node's implicit input) const auto* node_arg_in_current_graph_level = *provider_input_defs_.find(arg); - AddCopyNode(const_cast(node_arg_in_current_graph_level), true); + AddCopyNode(const_cast(node_arg_in_current_graph_level), true, logger); + copy_node_counter++; modified = true; } } @@ -297,7 +314,7 @@ void TransformerMemcpyImpl::BuildDefsMapping(const onnxruntime::NodeArg* arg, co } } -void TransformerMemcpyImpl::AddCopyNode(onnxruntime::NodeArg* arg, bool is_input) { +void TransformerMemcpyImpl::AddCopyNode(onnxruntime::NodeArg* arg, bool is_input, const logging::Logger& logger) { // create unique name for new def std::string new_def_name = graph_.GenerateNodeArgName(arg->Name() + "_" + provider_); @@ -309,6 +326,9 @@ void TransformerMemcpyImpl::AddCopyNode(onnxruntime::NodeArg* arg, bool is_input std::string new_node_name = graph_.GenerateNodeName("Memcpy"); const auto op_name = is_input ? "MemcpyFromHost" : "MemcpyToHost"; + LOGS(logger, INFO) << "Add " << op_name << (is_input ? " after " : " before ") << arg->Name() + << " for " << provider_; + auto& new_node = graph_.AddNode(new_node_name, op_name, "Copy from/to host memory", std::vector{src_arg}, std::vector{dst_arg}); diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 3d03abf5b7ebc..2ca3b1cdf817e 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -365,7 +365,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 10, Slice); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 11, Dropout); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, IsInf); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 19, IsInf); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 15, float, RoiAlign); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 15, double, RoiAlign); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, ReverseSequence); @@ -682,9 +682,9 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, Ga class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 15, ScatterND); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 15, ScatterElements); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 13, 
Identity); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, IsNaN); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, double, IsNaN); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, MLFloat16, IsNaN); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 19, float, IsNaN); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 19, double, IsNaN); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, 19, MLFloat16, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, bool, NonZero); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, float, NonZero); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 13, int32_t, NonZero); @@ -960,6 +960,18 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 19, Sh // Opset 20 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, ConstantOfShape); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, float, AffineGrid); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, double, AffineGrid); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, float, IsNaN); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, double, IsNaN); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, MLFloat16, IsNaN); +#if !defined(DISABLE_FLOAT8_TYPES) +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FN, IsNaN); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FNUZ, IsNaN); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E5M2, IsNaN); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E5M2FNUZ, IsNaN); +#endif +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, IsInf); // !!PLEASE READ BELOW!! Following that, add new entries above this comment @@ -1492,7 +1504,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { Dropout)>, BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#if !defined(DISABLE_FLOAT8_TYPES) + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#endif + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/cpu/tensor/affine_grid.cc b/onnxruntime/core/providers/cpu/tensor/affine_grid.cc new file mode 100644 index 0000000000000..15900ba553983 --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/affine_grid.cc @@ -0,0 +1,151 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
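// Annotation (summary of the new kernel in this file): AffineGrid is the CPU
// implementation of the opset-20 ONNX operator. From a batch of affine matrices
// `theta` ([N, 2, 3] for 2D, [N, 3, 4] for 3D) and a target `size` of length 4 or 5,
// it produces a sampling grid of shape {N, H, W, 2} or {N, D, H, W, 3}. A base grid
// of normalized coordinates in [-1, 1] is built with Eigen (scaled by (dim - 1) / dim
// when align_corners is 0), each batch entry is mapped as theta_R * grid + theta_T,
// and batches are parallelized with ThreadPool::TryBatchParallelFor.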
+ +#include "core/providers/cpu/tensor/affine_grid.h" + +#include "core/common/common.h" +#include "core/providers/op_kernel_type_control.h" +#include "core/util/math_cpuonly.h" +#include +#include "Eigen/src/Core/Map.h" +#include +#include "core/common/eigen_common_wrapper.h" + +namespace onnxruntime { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_CPU_OPERATOR_TYPED_KERNEL( \ + AffineGrid, \ + 20, \ + T, \ + KernelDefBuilder() \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + AffineGrid); + +REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(double) + +template +void generate_base_grid_2d(int64_t H, int64_t W, bool align_corners, Eigen::Matrix& base_grid) { + Eigen::VectorXf row_vec = Eigen::VectorXf::LinSpaced(static_cast(W), -1, 1); + if (!align_corners) { + row_vec = row_vec * (W - 1) / W; + } + Eigen::VectorXf col_vec = Eigen::VectorXf::LinSpaced(static_cast(H), -1, 1); + if (!align_corners) { + col_vec = col_vec * (H - 1) / H; + } + + base_grid.resize(static_cast(H * W), 2); + for (Eigen::Index j = 0; j < H; j++) { + for (Eigen::Index i = 0; i < W; i++) { + base_grid.row(j * static_cast(W) + i) << row_vec(i), col_vec(j); + } + } +} + +template +void generate_base_grid_3d(int64_t D, int64_t H, int64_t W, bool align_corners, Eigen::Matrix& base_grid) { + Eigen::VectorXf row_vec = Eigen::VectorXf::LinSpaced(static_cast(W), -1, 1); + if (!align_corners) { + row_vec = row_vec * (W - 1) / W; + } + Eigen::VectorXf col_vec = Eigen::VectorXf::LinSpaced(static_cast(H), -1, 1); + if (!align_corners) { + col_vec = col_vec * (H - 1) / H; + } + Eigen::VectorXf slice_vec = Eigen::VectorXf::LinSpaced(static_cast(D), -1, 1); + if (!align_corners) { + slice_vec = slice_vec * (D - 1) / D; + } + + base_grid.resize(static_cast(D * H * W), 3); + for (Eigen::Index k = 0; k < D; k++) { + for (Eigen::Index j = 0; j < H; j++) { + for (Eigen::Index i = 0; i < W; i++) { + base_grid.row(k * static_cast(H * W) + j * static_cast(W) + i) << row_vec(i), col_vec(j), slice_vec(k); + } + } + } +} + +template +void affine_grid_generator_2d(const Tensor* theta, const Eigen::Matrix& base_grid_transposed, int64_t batch_num, int64_t H, int64_t W, Tensor* grid) { + const Eigen::StorageOptions option = Eigen::RowMajor; + auto theta_batch_offset = batch_num * 2 * 3; + const T* theta_data = theta->Data() + theta_batch_offset; + const Eigen::Matrix theta_R{{theta_data[0], theta_data[1]}, {theta_data[3], theta_data[4]}}; + const Eigen::Array theta_T(theta_data[2], theta_data[5]); + + auto grid_batch_offset = batch_num * H * W * 2; + T* grid_data = grid->MutableData() + grid_batch_offset; + Eigen::Map> grid_matrix(grid_data, narrow(H * W), 2); + grid_matrix = ((theta_R * base_grid_transposed).array().colwise() + theta_T).matrix().transpose(); +} + +template +void affine_grid_generator_3d(const Tensor* theta, const Eigen::Matrix& base_grid_transposed, int64_t batch_num, int64_t D, int64_t H, int64_t W, Tensor* grid) { + const Eigen::StorageOptions option = Eigen::RowMajor; + auto theta_batch_offset = batch_num * 3 * 4; + const T* theta_data = theta->Data() + theta_batch_offset; + const Eigen::Matrix theta_R{ + {theta_data[0], theta_data[1], theta_data[2]}, + {theta_data[4], theta_data[5], theta_data[6]}, + {theta_data[8], theta_data[9], theta_data[10]}}; + const Eigen::Array theta_T(theta_data[3], theta_data[7], theta_data[11]); + + auto grid_batch_offset = batch_num * D * H * W * 3; + T* grid_data = grid->MutableData() + grid_batch_offset; + Eigen::Map> 
grid_matrix(grid_data, narrow(D * H * W), 3); + grid_matrix = ((theta_R * base_grid_transposed).array().colwise() + theta_T).matrix().transpose(); +} + +template +Status AffineGrid::Compute(OpKernelContext* context) const { + const Tensor* theta = context->Input(0); + const TensorShape& theta_shape = theta->Shape(); + if (theta_shape.NumDimensions() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "AffineGrid : Input theta tensor dimension is not 3"); + } + + const Tensor* size = context->Input(1); + const TensorShape& size_shape = size->Shape(); + const int64_t* size_data = size->Data(); + + if (size_shape.GetDims()[0] == 4 /*&& get_check_2d_grid_sample_consistency(theta_shape, size_shape, N, C, H, W)*/) { + int64_t N = size_data[0], H = size_data[2], W = size_data[3]; + + TensorShape grid_shape{N, H, W, 2}; + auto grid = context->Output(0, grid_shape); + + Eigen::Matrix base_grid; + generate_base_grid_2d(H, W, align_corners_, base_grid); + Eigen::Matrix base_grid_transposed = base_grid.transpose(); + + std::function fn = [theta, base_grid_transposed, H, W, grid](ptrdiff_t batch_num) { + affine_grid_generator_2d(theta, base_grid_transposed, batch_num, H, W, grid); + }; + + concurrency::ThreadPool::TryBatchParallelFor(context->GetOperatorThreadPool(), narrow(N), std::move(fn), 0); + } else if (size_shape.GetDims()[0] == 5 /*&& get_check_2d_grid_sample_consistency(theta_shape, size_shape, N, C, H, W)*/) { + int64_t N = size_data[0], D = size_data[2], H = size_data[3], W = size_data[4]; + + TensorShape grid_shape{N, D, H, W, 3}; + auto grid = context->Output(0, grid_shape); + + Eigen::Matrix base_grid; + generate_base_grid_3d(D, H, W, align_corners_, base_grid); + Eigen::Matrix base_grid_transposed = base_grid.transpose(); + + std::function fn = [theta, base_grid_transposed, D, H, W, grid](ptrdiff_t batch_num) { + affine_grid_generator_3d(theta, base_grid_transposed, batch_num, D, H, W, grid); + }; + + concurrency::ThreadPool::TryBatchParallelFor(context->GetOperatorThreadPool(), narrow(N), std::move(fn), 0); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "AffineGrid : Invalidate size - length of size should be 4 or 5."); + } + return Status::OK(); +} +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/affine_grid.h b/onnxruntime/core/providers/cpu/tensor/affine_grid.h new file mode 100644 index 0000000000000..5ffe660e986f2 --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/affine_grid.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" + +namespace onnxruntime { + +template +class AffineGrid final : public OpKernel { + public: + AffineGrid(const OpKernelInfo& info) : OpKernel(info) { + int64_t align_corners = info.GetAttrOrDefault("align_corners", 0); + align_corners_ = (align_corners != 0); + } + + Status Compute(OpKernelContext* context) const override; + + private: + bool align_corners_; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/isinf.cc b/onnxruntime/core/providers/cpu/tensor/isinf.cc index bc99caa8036cf..1b449f46927a2 100644 --- a/onnxruntime/core/providers/cpu/tensor/isinf.cc +++ b/onnxruntime/core/providers/cpu/tensor/isinf.cc @@ -14,15 +14,38 @@ namespace onnxruntime { // https://github.com/onnx/onnx/blob/main/docs/Operators.md#IsInf namespace op_kernel_type_control { -ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES_ALL_OPSETS( - kCpuExecutionProvider, kOnnxDomain, IsInf, Input, 0, - float, double); +using IsInfTypesOpset10 = TypeList; + +ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPE_LIST( + kCpuExecutionProvider, kOnnxDomain, IsInf, 10, Input, 0, + IsInfTypesOpset10); + +using IsInfTypesOpset20 = + TypeList< + float, + double +#if !defined(DISABLE_FLOAT8_TYPES) + , + Float8E4M3FN, Float8E4M3FNUZ, Float8E5M2, Float8E5M2FNUZ +#endif + >; + +ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPE_LIST( + kCpuExecutionProvider, + kOnnxDomain, + IsInf, + 20, + Input, + 0, + IsInfTypesOpset20); } // namespace op_kernel_type_control class IsInf final : public OpKernel { public: - using EnabledDataTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain, - IsInf, Input, 0); + using EnabledDataTypes10 = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(kCpuExecutionProvider, kOnnxDomain, + IsInf, 10, Input, 0); + using EnabledDataTypes20 = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(kCpuExecutionProvider, kOnnxDomain, + IsInf, 20, Input, 0); explicit IsInf(const OpKernelInfo& info); Status Compute(OpKernelContext* context) const override; @@ -30,14 +53,25 @@ class IsInf final : public OpKernel { private: int64_t detect_positive_{1}; int64_t detect_negative_{1}; + int opset_; }; -ONNX_CPU_OPERATOR_KERNEL( +ONNX_CPU_OPERATOR_VERSIONED_KERNEL( IsInf, 10, + 19, KernelDefBuilder() .TypeConstraint("T1", - BuildKernelDefConstraintsFromTypeList()) + BuildKernelDefConstraintsFromTypeList()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), + IsInf); + +ONNX_CPU_OPERATOR_KERNEL( + IsInf, + 20, + KernelDefBuilder() + .TypeConstraint("T1", + BuildKernelDefConstraintsFromTypeList()) .TypeConstraint("T2", DataTypeImpl::GetTensorType()), IsInf); @@ -46,6 +80,7 @@ IsInf::IsInf(const OpKernelInfo& info) : OpKernel(info) { ORT_ENFORCE(status.IsOK(), "Failed to obtain detect_positive"); status = info.GetAttr("detect_negative", &detect_negative_); ORT_ENFORCE(status.IsOK(), "Failed to obtain detect_negative"); + opset_ = info.node().SinceVersion(); } namespace isinf_internal { @@ -78,6 +113,49 @@ struct ComputeDispatchTarget { } } }; + +#if !defined(DISABLE_FLOAT8_TYPES) +template <> +struct ComputeDispatchTarget { + void operator()(const Tensor&, Tensor& Y, bool, bool) const { + EigenMap(Y).array() = false; + } +}; + +template <> +struct ComputeDispatchTarget { + void operator()(const Tensor&, Tensor& Y, bool, bool) const { + EigenMap(Y).array() = false; + } +}; + +template <> +struct ComputeDispatchTarget { + void operator()(const Tensor& X, Tensor& Y, bool detect_positive, bool detect_negative) const { + auto& dims = 
X.Shape(); + auto input = ConstEigenVectorMap(static_cast(static_cast(X.Data())), onnxruntime::narrow(dims.Size())); + auto output = EigenMap(Y); + + // S.11111.00 + if (detect_positive && detect_negative) { + output.array() = input.array() == 0b01111100 || input.array() == 0b11111100; + } else if (detect_positive) { + output.array() = input.array() == 0b01111100; + } else if (detect_negative) { + output.array() = input.array() == 0b11111100; + } else { + output.array() = false; + } + } +}; + +template <> +struct ComputeDispatchTarget { + void operator()(const Tensor&, Tensor& Y, bool, bool) const { + EigenMap(Y).array() = false; + } +}; +#endif } // namespace isinf_internal Status IsInf::Compute(OpKernelContext* context) const { @@ -88,8 +166,13 @@ Status IsInf::Compute(OpKernelContext* context) const { using namespace isinf_internal; - utils::MLTypeCallDispatcherFromTypeList dispatcher{X.GetElementType()}; - dispatcher.Invoke(X, Y, detect_positive_ != 0, detect_negative_ != 0); + if (opset_ < 20) { + utils::MLTypeCallDispatcherFromTypeList dispatcher{X.GetElementType()}; + dispatcher.Invoke(X, Y, detect_positive_ != 0, detect_negative_ != 0); + } else { + utils::MLTypeCallDispatcherFromTypeList dispatcher{X.GetElementType()}; + dispatcher.Invoke(X, Y, detect_positive_ != 0, detect_negative_ != 0); + } return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/tensor/isnan.cc b/onnxruntime/core/providers/cpu/tensor/isnan.cc index 33d0f8eb6c1ae..34495e382278a 100644 --- a/onnxruntime/core/providers/cpu/tensor/isnan.cc +++ b/onnxruntime/core/providers/cpu/tensor/isnan.cc @@ -20,10 +20,20 @@ namespace onnxruntime { .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ IsNaN); +#define ADD_TYPED_ISNAN_OP_13(data_type) \ + ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL( \ + IsNaN, \ + 13, 19, \ + data_type, \ + KernelDefBuilder() \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + IsNaN); + #define ADD_TYPED_ISNAN_OP(data_type) \ ONNX_CPU_OPERATOR_TYPED_KERNEL( \ IsNaN, \ - 13, \ + 20, \ data_type, \ KernelDefBuilder() \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ @@ -33,10 +43,20 @@ namespace onnxruntime { ADD_TYPED_ISNAN_OP_9(float); ADD_TYPED_ISNAN_OP_9(double); ADD_TYPED_ISNAN_OP_9(MLFloat16); +ADD_TYPED_ISNAN_OP_13(float); +ADD_TYPED_ISNAN_OP_13(double); +ADD_TYPED_ISNAN_OP_13(MLFloat16); ADD_TYPED_ISNAN_OP(float); ADD_TYPED_ISNAN_OP(double); ADD_TYPED_ISNAN_OP(MLFloat16); +#if !defined(DISABLE_FLOAT8_TYPES) +ADD_TYPED_ISNAN_OP(Float8E4M3FN); +ADD_TYPED_ISNAN_OP(Float8E4M3FNUZ); +ADD_TYPED_ISNAN_OP(Float8E5M2); +ADD_TYPED_ISNAN_OP(Float8E5M2FNUZ); +#endif + template Status IsNaN::Compute(OpKernelContext* context) const { const auto* X_ptr = context->Input(0); @@ -70,4 +90,63 @@ Status IsNaN::Compute(OpKernelContext* context) const { return Status::OK(); } + +#if !defined(DISABLE_FLOAT8_TYPES) +template <> +Status IsNaN::Compute(OpKernelContext* context) const { + const auto* X = context->Input(0); + auto& dims = X->Shape(); + auto& Y = *context->Output(0, dims); + + auto input = ConstEigenVectorMap(static_cast(static_cast(X->Data())), onnxruntime::narrow(dims.Size())); + auto output = EigenMap(Y); + + // S.1111.111 + std::transform(input.begin(), input.end(), output.begin(), [](uint8_t c) { return (c & 0x7f) == 0x7f; }); + return Status::OK(); +} + +template <> +Status IsNaN::Compute(OpKernelContext* context) const { + const auto* X = context->Input(0); + auto X_data = X->Data(); + auto& dims = X->Shape(); + 
auto shape_size = dims.Size(); + auto& Y = *context->Output(0, dims); + + // 1.0000.000 + EigenMap(Y) = + ConstEigenVectorMap(static_cast(static_cast(X_data)), onnxruntime::narrow(shape_size)).array() == 0x80; + + return Status::OK(); +} + +template <> +Status IsNaN::Compute(OpKernelContext* context) const { + const auto* X = context->Input(0); + auto& dims = X->Shape(); + auto& Y = *context->Output(0, dims); + + auto input = ConstEigenVectorMap(static_cast(static_cast(X->Data())), onnxruntime::narrow(dims.Size())); + auto output = EigenMap(Y); + + // S.11111.{01, 10, 11} + std::transform(input.begin(), input.end(), output.begin(), [](uint8_t c) { return ((c & 0x7c) == 0x7c) && ((c & 0x03) != 0x00); }); + return Status::OK(); +} + +template <> +Status IsNaN::Compute(OpKernelContext* context) const { + const auto* X = context->Input(0); + auto X_data = X->Data(); + auto& dims = X->Shape(); + auto shape_size = dims.Size(); + auto& Y = *context->Output(0, dims); + + // 1.0000.000 + EigenMap(Y) = ConstEigenVectorMap(static_cast(static_cast(X_data)), onnxruntime::narrow(shape_size)).array() == 0x80; + + return Status::OK(); +} +#endif } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index e855a515f445a..5f1dbd30f6a3e 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -7,6 +7,25 @@ namespace onnxruntime { +DeferredCpuAllocator::DeferredCpuAllocator(CudaStream& cuda_stream) : cuda_stream_(cuda_stream) { + OrtAllocator::version = ORT_API_VERSION; + OrtAllocator::Alloc = + [](OrtAllocator* this_, size_t size) { + auto self = reinterpret_cast(this_); + return self->cuda_stream_.GetCpuAllocator()->Alloc(size); + }; + OrtAllocator::Free = + [](OrtAllocator* this_, void* p) { + auto self = reinterpret_cast(this_); + self->cuda_stream_.EnqueDeferredCPUBuffer(p); + }; + OrtAllocator::Info = + [](const OrtAllocator* this_) { + auto self = reinterpret_cast(this_); + return &self->cuda_stream_.GetCpuAllocator()->Info(); + }; +} + struct CudaNotification : public synchronize::Notification { CudaNotification(Stream& s) : Notification(s) { CUDA_CALL_THROW(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -46,7 +65,8 @@ CudaStream::CudaStream(cudaStream_t stream, cublasHandle_t external_cublas_handle) : Stream(stream, device), own_stream_(own_flag), cpu_allocator_(cpu_allocator), - release_cpu_buffer_on_cuda_stream_(release_cpu_buffer_on_cuda_stream) { + release_cpu_buffer_on_cuda_stream_(release_cpu_buffer_on_cuda_stream), + deferred_cpu_allocator_(*this) { if (own_flag) { CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream)); @@ -162,6 +182,9 @@ void* CudaStream::GetResource(int version, int id) const { case CudaResource::cublas_handle_t: return reinterpret_cast(cublas_handle_); break; + case CudaResource::deferred_cpu_allocator_t: + return const_cast(&deferred_cpu_allocator_); + break; default: break; } diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.h b/onnxruntime/core/providers/cuda/cuda_stream_handle.h index 9c62b029b7a36..917702fae08f1 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.h +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.h @@ -9,6 +9,13 @@ namespace onnxruntime { +struct CudaStream; + +struct DeferredCpuAllocator : public OrtAllocator { + DeferredCpuAllocator(CudaStream&); + CudaStream& cuda_stream_; +}; + 
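// Annotation: DeferredCpuAllocator adapts the stream's host allocator to the C-style
// OrtAllocator interface. Alloc and Info delegate to CudaStream::GetCpuAllocator(),
// while Free hands the pointer to EnqueDeferredCPUBuffer() so the host staging buffer
// is released with the stream's deferred-buffer list rather than immediately. Kernels
// can obtain it at run time via GetResource(ORT_API_VERSION,
// CudaResource::deferred_cpu_allocator_t), which returns a pointer to this struct.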
struct CudaStream : Stream { CudaStream(cudaStream_t stream, const OrtDevice& device, @@ -36,10 +43,13 @@ struct CudaStream : Stream { void* GetResource(int version, int id) const override; + onnxruntime::IAllocator* GetCpuAllocator() const { return cpu_allocator_.get(); } + private: std::vector deferred_cpu_buffers_; AllocatorPtr cpu_allocator_; bool release_cpu_buffer_on_cuda_stream_{true}; + DeferredCpuAllocator deferred_cpu_allocator_; }; void RegisterCudaStreamHandles(IStreamCommandHandleRegistry& stream_handle_registry, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index 52018500b134c..cdb0338157561 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -3,6 +3,9 @@ #pragma once interface IMLOperatorRegistry; +interface IDMLDevice; +interface ID3D12CommandQueue; +interface ID3D12Resource; #include "core/common/status.h" #include "core/framework/data_transfer.h" @@ -28,7 +31,8 @@ namespace Dml std::unique_ptr CreateExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands = true); + bool enableMetacommands, + bool enableDynamicGraphFusion); ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr); void FlushContext(onnxruntime::IExecutionProvider* provider); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index 04381b6ce355c..074f13b309181 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -7,11 +7,14 @@ #include #include #include +#include #include "core/framework/op_kernel.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlEdgeShapes.h" struct AbstractOperatorDesc; interface IMLOperatorTensor; +interface IDMLOperator; struct DML_INPUT_GRAPH_EDGE_DESC; struct DML_OUTPUT_GRAPH_EDGE_DESC; struct DML_INTERMEDIATE_GRAPH_EDGE_DESC; @@ -92,6 +95,8 @@ namespace Windows::AI::MachineLearning::Adapter const onnxruntime::Node& node, MLOperatorTensorGetter& constantInputGetter, const void* executionHandle, + const EdgeShapes* inputShapesOverrides, + /*out*/ EdgeShapes* outputShapes, /*out*/ DmlGraphNodeCreateInfo* graphNodeCreateInfo )>; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp index ede3e7f2c2257..eb068087de4ad 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp @@ -491,6 +491,8 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( const onnxruntime::Node& node, MLOperatorTensorGetter& constantInputGetter, const void* executionHandle, + const EdgeShapes* inputShapesOverrides, + /*out*/ EdgeShapes* outputShapes, /*out*/ DmlGraphNodeCreateInfo* graphNodeCreateInfo ) { @@ -498,15 +500,15 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( onnxruntime::OpNodeProtoHelper protoHelper(&nodeContext); // Use the same list of required constant inputs for the shape inferrer and the kernel. 
- EdgeShapes outputShapes; - InferAndVerifyOutputSizes(node, &defaultAttributesCapture, shapeInferrerCapture.Get(), constantCpuInputCapture, constantInputGetter, nullptr, outputShapes); + InferAndVerifyOutputSizes(node, &defaultAttributesCapture, shapeInferrerCapture.Get(), constantCpuInputCapture, constantInputGetter, inputShapesOverrides, *outputShapes); // Create the kernel while allowing input shape and output shape queries according to options ComPtr kernelInfoWrapper = wil::MakeOrThrow( &protoHelper, executionHandle, true, - &outputShapes, + inputShapesOverrides, + outputShapes, &defaultAttributesCapture, graphNodeCreateInfo, constantCpuInputCapture, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlEdgeShapes.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlEdgeShapes.h new file mode 100644 index 0000000000000..5ff70493252bd --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlEdgeShapes.h @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace Windows::AI::MachineLearning::Adapter +{ + // edges and unused edges have an empty array of dimensions. + class EdgeShapes + { + public: + EdgeShapes() = default; + + EdgeShapes(size_t count) : m_shapes(count) {} + + const std::vector& GetShape(size_t edgeIndex) const + { + return m_shapes[edgeIndex]; + } + + std::vector& GetMutableShape(size_t edgeIndex) + { + return m_shapes[edgeIndex]; + } + + size_t EdgeCount() const { return m_shapes.size(); } + + void Reset(size_t edge_count) + { + m_shapes.clear(); + m_shapes.resize(edge_count); + } + + bool operator!=(const EdgeShapes& other) const noexcept + { + return (m_shapes != other.m_shapes); + } + + private: + std::vector> m_shapes; + }; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index 51b93efb3a646..cd74e7fa92940 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -1,7 +1,7 @@ #pragma once #include "DmlGraphFusionHelper.h" - +#include "DmlRuntimeFusedGraphKernel.h" namespace Dml { @@ -501,5 +501,171 @@ namespace DmlGraphFusionHelper graph.FinalizeFuseSubGraph(indexedSubGraph, fusedNode); } + + void RegisterDynamicKernel( + onnxruntime::Graph& graph, + onnxruntime::KernelRegistry* registryForPartitionKernels, + const ExecutionProviderImpl* providerImpl, + std::unordered_map graphNodePropertyMap, + const std::unordered_set& dynamicCpuInputMap, + std::shared_ptr indexedSubGraph, + std::unordered_map>&& isInitializerTransferable) + { + struct NodeInfo + { + std::string name; + std::string opType; + std::string description; + std::string domain; + onnxruntime::NodeAttributes attributes; + std::vector inputDefPointers; + std::vector outputDefPointers; + }; + + auto partitionNodePropsMap = DmlGraphFusionHelper::CreatePartitionNodePropsMap( + graph, + *indexedSubGraph, + std::move(graphNodePropertyMap)); + + auto modelPath = graph.ModelPath(); + + const gsl::span subGraphInputArgNames = indexedSubGraph->GetMetaDef()->inputs; + const gsl::span subGraphOutputArgNames = indexedSubGraph->GetMetaDef()->outputs; + + std::vector nodesInfo; + nodesInfo.reserve(indexedSubGraph->nodes.size()); + + std::vector subgraphInputs; + subgraphInputs.reserve(subGraphInputArgNames.size()); + + std::vector 
subgraphOutputs; + subgraphOutputs.reserve(subGraphOutputArgNames.size()); + + std::vector nodeAttributes; + nodeAttributes.reserve(indexedSubGraph->nodes.size()); + + std::vector> intermediateNodeArgs; + + for (size_t sortedNodeIndex : indexedSubGraph->nodes) + { + auto node = graph.GetNode(sortedNodeIndex); + + nodeAttributes.push_back(node->GetAttributes()); + + NodeInfo nodeInfo{}; + nodeInfo.name = node->Name(); + nodeInfo.opType = node->OpType(); + nodeInfo.description = node->Description(); + nodeInfo.domain = node->Domain(); + nodeInfo.attributes = node->GetAttributes(); + nodeInfo.inputDefPointers.reserve(node->InputDefs().size()); + nodeInfo.outputDefPointers.reserve(node->OutputDefs().size()); + + for (const onnxruntime::NodeArg* inputDef : node->InputDefs()) + { + intermediateNodeArgs.emplace_back(std::make_shared(inputDef->Name(), inputDef->TypeAsProto())); + nodeInfo.inputDefPointers.push_back(intermediateNodeArgs.back().get()); + } + + for (const onnxruntime::NodeArg* outputDef : node->OutputDefs()) + { + intermediateNodeArgs.emplace_back(std::make_shared(outputDef->Name(), outputDef->TypeAsProto())); + nodeInfo.outputDefPointers.push_back(intermediateNodeArgs.back().get()); + } + + nodesInfo.push_back(std::move(nodeInfo)); + } + + for (const std::string& graphInputName : subGraphInputArgNames) + { + subgraphInputs.push_back(graph.GetNodeArg(graphInputName)); + } + + for (const std::string& graphOutputName : subGraphOutputArgNames) + { + subgraphOutputs.push_back(graph.GetNodeArg(graphOutputName)); + } + + // We need to keep the initializers alive since they will be freed once the nodes are removed from the graph + std::vector ownedInitializers; + ownedInitializers.reserve(isInitializerTransferable.size()); + + for (auto& kvp : isInitializerTransferable) + { + ONNX_NAMESPACE::TensorProto tensorProto; + tensorProto.set_data_type(kvp.second.first->data_type()); + tensorProto.set_raw_data(kvp.second.first->raw_data()); + tensorProto.set_name(kvp.second.first->name()); + + for (int i = 0; i < kvp.second.first->dims_size(); ++i) + { + tensorProto.add_dims(kvp.second.first->dims(i)); + } + ownedInitializers.push_back(std::move(tensorProto)); + kvp.second.first = &ownedInitializers.back(); + } + + // lamda captures for the kernel registration + auto fused_kernel_func = [ + indexedSubGraph, + &modelPath, + nodesInfo = std::move(nodesInfo), + intermediateNodeArgs = std::move(intermediateNodeArgs), + subgraphInputs = std::move(subgraphInputs), + subgraphOutputs = std::move(subgraphOutputs), + partitionNodePropsMap = std::move(partitionNodePropsMap), + ownedInitializers = std::move(ownedInitializers)] (onnxruntime::FuncManager& func_mgr, const onnxruntime::OpKernelInfo& info, std::unique_ptr& out) mutable ->onnxruntime::Status + { + std::vector> subgraphNodes; + subgraphNodes.reserve(nodesInfo.size()); + + for (const NodeInfo& nodeInfo : nodesInfo) + { + subgraphNodes.emplace_back(std::make_shared( + nodeInfo.name, + nodeInfo.opType, + nodeInfo.description, + nodeInfo.inputDefPointers, + nodeInfo.outputDefPointers, + &nodeInfo.attributes, + nodeInfo.domain)); + } + + out.reset(CreateRuntimeFusedGraphKernel( + info, + indexedSubGraph, + modelPath, + std::move(subgraphNodes), + std::move(subgraphInputs), + std::move(subgraphOutputs), + std::move(intermediateNodeArgs), + std::move(partitionNodePropsMap), + std::move(ownedInitializers))); + return Status::OK(); + }; + + // build the kernel definition on the fly, and register it to the fused_kernel_regisitry. 
+ onnxruntime::KernelDefBuilder builder; + builder.SetName(indexedSubGraph->GetMetaDef()->name) + .SetDomain(indexedSubGraph->GetMetaDef()->domain) + .SinceVersion(indexedSubGraph->GetMetaDef()->since_version) + .Provider(onnxruntime::kDmlExecutionProvider); + + // Force the CPU inputs to be allocated on the CPU + for (int i = 0; i < subGraphInputArgNames.size(); ++i) + { + if (dynamicCpuInputMap.find(subGraphInputArgNames[i]) != dynamicCpuInputMap.end()) + { + builder.InputMemoryType(OrtMemTypeCPUInput, i); + } + } + + ORT_THROW_IF_ERROR(registryForPartitionKernels->Register(builder, fused_kernel_func)); + + auto& fusedNode = graph.BeginFuseSubGraph(*indexedSubGraph, indexedSubGraph->GetMetaDef()->name); + fusedNode.SetExecutionProviderType(onnxruntime::kDmlExecutionProvider); + + graph.FinalizeFuseSubGraph(*indexedSubGraph, fusedNode); + } } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h index 030cffc2a8794..f8f6162aaa1e0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h @@ -80,5 +80,14 @@ namespace DmlGraphFusionHelper std::vector&& isInputsUploadedByDmlEP, const GraphDescBuilder::GraphDesc& graphDesc, Microsoft::WRL::ComPtr compiledExecutionPlanOperator); + + void RegisterDynamicKernel( + onnxruntime::Graph& graph, + onnxruntime::KernelRegistry* registryForPartitionKernels, + const ExecutionProviderImpl* providerImpl, + std::unordered_map graphNodePropertyMap, + const std::unordered_set& dynamicCpuInputMap, + std::shared_ptr indexedSubGraph, + std::unordered_map>&& isInitializerTransferable); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp index 4813707cdf50c..679738b639ec9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp @@ -15,6 +15,18 @@ namespace Dml { + namespace + { + struct CompiledPartitionInfo + { + Microsoft::WRL::ComPtr compiledOperator; + onnxruntime::IndexedSubGraph indexedSubGraph; + std::vector isInputsUploadedByDmlEP; + GraphDescBuilder::GraphDesc graphDesc; + std::unordered_map> isInitializerTransferable; + }; + } + DmlGraphFusionTransformer::DmlGraphFusionTransformer( const std::string& name, const onnxruntime::IExecutionProvider* provider @@ -24,15 +36,6 @@ namespace Dml { } - struct CompiledPartitionInfo - { - Microsoft::WRL::ComPtr compiledOperator; - onnxruntime::IndexedSubGraph indexedSubGraph; - std::vector isInputsUploadedByDmlEP; - GraphDescBuilder::GraphDesc graphDesc; - std::unordered_map> isInitializerTransferable; - }; - onnxruntime::common::Status DmlGraphFusionTransformer::ApplyImpl( onnxruntime::Graph& graph, bool& modified, @@ -87,6 +90,7 @@ namespace Dml { // Initializers needed by any graph partition std::unordered_set requiredInitializerMap; + std::unordered_set dynamicCpuInputMap; std::unordered_map graphNodePropertyMap; onnxruntime::GraphViewer graphViewer(graph); std::vector> partitions = BuildPartitions( @@ -96,8 +100,10 @@ namespace Dml m_providerImpl->GetSupportedDeviceDataTypeMask(), graphNodePropertyMap, requiredInitializerMap, + dynamicCpuInputMap, additionalSplittingNodes, - implicitInputDefs); + 
implicitInputDefs, + false); // Reset the splitting nodes for the current iteration additionalSplittingNodes.clear(); @@ -190,17 +196,48 @@ namespace Dml std::move(graphNodePropertyMap)); // Convert partitionONNXGraph into DML EP GraphDesc + auto modelPath = graph.ModelPath(); + + const gsl::span subGraphInputArgNames = indexedSubGraph.GetMetaDef()->inputs; + const gsl::span subGraphOutputArgNames = indexedSubGraph.GetMetaDef()->outputs; + + std::vector subgraphNodes; + subgraphNodes.reserve(indexedSubGraph.nodes.size()); + + std::vector subgraphInputs; + subgraphInputs.reserve(subGraphInputArgNames.size()); + + std::vector subgraphOutputs; + subgraphOutputs.reserve(subGraphOutputArgNames.size()); + + for (size_t sortedNodeIndex : indexedSubGraph.nodes) + { + subgraphNodes.push_back(graph.GetNode(sortedNodeIndex)); + } + + for (const std::string& graphInputName : subGraphInputArgNames) + { + subgraphInputs.push_back(graph.GetNodeArg(graphInputName)); + } + + for (const std::string& graphOutputName : subGraphOutputArgNames) + { + subgraphOutputs.push_back(graph.GetNodeArg(graphOutputName)); + } + ComPtr device; ORT_THROW_IF_FAILED(m_providerImpl->GetDmlDevice(device.GetAddressOf())); GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc( isInputsUploadedByDmlEP.data(), isInputsUploadedByDmlEP.size(), isInitializerTransferable, - graph, - indexedSubGraph, partitionNodePropsMap, device.Get(), - m_providerImpl); + m_providerImpl, + modelPath, + subgraphNodes, + subgraphInputs, + subgraphOutputs); // Compile the operator auto compiledPartition = DmlGraphFusionHelper::TryCreateCompiledOperator( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp new file mode 100644 index 0000000000000..1db22ac92e527 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp @@ -0,0 +1,369 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
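For reference, the EdgeShapes helper introduced in DmlEdgeShapes.h earlier in this diff is what carries per-edge dimensions through the new dynamic-fusion path (input shape overrides, inferred output shapes, graph output shapes). The trimmed standalone copy below shows how it is used; the element type is assumed to be uint32_t here.

#include <cstdint>
#include <iostream>
#include <vector>

class EdgeShapes {
public:
    EdgeShapes() = default;
    explicit EdgeShapes(size_t count) : m_shapes(count) {}
    const std::vector<uint32_t>& GetShape(size_t i) const { return m_shapes[i]; }
    std::vector<uint32_t>& GetMutableShape(size_t i) { return m_shapes[i]; }
    size_t EdgeCount() const { return m_shapes.size(); }
    void Reset(size_t count) { m_shapes.clear(); m_shapes.resize(count); }
    bool operator!=(const EdgeShapes& other) const noexcept { return m_shapes != other.m_shapes; }
private:
    std::vector<std::vector<uint32_t>> m_shapes;
};

int main() {
    EdgeShapes shapes(2);                       // one entry per graph edge
    shapes.GetMutableShape(0) = {1, 3, 224, 224};
    shapes.GetMutableShape(1) = {};             // non-tensor/unused edges stay empty
    std::cout << shapes.EdgeCount() << " edges, edge 0 rank "
              << shapes.GetShape(0).size() << "\n";
    return 0;
}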
+ +#include "precomp.h" + +#include "core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h" + +using namespace Windows::AI::MachineLearning::Adapter; + +namespace Dml +{ + class DmlRuntimeFusedGraphKernel : public onnxruntime::OpKernel + { + public: + DmlRuntimeFusedGraphKernel() = delete; + + DmlRuntimeFusedGraphKernel( + const onnxruntime::OpKernelInfo& kernelInfo, + std::shared_ptr indexedSubGraph, + const onnxruntime::Path& modelPath, + std::vector>&& subgraphNodes, + std::vector&& subgraphInputs, + std::vector&& subgraphOutputs, + std::vector>&& intermediateNodeArgs, + std::unordered_map&& partitionNodePropsMap, + std::vector&& ownedInitializers) + : OpKernel(kernelInfo), + m_indexedSubGraph(std::move(indexedSubGraph)), + m_modelPath(modelPath), + m_subgraphNodes(std::move(subgraphNodes)), + m_subgraphInputs(std::move(subgraphInputs)), + m_subgraphOutputs(std::move(subgraphOutputs)), + m_intermediateNodeArgs(std::move(intermediateNodeArgs)), + m_partitionNodePropsMap(std::move(partitionNodePropsMap)), + m_ownedInitializers(std::move(ownedInitializers)) + { + for (const auto& initializer : m_ownedInitializers) + { + m_isInitializerTransferable[initializer.name()] = std::make_pair(&initializer, false); + } + + // Get the execution provider interfaces + auto executionHandle = kernelInfo.GetExecutionProvider()->GetExecutionHandle(); + if (executionHandle) + { + // We assume the execution object inherits IUnknown as its first base + ComPtr providerExecutionObject = const_cast(static_cast(executionHandle)); + + // Get the WinML-specific execution provider interface from the execution object. + ORT_THROW_IF_FAILED(providerExecutionObject.As(&m_provider)); + ORT_THROW_IF_FAILED(providerExecutionObject.As(&m_winmlProvider)); + } + + m_subgraphNodePointers.reserve(m_subgraphNodes.size()); + + for (auto& subgraphNode : m_subgraphNodes) + { + m_subgraphNodePointers.push_back(subgraphNode.get()); + } + } + + void TranslateAndCompileGraph( + const onnxruntime::OpKernelInfo& kernelInfo, + std::vector>& initializeResourceRefs, + std::vector initInputBindings) const + { + // Allocate a persistent resource and initialize the operator + UINT64 persistentResourceSize = m_compiledExecutionPlanOperator->GetBindingProperties().PersistentResourceSize; + if (persistentResourceSize > 0) + { + ORT_THROW_IF_FAILED(m_provider->AllocatePooledResource( + static_cast(persistentResourceSize), + AllocatorRoundingMode::Disabled, + m_persistentResource.ReleaseAndGetAddressOf(), + m_persistentResourceAllocatorUnk.ReleaseAndGetAddressOf())); + + m_persistentResourceBinding = DML_BUFFER_BINDING { m_persistentResource.Get(), 0, persistentResourceSize }; + } + + ORT_THROW_IF_FAILED(m_provider->InitializeOperator( + m_compiledExecutionPlanOperator.Get(), + m_persistentResourceBinding ? 
&*m_persistentResourceBinding : nullptr, + gsl::make_span(initInputBindings))); + + std::for_each( + initializeResourceRefs.begin(), + initializeResourceRefs.end(), + [&](ComPtr& resource){ m_winmlProvider->QueueReference(WRAP_GRAPHICS_UNKNOWN(resource).Get()); } + ); + } + + onnxruntime::Status Compute(onnxruntime::OpKernelContext* kernelContext) const override + { + ORT_THROW_HR_IF(E_UNEXPECTED, m_subgraphInputs.size() != kernelContext->InputCount()); + + bool recompileNeeded = m_compiledExecutionPlanOperator == nullptr; + + for (int inputIndex = 0; inputIndex < kernelContext->InputCount(); ++inputIndex) + { + const auto& input = kernelContext->RequiredInput(inputIndex); + const std::string& inputName = m_subgraphInputs[inputIndex]->Name(); + auto shapeIter = m_inferredInputShapes.find(inputName); + + if (shapeIter == m_inferredInputShapes.end()) + { + m_inferredInputShapes[inputName] = input.Shape(); + recompileNeeded = true; + } + else if (shapeIter->second != input.Shape()) + { + shapeIter->second = input.Shape(); + recompileNeeded = true; + } + + // If we have CPU inputs that are not initializers (i.e. they were computed at runtime), add them to the initializer list + if (input.Location().device.Type() == OrtDevice::CPU) + { + auto inputProto = onnxruntime::utils::TensorToTensorProto(input, inputName); + + // We can only avoid recompiling the graph when all CPU inputs are identical + auto initializerIter = m_isInitializerTransferable.find(inputName); + + if (initializerIter != m_isInitializerTransferable.end()) + { + if (initializerIter->second.first->raw_data().length() == inputProto.raw_data().length()) + { + for (int i = 0; i < inputProto.raw_data().length(); ++i) + { + if (initializerIter->second.first->raw_data()[i] != inputProto.raw_data()[i]) + { + recompileNeeded = true; + break; + } + } + } + else + { + recompileNeeded = true; + } + } + else + { + recompileNeeded = true; + } + + m_ownedCpuInputs.push_back(std::make_unique(std::move(inputProto))); + m_isInitializerTransferable[inputName] = std::make_pair(m_ownedCpuInputs.back().get(), false); + } + } + + if (recompileNeeded) + { + // Go through all the node args and replace their shapes with the real ones + for (auto& nodeArg : m_intermediateNodeArgs) + { + auto iter = m_inferredInputShapes.find(nodeArg->Name()); + if (iter != m_inferredInputShapes.end()) + { + auto tensorShape = *nodeArg->Shape(); + ORT_THROW_HR_IF(E_UNEXPECTED, tensorShape.dim_size() != iter->second.NumDimensions()); + + for (int i = 0; i < tensorShape.dim_size(); ++i) + { + tensorShape.mutable_dim(i)->set_dim_value(iter->second.GetDims()[i]); + } + + nodeArg->SetShape(tensorShape); + } + } + + // Populate input bindings for operator initialization + const uint32_t fusedNodeInputCount = gsl::narrow_cast(m_indexedSubGraph->GetMetaDef()->inputs.size()); + std::vector> initializeResourceRefs; // For lifetime control + std::vector initInputBindings(fusedNodeInputCount); + std::vector isInputsUploadedByDmlEP(fusedNodeInputCount); + auto providerImpl = static_cast(Info().GetExecutionProvider())->GetImpl(); + + // Convert partitionONNXGraph into DML EP GraphDesc + ComPtr device; + ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf())); + GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc( + isInputsUploadedByDmlEP.data(), + isInputsUploadedByDmlEP.size(), + m_isInitializerTransferable, + m_partitionNodePropsMap, + device.Get(), + providerImpl, + m_modelPath, + m_subgraphNodePointers, + m_subgraphInputs, + m_subgraphOutputs); + 
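Compute() above decides whether to rebuild the DML graph by caching the last shape seen for each subgraph input (m_inferredInputShapes) and flagging a recompile when an input is new or its shape has changed. A minimal standalone restatement of that check, using an invented UpdateShapeCache helper and plain std::vector<int64_t> shapes:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

using Shape = std::vector<int64_t>;

bool UpdateShapeCache(std::unordered_map<std::string, Shape>& cache,
                      const std::string& inputName, const Shape& shape) {
    auto it = cache.find(inputName);
    if (it == cache.end()) {        // first time this input is seen -> must compile
        cache[inputName] = shape;
        return true;
    }
    if (it->second != shape) {      // shape changed since the last run -> recompile
        it->second = shape;
        return true;
    }
    return false;                   // same shape, the compiled plan can be reused
}

int main() {
    std::unordered_map<std::string, Shape> cache;
    bool recompile = UpdateShapeCache(cache, "input_0", {1, 3, 224, 224});
    recompile = UpdateShapeCache(cache, "input_0", {1, 3, 224, 224});   // false: unchanged
    recompile = UpdateShapeCache(cache, "input_0", {2, 3, 224, 224});   // true: triggers recompile
    std::cout << std::boolalpha << recompile << "\n";
    return 0;
}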
+ m_outputShapes = graphDesc.outputShapes; + + // Walk through each graph edge and mark used inputs + m_inputsUsed.resize(fusedNodeInputCount, false); + for (const DML_INPUT_GRAPH_EDGE_DESC& edge : graphDesc.inputEdges) + { + m_inputsUsed[edge.GraphInputIndex] = true; + } + + // Compile the operator + m_compiledExecutionPlanOperator = DmlGraphFusionHelper::TryCreateCompiledOperator( + graphDesc, + *m_indexedSubGraph, + providerImpl); + + // Queue references to objects which must be kept alive until resulting GPU work completes + m_winmlProvider->QueueReference(m_compiledExecutionPlanOperator.Get()); + + TranslateAndCompileGraph( + Info(), + initializeResourceRefs, + initInputBindings); + } + + // Wrap tensors as required by Dml::IExecutionProvider::ExecuteOperator + OpKernelContextWrapper contextWrapper( + kernelContext, + Info().GetExecutionProvider(), + true, + nullptr); + + ORT_THROW_IF_FAILED(m_provider->AddUAVBarrier()); + + // Get input resources for execution, excluding those which were specified as owned by DML and provided + // at initialization instead. + std::vector> inputTensors(kernelContext->InputCount()); + std::vector inputPtrs(kernelContext->InputCount()); + + for (int i = 0; i < kernelContext->InputCount(); ++i) + { + if (!m_inputsUsed[i]) + { + continue; + } + + ORT_THROW_IF_FAILED(contextWrapper.GetInputTensor(i, inputTensors[i].GetAddressOf())); + inputPtrs[i] = m_provider->DecodeResource(MLOperatorTensor(inputTensors[i].Get()).GetDataInterface().Get()); + } + + auto outputTensors = contextWrapper.GetOutputTensors(m_outputShapes); + ExecuteOperator( + m_compiledExecutionPlanOperator.Get(), + m_persistentResourceBinding ? &*m_persistentResourceBinding : nullptr, + inputPtrs, + outputTensors); + + ORT_THROW_IF_FAILED(m_provider->AddUAVBarrier()); + + return onnxruntime::Status::OK(); + } + + void ExecuteOperator( + IDMLCompiledOperator* op, + _In_opt_ const DML_BUFFER_BINDING* persistentResourceBinding, + gsl::span inputTensors, + gsl::span outputTensors) const + { + auto FillBindingsFromTensors = [this](auto& bufferBindings, auto& bindingDescs, gsl::span& tensors) + { + for (IMLOperatorTensor* tensor : tensors) + { + if (tensor) + { + assert(tensor->IsDataInterface()); + ID3D12Resource* resource = m_provider->DecodeResource(MLOperatorTensor(tensor).GetDataInterface().Get()); + D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); + bufferBindings.push_back({ resource, 0, resourceDesc.Width }); + bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); + } + else + { + bufferBindings.push_back({ nullptr, 0, 0 }); + bindingDescs.push_back({ DML_BINDING_TYPE_NONE, nullptr }); + } + } + }; + + auto FillBindingsFromBuffers = [](auto& bufferBindings, auto& bindingDescs, gsl::span& resources) + { + for (ID3D12Resource* resource : resources) + { + if (resource) + { + D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc(); + bufferBindings.push_back({ resource, 0, resourceDesc.Width }); + bindingDescs.push_back({ DML_BINDING_TYPE_BUFFER, &bufferBindings.back() }); + } + else + { + bufferBindings.push_back({ nullptr, 0, 0 }); + bindingDescs.push_back({ DML_BINDING_TYPE_NONE, nullptr }); + } + } + }; + + std::vector inputBufferBindings; + inputBufferBindings.reserve(inputTensors.size()); + std::vector inputBindings; + inputBindings.reserve(inputTensors.size()); + FillBindingsFromBuffers(inputBufferBindings, inputBindings, inputTensors); + + std::vector outputBufferBindings; + outputBufferBindings.reserve(outputTensors.size()); + std::vector outputBindings; + 
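One detail worth noting in ExecuteOperator above: the binding descriptors store pointers into the buffer-binding vectors (&bufferBindings.back()), so the reserve() calls are what keep those pointers valid while the vectors grow. The sketch below reproduces that pattern with hypothetical Binding/BindingDesc structs standing in for DML_BUFFER_BINDING/DML_BINDING_DESC.

#include <cstdint>
#include <iostream>
#include <vector>

struct Binding { const void* resource; uint64_t offset; uint64_t size; };
struct BindingDesc { bool isBuffer; const Binding* binding; };

int main() {
    int dummy = 0;
    std::vector<const void*> resources = {&dummy, nullptr};   // second slot is unbound

    std::vector<Binding> bufferBindings;
    std::vector<BindingDesc> bindingDescs;
    bufferBindings.reserve(resources.size());   // keeps &bufferBindings.back() stable
    bindingDescs.reserve(resources.size());

    for (const void* resource : resources) {
        if (resource) {
            bufferBindings.push_back({resource, 0, 256});      // 256 is a made-up byte size
            bindingDescs.push_back({true, &bufferBindings.back()});
        } else {
            bufferBindings.push_back({nullptr, 0, 0});
            bindingDescs.push_back({false, nullptr});          // DML_BINDING_TYPE_NONE equivalent
        }
    }

    std::cout << "descs: " << bindingDescs.size() << "\n";     // prints 2
    return 0;
}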
outputBindings.reserve(outputTensors.size()); + FillBindingsFromTensors(outputBufferBindings, outputBindings, outputTensors); + + ORT_THROW_IF_FAILED(m_provider->ExecuteOperator( + op, + persistentResourceBinding, + inputBindings, + outputBindings)); + } + + private: + ComPtr m_winmlProvider; + ComPtr m_provider; + + mutable std::optional m_persistentResourceBinding; + std::shared_ptr m_indexedSubGraph; + const onnxruntime::Path& m_modelPath; + + std::vector> m_subgraphNodes; + std::vector m_subgraphInputs; + std::vector m_subgraphOutputs; + mutable std::vector> m_intermediateNodeArgs; + std::unordered_map m_partitionNodePropsMap; + std::vector m_ownedInitializers; + mutable std::unordered_map> m_isInitializerTransferable; + std::vector m_subgraphNodePointers; + + // Bindings from previous executions of a re-used command list + mutable std::vector> m_ownedCpuInputs; + mutable ComPtr m_compiledExecutionPlanOperator; + mutable std::vector m_inputsUsed; + mutable ComPtr m_persistentResource; + mutable ComPtr m_persistentResourceAllocatorUnk; // Controls when the persistent resource is returned to the allocator + mutable Windows::AI::MachineLearning::Adapter::EdgeShapes m_outputShapes; + mutable std::unordered_map m_inferredInputShapes; + }; + + onnxruntime::OpKernel* CreateRuntimeFusedGraphKernel( + const onnxruntime::OpKernelInfo& info, + std::shared_ptr indexedSubGraph, + const onnxruntime::Path& modelPath, + std::vector>&& subgraphNodes, + std::vector&& subgraphInputs, + std::vector&& subgraphOutputs, + std::vector>&& intermediateNodeArgs, + std::unordered_map&& partitionNodePropsMap, + std::vector&& ownedInitializers) + { + return new DmlRuntimeFusedGraphKernel( + info, + std::move(indexedSubGraph), + modelPath, + std::move(subgraphNodes), + std::move(subgraphInputs), + std::move(subgraphOutputs), + std::move(intermediateNodeArgs), + std::move(partitionNodePropsMap), + std::move(ownedInitializers) + ); + } +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.h new file mode 100644 index 0000000000000..d679c5aa5667c --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
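Besides shape changes, Compute() above also refuses to reuse a compiled plan when a CPU-resident input that was folded into the graph as a constant has changed: it compares the cached TensorProto raw_data() against the freshly captured bytes. A standalone restatement, with std::vector<uint8_t> standing in for the raw_data() payloads:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

bool CpuInputUnchanged(const std::vector<uint8_t>& cached, const std::vector<uint8_t>& fresh) {
    // Any length or byte difference means the constant baked into the compiled DML
    // graph is stale, so the kernel must recompile before executing.
    return cached.size() == fresh.size() &&
           std::equal(cached.begin(), cached.end(), fresh.begin());
}

int main() {
    const std::vector<uint8_t> cached = {1, 0, 0, 0};
    std::cout << std::boolalpha
              << CpuInputUnchanged(cached, {1, 0, 0, 0}) << " "    // true: plan reused
              << CpuInputUnchanged(cached, {2, 0, 0, 0}) << "\n";  // false: recompile
    return 0;
}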
+ +#include "core/framework/op_kernel.h" +#include "GraphDescBuilder.h" +#include "DmlRuntimeGraphFusionTransformer.h" + +namespace Dml +{ + onnxruntime::OpKernel* CreateRuntimeFusedGraphKernel( + const onnxruntime::OpKernelInfo& info, + std::shared_ptr indexedSubGraph, + const onnxruntime::Path& modelPath, + std::vector>&& subgraphNodes, + std::vector&& subgraphInputs, + std::vector&& subgraphOutputs, + std::vector>&& intermediateNodeArgs, + std::unordered_map&& partitionNodePropsMap, + std::vector&& ownedInitializers + ); +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeGraphFusionTransformer.cpp new file mode 100644 index 0000000000000..6318b0d5e2865 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeGraphFusionTransformer.cpp @@ -0,0 +1,161 @@ +#pragma once + +#include "precomp.h" +#include "GraphDescBuilder.h" +#include "ExecutionProvider.h" +#include "DmlRuntimeGraphFusionTransformer.h" +#include "GraphPartitioner.h" +#include "core/framework/kernel_type_str_resolver.h" +#include "core/framework/kernel_lookup.h" +#include "core/optimizer/constant_sharing.h" +#include "DmlRuntimeFusedGraphKernel.h" +#include "MLOperatorAuthorImpl.h" +#include "DmlGraphFusionHelper.h" + +namespace Dml +{ + namespace + { + struct CompiledPartitionInfo + { + std::shared_ptr indexedSubGraph; + std::unordered_map> isInitializerTransferable; + }; + } + + DmlRuntimeGraphFusionTransformer::DmlRuntimeGraphFusionTransformer( + const std::string& name, + const onnxruntime::IExecutionProvider* provider + ) + :onnxruntime::GraphTransformer(name), + m_providerImpl(static_cast(provider)->GetImpl()) + { + } + + onnxruntime::common::Status DmlRuntimeGraphFusionTransformer::ApplyImpl( + onnxruntime::Graph& graph, + bool& modified, + int graphLevel, + const onnxruntime::logging::Logger& logger) const + { + return ApplyImplHelper(graph, modified, graphLevel, logger, {}); + } + + onnxruntime::common::Status DmlRuntimeGraphFusionTransformer::ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graphLevel, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const + { + onnxruntime::ProviderType providerType = onnxruntime::kDmlExecutionProvider; + const gsl::not_null registry = m_providerImpl->GetKernelRegistry().get(); + const auto kernelTypeStrResolver = onnxruntime::OpSchemaKernelTypeStrResolver{}; + const auto kernelLookup = onnxruntime::KernelLookup( + providerType, + gsl::make_span(®istry, 1), + kernelTypeStrResolver); + + onnxruntime::GraphViewer graphViewer(graph); + const auto& nodeTopologyList = graphViewer.GetNodesInTopologicalOrder(); + + for (auto nodeIndex : nodeTopologyList) + { + auto* node = graph.GetNode(nodeIndex); + if (!node) + { + continue; // node was removed + } + + std::unordered_map subgraphImplicitInputDefs; + for (const onnxruntime::NodeArg* inputDef : node->ImplicitInputDefs()) + { + subgraphImplicitInputDefs[inputDef->Name()] = inputDef; + } + + for (auto& entry : node->GetAttributeNameToMutableSubgraphMap()) + { + auto& subgraph = *entry.second; + ORT_RETURN_IF_ERROR(ApplyImplHelper(subgraph, modified, graphLevel + 1, logger, subgraphImplicitInputDefs)); + } + } + + // Initializers needed by any graph partition + std::vector additionalSplittingNodes; + std::unordered_map graphNodePropertyMap; + std::unordered_set requiredInitializerMap; + std::unordered_set 
dynamicCpuInputMap; + std::vector> partitions = BuildPartitions( + graphViewer, + *m_providerImpl->GetInternalRegistrationInfoMap(), + kernelLookup, + m_providerImpl->GetSupportedDeviceDataTypeMask(), + graphNodePropertyMap, + requiredInitializerMap, + dynamicCpuInputMap, + additionalSplittingNodes, + implicitInputDefs, + true); + + // Reset the splitting nodes for the current iteration + additionalSplittingNodes.clear(); + + // Reset the compiled operators for the current iteration + std::vector> compiledPartitionInfos(partitions.size()); + + // Create a map between each initialized tensor and the partition(s) it is part of. + auto initializerPartitionMap = DmlGraphFusionHelper::GetInitializerToPartitionMap(graphViewer, partitions); + + for (uint32_t partitionIndex = 0; partitionIndex < partitions.size(); ++partitionIndex) + { + auto& partition = partitions[partitionIndex]; + + if (partition->GetRootMergedPartition() != partition.get() || + !partition->IsDmlPartition()) + { + continue; + } + + if (partition->IsDmlGraphPartition()) + { + std::unordered_map> isInitializerTransferable; + + std::string partitionKernelPrefix = std::to_string(m_providerImpl->GetPartitionKernelPrefixVal()) + "_"; + m_providerImpl->IncreasePartitionKernelPrefixVal(); + + // populate isInitializerTransferable + for (const auto& input : partition->GetInputs()) + { + const onnx::TensorProto* tensor = nullptr; + if (graph.GetInitializedTensor(input, tensor) && requiredInitializerMap.find(input) != requiredInitializerMap.end()) + { + isInitializerTransferable[input] = {tensor, false}; + } + } + + compiledPartitionInfos[partitionIndex] = std::make_shared(); + compiledPartitionInfos[partitionIndex]->indexedSubGraph = std::make_shared( + DmlGraphFusionHelper::CreateIndexedSubGraph(partition.get(), partitionIndex, partitionKernelPrefix)); + compiledPartitionInfos[partitionIndex]->isInitializerTransferable = std::move(isInitializerTransferable); + } + } + + for (auto&& compiledPartitionInfo : compiledPartitionInfos) + { + // Null compiled operators were not DML partitions + if (compiledPartitionInfo) + { + DmlGraphFusionHelper::RegisterDynamicKernel( + graph, + m_providerImpl->GetKernelRegistry().get(), + m_providerImpl, + graphNodePropertyMap, + dynamicCpuInputMap, + std::move(compiledPartitionInfo->indexedSubGraph), + std::move(compiledPartitionInfo->isInitializerTransferable)); + } + } + + return onnxruntime::common::Status::OK(); + } +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeGraphFusionTransformer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeGraphFusionTransformer.h new file mode 100644 index 0000000000000..cfa743e1f2b85 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeGraphFusionTransformer.h @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
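The runtime fusion transformer above, together with the GraphPartitioner changes later in this diff, splits each partition's constant-CPU inputs into two buckets: inputs that are graph initializers become transferable initializers owned by the fused kernel, while CPU inputs produced at runtime land in dynamicCpuInputMap and are later pinned to CPU memory on the fused node. A condensed standalone sketch of that bookkeeping (the tensor names are made up):

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
    std::set<std::string> graphInitializers = {"weight", "scale"};
    std::set<std::string> requiredConstantCpuInputs = {"scale", "axes"};

    std::vector<std::string> partitionInputs = {"input_0", "weight", "scale", "axes"};
    std::set<std::string> transferableInitializers;
    std::set<std::string> dynamicCpuInputs;

    for (const auto& name : partitionInputs) {
        const bool isInitializer = graphInitializers.count(name) != 0;
        const bool requiredOnCpu = requiredConstantCpuInputs.count(name) != 0;
        if (isInitializer && requiredOnCpu) {
            transferableInitializers.insert(name);   // folded into the compiled graph
        } else if (requiredOnCpu) {
            dynamicCpuInputs.insert(name);           // computed at runtime, pinned to CPU
        }
    }

    std::cout << transferableInitializers.size() << " transferable, "
              << dynamicCpuInputs.size() << " dynamic CPU inputs\n";  // 1 transferable, 1 dynamic
    return 0;
}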
+#pragma once + +#include +#include +#include "core/optimizer/graph_transformer.h" +#include "core/framework/execution_providers.h" + +namespace Dml +{ +class ExecutionProviderImpl; + +class DmlRuntimeGraphFusionTransformer : public onnxruntime::GraphTransformer +{ +public: + DmlRuntimeGraphFusionTransformer( + const std::string& name, + const onnxruntime::IExecutionProvider* provider + ); + +public: + static inline const char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlRuntimeFusedNode_"; + static inline const char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlRuntimeFusedNodeDomain"; + +private: + onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph, + bool& modified, + int graphLevel, + const onnxruntime::logging::Logger& logger) const final; + + onnxruntime::common::Status ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graphLevel, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const; + +private: + const ExecutionProviderImpl* m_providerImpl = nullptr; +}; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 5f6bd178aaa15..8644b8d56a426 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -67,7 +67,8 @@ namespace Dml ExecutionProvider::ExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands) : + bool enableMetacommands, + bool enableDynamicGraphFusion) : IExecutionProvider(onnxruntime::kDmlExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)) { D3D12_COMMAND_LIST_TYPE queueType = commandQueue->GetDesc().Type; @@ -80,7 +81,7 @@ namespace Dml ComPtr device; GRAPHICS_THROW_IF_FAILED(commandQueue->GetDevice(IID_GRAPHICS_PPV_ARGS(device.GetAddressOf()))); - m_impl = wil::MakeOrThrow(dmlDevice, device.Get(), commandQueue, enableMetacommands); + m_impl = wil::MakeOrThrow(dmlDevice, device.Get(), commandQueue, enableMetacommands, enableDynamicGraphFusion); } std::vector> @@ -147,12 +148,12 @@ namespace Dml // Task 24384515: Update ORT AIInfra release agent pool to install 19H1 SDK on VM bootstrap #define D3D_FEATURE_LEVEL_1_0_CORE_PRIVATE ((D3D_FEATURE_LEVEL)0x1000) - ExecutionProviderImpl::ExecutionProviderImpl(IDMLDevice* dmlDevice, ID3D12Device* d3d12Device, ID3D12CommandQueue* queue, bool enableMetacommands) + ExecutionProviderImpl::ExecutionProviderImpl(IDMLDevice* dmlDevice, ID3D12Device* d3d12Device, ID3D12CommandQueue* queue, bool enableMetacommands, bool enableDynamicGraphFusion) : m_d3d12Device(d3d12Device), m_dmlDevice(dmlDevice), - m_areMetacommandsEnabled(enableMetacommands) + m_areMetacommandsEnabled(enableMetacommands), + m_dynamicGraphFusionEnabled(enableDynamicGraphFusion) { - D3D12_FEATURE_DATA_FEATURE_LEVELS featureLevels = {}; D3D_FEATURE_LEVEL featureLevelsList[] = { @@ -1093,6 +1094,11 @@ namespace Dml return m_areMetacommandsEnabled; } + bool ExecutionProviderImpl::DynamicGraphFusionEnabled() const noexcept + { + return m_dynamicGraphFusionEnabled; + } + std::shared_ptr ExecutionProviderImpl::GetInternalRegistrationInfoMap() const { @@ -1129,9 +1135,10 @@ namespace Dml std::unique_ptr CreateExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands) + bool enableMetacommands, + bool enableDynamicGraphFusion) { - return 
std::make_unique(dmlDevice, commandQueue, enableMetacommands); + return std::make_unique(dmlDevice, commandQueue, enableMetacommands, enableDynamicGraphFusion); } ID3D12Resource* GetD3D12ResourceFromAllocation(onnxruntime::IAllocator* allocator, void* ptr) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 31b893a2f25d7..3aaa11cdee479 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -5,6 +5,7 @@ #include "GraphTransformer.h" #include "core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h" +#include "core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h" #include #include @@ -34,7 +35,8 @@ namespace Dml IDMLDevice* dmlDevice, ID3D12Device* d3d12Device, ID3D12CommandQueue* queue, - bool enableMetacommands = true); + bool enableMetacommands, + bool enableDynamicGraphFusion); void ReleaseCompletedReferences(); @@ -150,6 +152,7 @@ namespace Dml STDMETHOD_(bool, IsMcdmDevice)() const noexcept final; STDMETHOD_(bool, MetacommandsEnabled)() const noexcept final; + bool DynamicGraphFusionEnabled() const noexcept; std::shared_ptr GetGpuAllocator(); std::shared_ptr GetCpuInputAllocator(); @@ -184,6 +187,7 @@ namespace Dml ComPtr m_dmlDevice; bool m_isMcdmDevice = false; bool m_areMetacommandsEnabled = true; + bool m_dynamicGraphFusionEnabled = false; bool m_native16BitShaderOpsSupported = false; std::shared_ptr m_context; std::unique_ptr m_uploadHeap; @@ -236,7 +240,8 @@ namespace Dml explicit ExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, - bool enableMetacommands = true + bool enableMetacommands, + bool enableDynamicGraphFusion ); std::unique_ptr GetDataTransfer() const final override @@ -299,9 +304,9 @@ namespace Dml return m_impl.Get(); } - void MetacommandsEnabled() + bool DynamicGraphFusionEnabled() const { - m_impl->MetacommandsEnabled(); + return m_impl->DynamicGraphFusionEnabled(); } virtual std::vector CreatePreferredAllocators() override diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index 636f46428ce99..3fc8f415e5a58 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -147,14 +147,14 @@ namespace Dml::GraphDescBuilder const uint8_t* isConstGpuGraphInput, const size_t isConstGpuGraphInputCount, const std::unordered_map>& isInitializerTransferable, - const onnxruntime::Graph& graph, - const onnxruntime::IndexedSubGraph& indexedSubGraph, const std::unordered_map& graphNodePropertyMap, IDMLDevice* device, - const void* executionHandle) + const void* executionHandle, + const onnxruntime::Path& modelPath, + gsl::span subgraphNodes, + gsl::span subgraphInputs, + gsl::span subgraphOutputs) { - const gsl::span subGraphInputArgNames = indexedSubGraph.GetMetaDef()->inputs; - const gsl::span subGraphOutputArgNames = indexedSubGraph.GetMetaDef()->outputs; struct NodeAndIndex { uint32_t nodeIndex; // The index of the node itself @@ -164,12 +164,14 @@ namespace Dml::GraphDescBuilder // Map from Lotus node argument names to the new node and index where it will be produced std::unordered_map nameToNodeAndIndexMap; + std::unordered_map nodeOutputShapes; + // Map from 
Lotus node argument names to input indices of the fused kernel node. std::unordered_map nameToDmlFusedNodeInputIndex; - for (size_t inputIndex = 0; inputIndex < subGraphInputArgNames.size(); ++inputIndex) + for (size_t inputIndex = 0; inputIndex < subgraphInputs.size(); ++inputIndex) { - const onnxruntime::NodeArg* graphInput = graph.GetNodeArg(subGraphInputArgNames[inputIndex]); + const onnxruntime::NodeArg* graphInput = subgraphInputs[inputIndex]; if (!graphInput) { @@ -196,13 +198,11 @@ namespace Dml::GraphDescBuilder const uint32_t minNodeCountToReuseCommandList = 5; bool reuseCommandList = false; - if (indexedSubGraph.nodes.size() >= minNodeCountToReuseCommandList) + if (subgraphNodes.size() >= minNodeCountToReuseCommandList) { reuseCommandList = true; } - auto modelPath = graph.ModelPath(); - auto constantCpuGraphInputGetter = [&isInitializerTransferable, &modelPath](const std::string& argName) { ComPtr tensorWrapper; @@ -219,9 +219,11 @@ namespace Dml::GraphDescBuilder // Iterate through each node and create a corresponding node in the new graph // We can iterate the nodes in any order because the edge connectivity will take care of the topological order - for (size_t sortedNodeIndex : indexedSubGraph.nodes) + std::unordered_map> inferredOutputShapes; + + for (const onnxruntime::Node* subgraphNode : subgraphNodes) { - const onnxruntime::Node& node = *graph.GetNode(sortedNodeIndex); + const onnxruntime::Node& node = *subgraphNode; const GraphNodeProperties& graphNodeProps = graphNodePropertyMap.find(GetUniqueNodeName(node))->second; const auto& requiredConstantCpuInputs = graphNodeProps.internalRegInfo->requiredConstantCpuInputs; @@ -244,14 +246,45 @@ namespace Dml::GraphDescBuilder return tensor; }; + EdgeShapes inputShapesOverrides(node.InputDefs().size()); + + // Override the input shapes with shapes that were previously inferred + for (int inputIndex = 0; inputIndex < node.InputDefs().size(); ++inputIndex) + { + auto inputDef = node.InputDefs()[inputIndex]; + + auto outputShapesIter = inferredOutputShapes.find(inputDef->Name()); + if (outputShapesIter != inferredOutputShapes.end()) + { + inputShapesOverrides.GetMutableShape(inputIndex) = outputShapesIter->second; + } + else if (inputDef->HasTensorOrScalarShape()) + { + for (int i = 0; i < inputDef->Shape()->dim_size(); ++i) + { + ORT_THROW_HR_IF(E_INVALIDARG, !inputDef->Shape()->dim(i).has_dim_value()); + inputShapesOverrides.GetMutableShape(inputIndex).push_back(gsl::narrow_cast(inputDef->Shape()->dim(i).dim_value())); + } + } + } + + EdgeShapes outputShapes; DmlGraphNodeCreateInfo graphNodeCreateInfo; graphNodeProps.internalRegInfo->graphNodeFactoryRegistration->factory( node, constantCpuNodeInputGetter, executionHandle, + &inputShapesOverrides, + /*out*/ &outputShapes, /*out*/ &graphNodeCreateInfo ); + ORT_THROW_HR_IF(E_UNEXPECTED, outputShapes.EdgeCount() != node.OutputDefs().size()); + for (int i = 0; i < node.OutputDefs().size(); ++i) + { + inferredOutputShapes[node.OutputDefs()[i]->Name()] = outputShapes.GetShape(i); + } + // Create a map between operatorGraphNodeIndex to mainGraphNodeIndex. 
std::unordered_map operatorGraphNodeIndexToMainGraphNodeIndexMap; uint32_t graphNodeCount = gsl::narrow_cast(graphNodes.size()); @@ -347,6 +380,8 @@ namespace Dml::GraphDescBuilder operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphOutputEdge.FromNodeIndex], operatorGraphOutputEdge.FromNodeOutputIndex }; + + nodeOutputShapes[arg->Name()] = outputShapes; } } @@ -367,10 +402,12 @@ namespace Dml::GraphDescBuilder } } + EdgeShapes graphOutputShapes(subgraphOutputs.size()); + // Add graph output nodes, which might be in a different order from the encapsulating node - for (size_t outputIndex = 0; outputIndex < subGraphOutputArgNames.size(); ++outputIndex) + for (size_t outputIndex = 0; outputIndex < subgraphOutputs.size(); ++outputIndex) { - const onnxruntime::NodeArg* graphOutput = graph.GetNodeArg(subGraphOutputArgNames[outputIndex]); + const onnxruntime::NodeArg* graphOutput = subgraphOutputs[outputIndex]; ORT_THROW_HR_IF_NULL_MSG(E_POINTER, graphOutput, "FusedNode's nodeArgList does not contain one of the nodeArg"); const auto& outputNodeAndIndex = nameToNodeAndIndexMap.at(graphOutput->Name()); @@ -380,6 +417,7 @@ namespace Dml::GraphDescBuilder edge.FromNodeOutputIndex = outputNodeAndIndex.targetIndex; edge.GraphOutputIndex = gsl::narrow_cast(outputIndex); graphOutputEdges.push_back(edge); + graphOutputShapes.GetMutableShape(outputIndex) = nodeOutputShapes[graphOutput->Name()].GetShape(outputNodeAndIndex.targetIndex); } RemoveUnconnectedNodes(graphNodes, graphInputEdges, graphIntermediateEdges, graphOutputEdges); @@ -390,6 +428,7 @@ namespace Dml::GraphDescBuilder graphDesc.outputEdges = std::move(graphOutputEdges); graphDesc.intermediateEdges = std::move(graphIntermediateEdges); graphDesc.reuseCommandList = reuseCommandList; + graphDesc.outputShapes = std::move(graphOutputShapes); return graphDesc; } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h index 5c04962e55557..0039678c00e59 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h @@ -9,10 +9,10 @@ namespace Dml { struct GraphNodeProperties { - std::shared_ptr + std::shared_ptr internalRegInfo; - // These are currently passed from the partitioning step since the only DML operators current + // These are currently passed from the partitioning step since the only DML operators current // supporting graph nodes don't customize the order of edges or shapes, other than coercing // dimension count. This will change as the supported set of operators as graph nodes increases. 
Windows::AI::MachineLearning::Adapter::EdgeShapes inputShapes; @@ -38,16 +38,19 @@ namespace Dml std::vector outputEdges; std::vector intermediateEdges; bool reuseCommandList; + Windows::AI::MachineLearning::Adapter::EdgeShapes outputShapes; }; GraphDesc BuildGraphDesc( const uint8_t* isConstGpuGraphInput, const size_t isConstGpuGraphInputCount, const std::unordered_map>& isInitializerTransferable, - const onnxruntime::Graph& graph, - const onnxruntime::IndexedSubGraph& indexedSubGraph, const std::unordered_map& graphNodePropertyMap, IDMLDevice* device, - const void* executionHandle); + const void* executionHandle, + const onnxruntime::Path& modelPath, + gsl::span subgraphNodes, + gsl::span subgraphInputs, + gsl::span subgraphOutputs); } -} \ No newline at end of file +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp index 18943878ccedc..f7a4743801d81 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp @@ -151,6 +151,8 @@ namespace Dml _In_opt_ const std::unordered_map* nodeNameToPartitionMap, _Inout_ std::unordered_map& dmlNodePropertyMap, _Inout_ std::unordered_set& requiredInitializerMap, + _Inout_ std::unordered_set& dynamicCpuInputMap, + bool allowDmlGraphDynamicShapes, _Out_ bool* isDmlGraphNode ) { @@ -172,36 +174,68 @@ namespace Dml if (internalRegInfo && internalRegInfo->graphNodeFactoryRegistration) { - bool requiredCpuInputsConstant = true; - for (uint32_t inputIndex : internalRegInfo->requiredConstantCpuInputs) + if (allowDmlGraphDynamicShapes) { - if (inputIndex >= node.InputDefs().size() || !node.InputDefs()[inputIndex]->Exists()) + for (uint32_t inputIndex : internalRegInfo->requiredConstantCpuInputs) { - continue; - } + if (inputIndex >= node.InputDefs().size() || !node.InputDefs()[inputIndex]->Exists()) + { + continue; + } - const onnx::TensorProto* tensor = nullptr; - const std::string& inputName = node.InputDefs()[inputIndex]->Name(); + const onnx::TensorProto* tensor = nullptr; + const std::string& inputName = node.InputDefs()[inputIndex]->Name(); - if (!graph.GetInitializedTensor(inputName, tensor)) - { - requiredCpuInputsConstant = false; - break; + if (graph.GetInitializedTensor(inputName, tensor)) + { + requiredInitializerMap.insert(inputName); + } + else + { + dynamicCpuInputMap.insert(inputName); + } } - requiredInitializerMap.insert(inputName); + std::optional requiredInputCount = internalRegInfo->graphNodeFactoryRegistration->requiredInputCount; + if (requiredInputCount == std::nullopt || *requiredInputCount == node.InputDefs().size()) + { + *isDmlGraphNode = true; + graphNodeProperty.first->second.internalRegInfo = internalRegInfo; + } } - - std::optional requiredInputCount = internalRegInfo->graphNodeFactoryRegistration->requiredInputCount; - if (requiredCpuInputsConstant && - TryGetStaticInputShapes( node, graphNodeProperty.first->second.inputShapes) && - !ContainsEmptyDimensions(graphNodeProperty.first->second.inputShapes, internalRegInfo->requiredConstantCpuInputs) && - TryGetStaticOutputShapes(node, graphNodeProperty.first->second.outputShapes) && - !ContainsEmptyDimensions(graphNodeProperty.first->second.outputShapes, internalRegInfo->requiredConstantCpuInputs) && - (requiredInputCount == std::nullopt || *requiredInputCount == node.InputDefs().size())) + else { - *isDmlGraphNode = true; - 
graphNodeProperty.first->second.internalRegInfo = internalRegInfo; + bool requiredCpuInputsConstant = true; + for (uint32_t inputIndex : internalRegInfo->requiredConstantCpuInputs) + { + if (inputIndex >= node.InputDefs().size() || !node.InputDefs()[inputIndex]->Exists()) + { + continue; + } + + const onnx::TensorProto* tensor = nullptr; + const std::string& inputName = node.InputDefs()[inputIndex]->Name(); + + if (!graph.GetInitializedTensor(inputName, tensor)) + { + requiredCpuInputsConstant = false; + break; + } + + requiredInitializerMap.insert(inputName); + } + + std::optional requiredInputCount = internalRegInfo->graphNodeFactoryRegistration->requiredInputCount; + if (requiredCpuInputsConstant && + TryGetStaticInputShapes( node, graphNodeProperty.first->second.inputShapes) && + !ContainsEmptyDimensions(graphNodeProperty.first->second.inputShapes, internalRegInfo->requiredConstantCpuInputs) && + TryGetStaticOutputShapes(node, graphNodeProperty.first->second.outputShapes) && + !ContainsEmptyDimensions(graphNodeProperty.first->second.outputShapes, internalRegInfo->requiredConstantCpuInputs) && + (requiredInputCount == std::nullopt || *requiredInputCount == node.InputDefs().size())) + { + *isDmlGraphNode = true; + graphNodeProperty.first->second.internalRegInfo = internalRegInfo; + } } } } @@ -379,8 +413,10 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, + std::unordered_set& dynamicCpuInputMap, gsl::span additionalSplittingNodes, - const std::unordered_map& implicitInputs) + const std::unordered_map& implicitInputs, + bool allowDmlGraphDynamicShapes) { // Nodes are uniquely identified by the name of their first output argument std::vector> partitions; @@ -443,6 +479,8 @@ namespace Dml &nodeNameToPartitionMap, graphNodePropertyMap, requiredInitializerMap, + dynamicCpuInputMap, + allowDmlGraphDynamicShapes, /*out*/ &isDmlGraphNode ); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h index 37d577f647fb5..3bddb5ae16086 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h @@ -50,6 +50,8 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, + std::unordered_set& dynamicCpuInputMap, gsl::span additionalSplittingNodes, - const std::unordered_map& implicitInputs); + const std::unordered_map& implicitInputs, + bool allowDmlGraphDynamicShapes); } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h index d7a0a607cdec9..a8a6d6745e908 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/IExecutionProvider.h @@ -2,8 +2,15 @@ // Licensed under the MIT License. 
#pragma once + +#include + #include "core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h" +interface IDMLCompiledOperator; +struct DML_BUFFER_BINDING; +struct DML_BINDING_DESC; + namespace Dml { struct Binding diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index 6cd10e14e08d2..4deec620fe5fb 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1356,13 +1356,14 @@ namespace Windows::AI::MachineLearning::Adapter const onnxruntime::OpNodeProtoHelper* protoHelper, const void* executionHandle, bool isInternalOperator, + const EdgeShapes* inputShapesOverrides, const EdgeShapes* inferredOutputShapes, const AttributeMap* defaultAttributes, DmlGraphNodeCreateInfo* graphNodeCreateInfo, gsl::span requiredConstantCpuInputs, MLOperatorTensorGetter& constantInputGetter ) - : OpNodeInfoWrapper(protoHelper, nullptr, defaultAttributes, requiredConstantCpuInputs, constantInputGetter, nullptr), + : OpNodeInfoWrapper(protoHelper, inputShapesOverrides, defaultAttributes, requiredConstantCpuInputs, constantInputGetter, nullptr), m_inferredOutputShapes(inferredOutputShapes), m_internalOperator(isInternalOperator), m_graphNodeCreateInfo(graphNodeCreateInfo) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h index a7f8bebb2de78..913997ff4ad49 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.h @@ -4,6 +4,7 @@ #pragma once #include "core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h" #include "core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlEdgeShapes.h" #include "core/framework/op_kernel.h" #include "core/framework/customregistry.h" #include "core/framework/tensorprotoutils.h" @@ -93,42 +94,6 @@ struct AttributeValue using AttributeMap = std::map; -// Encapsulation of shapes across different edges of an operator. Non-tensor -// edges and unused edges have an empty array of dimensions. -class EdgeShapes -{ -public: - EdgeShapes() = default; - - EdgeShapes(size_t count) : m_shapes(count) {} - - const std::vector& GetShape(size_t edgeIndex) const - { - return m_shapes[edgeIndex]; - } - - std::vector& GetMutableShape(size_t edgeIndex) - { - return m_shapes[edgeIndex]; - } - - size_t EdgeCount() const { return m_shapes.size(); } - - void Reset(size_t edge_count) - { - m_shapes.clear(); - m_shapes.resize(edge_count); - } - - bool operator!=(const EdgeShapes& other) const noexcept - { - return (m_shapes != other.m_shapes); - } - - private: - std::vector> m_shapes; -}; - // Base class for ABI objects which may be "Closed", at which point calls will predictably // fail or return a dummy value. 
This is used for transient ABI context objects which // are passed to methods on kernel or inferencers, and which wrap Lotus objects whose lifetimes @@ -434,6 +399,7 @@ class DmlGraphOpKernelInfoWrapper : public OpNodeInfoWrapper< const onnxruntime::OpNodeProtoHelper * protoHelper, const void* executionHandle, bool isInternalOperator, + const EdgeShapes* inputShapesOverrides, const EdgeShapes* inferredOutputShapes, const AttributeMap* defaultAttributes, DmlGraphNodeCreateInfo* graphNodeCreateInfo, diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index cd8bc8fe909dc..d587424fe01f8 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -30,8 +30,12 @@ namespace onnxruntime { struct DMLProviderFactory : IExecutionProviderFactory { DMLProviderFactory(IDMLDevice* dml_device, - ID3D12CommandQueue* cmd_queue) : dml_device_(dml_device), - cmd_queue_(cmd_queue) {} + ID3D12CommandQueue* cmd_queue, + bool disable_metacommands, + bool enable_dynamic_graph_fusion) : dml_device_(dml_device), + cmd_queue_(cmd_queue), + metacommands_enabled_(!disable_metacommands), + dynamic_graph_fusion_enabled_(enable_dynamic_graph_fusion) {} ~DMLProviderFactory() override {} std::unique_ptr CreateProvider() override; @@ -42,10 +46,11 @@ struct DMLProviderFactory : IExecutionProviderFactory { ComPtr dml_device_{}; ComPtr cmd_queue_{}; bool metacommands_enabled_ = true; + bool dynamic_graph_fusion_enabled_ = false; }; std::unique_ptr DMLProviderFactory::CreateProvider() { - auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), cmd_queue_.Get(), metacommands_enabled_); + auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), cmd_queue_.Get(), metacommands_enabled_, dynamic_graph_fusion_enabled_); return provider; } @@ -54,7 +59,9 @@ void DMLProviderFactory::SetMetacommandsEnabled(bool metacommands_enabled) { } std::shared_ptr CreateExecutionProviderFactory_DML(IDMLDevice* dml_device, - ID3D12CommandQueue* cmd_queue) { + ID3D12CommandQueue* cmd_queue, + bool disable_metacommands, + bool enable_dynamic_graph_fusion) { #ifndef _GAMING_XBOX // Validate that the D3D12 devices match between DML and the command queue. This specifically asks for IUnknown in // order to be able to compare the pointers for COM object identity. 
@@ -73,7 +80,7 @@ std::shared_ptr CreateExecutionProviderFactory_DML(ID const Env& env = Env::Default(); auto luid = d3d12_device->GetAdapterLuid(); env.GetTelemetryProvider().LogExecutionProviderEvent(&luid); - return std::make_shared(dml_device, cmd_queue); + return std::make_shared(dml_device, cmd_queue, disable_metacommands, enable_dynamic_graph_fusion); } void DmlConfigureProviderFactoryMetacommandsEnabled(IExecutionProviderFactory* factory, bool metacommandsEnabled) { @@ -234,12 +241,10 @@ static void SortHeterogenousDXCoreAdapterList( std::sort(adapter_infos.begin(), adapter_infos.end(), policy); } -std::shared_ptr DMLProviderFactoryCreator::Create(int device_id) { - return Create(device_id, /*skip_software_device_check*/ false); -} - std::shared_ptr DMLProviderFactoryCreator::CreateFromOptions( - OrtDmlDeviceOptions* device_options) { + OrtDmlDeviceOptions* device_options, + bool disable_metacommands, + bool enable_dynamic_graph_fusion) { auto default_device_options = OrtDmlDeviceOptions { Default, Gpu }; if (device_options == nullptr) { device_options = &default_device_options; @@ -286,7 +291,7 @@ std::shared_ptr DMLProviderFactoryCreator::CreateFrom adapters.begin(), [](auto& a){ return a.Adapter; }); - return onnxruntime::DMLProviderFactoryCreator::CreateFromAdapterList(std::move(adapters)); + return onnxruntime::DMLProviderFactoryCreator::CreateFromAdapterList(std::move(adapters), disable_metacommands, enable_dynamic_graph_fusion); } static std::optional ParsePerformancePreference(const ProviderOptions& provider_options) { @@ -354,12 +359,32 @@ static std::optional ParseDeviceId(const ProviderOptions& provider_options) return {}; } +static bool ParseBoolean(const ProviderOptions& provider_options, const std::string& key) { + auto preference_it = provider_options.find(key); + if (preference_it != provider_options.end() && !preference_it->second.empty()) { + if (preference_it->second == "True" || preference_it->second == "true") { + return true; + } else if (preference_it->second == "False" || preference_it->second == "false") { + return false; + } else { + ORT_THROW("[ERROR] [DirectML] The value for the key '" + key + "' should be 'True' or 'False'. Default value is 'False'.\n"); + } + } + + return false; +} + std::shared_ptr DMLProviderFactoryCreator::CreateFromProviderOptions( - const ProviderOptions& provider_options) { + const ProviderOptions& provider_options) { + + bool disable_metacommands = ParseBoolean(provider_options, "disable_metacommands"); + bool enable_dynamic_graph_fusion = ParseBoolean(provider_options, "enable_dynamic_graph_fusion"); + bool skip_software_device_check = false; auto device_id = ParseDeviceId(provider_options); + if (device_id.has_value()) { - return onnxruntime::DMLProviderFactoryCreator::Create(device_id.value()); + return onnxruntime::DMLProviderFactoryCreator::Create(device_id.value(), skip_software_device_check, disable_metacommands, enable_dynamic_graph_fusion); } auto preference = ParsePerformancePreference(provider_options); @@ -367,7 +392,7 @@ std::shared_ptr DMLProviderFactoryCreator::CreateFrom // If no preference/filters are specified then create with default preference/filters. 
if (!preference.has_value() && !filter.has_value()) { - return onnxruntime::DMLProviderFactoryCreator::CreateFromOptions(nullptr); + return onnxruntime::DMLProviderFactoryCreator::CreateFromOptions(nullptr, disable_metacommands, enable_dynamic_graph_fusion); } if (!preference.has_value()) { @@ -381,7 +406,7 @@ std::shared_ptr DMLProviderFactoryCreator::CreateFrom OrtDmlDeviceOptions device_options; device_options.Preference = preference.value(); device_options.Filter = filter.value(); - return onnxruntime::DMLProviderFactoryCreator::CreateFromOptions(&device_options); + return onnxruntime::DMLProviderFactoryCreator::CreateFromOptions(&device_options, disable_metacommands, enable_dynamic_graph_fusion); } Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateD3D12Device( @@ -441,7 +466,10 @@ Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateDMLDevice(ID return dml_device; } -std::shared_ptr CreateDMLDeviceAndProviderFactory(ID3D12Device* d3d12_device) { +std::shared_ptr CreateDMLDeviceAndProviderFactory( + ID3D12Device* d3d12_device, + bool disable_metacommands, + bool enable_dynamic_graph_fusion) { D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; @@ -450,16 +478,22 @@ std::shared_ptr CreateDMLDeviceAndProviderFactory(ID3 ORT_THROW_IF_FAILED(d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_GRAPHICS_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); auto dml_device = onnxruntime::DMLProviderFactoryCreator::CreateDMLDevice(d3d12_device); - return CreateExecutionProviderFactory_DML(dml_device.Get(), cmd_queue.Get()); + return CreateExecutionProviderFactory_DML(dml_device.Get(), cmd_queue.Get(), disable_metacommands, enable_dynamic_graph_fusion); } -std::shared_ptr DMLProviderFactoryCreator::Create(int device_id, bool skip_software_device_check) { +std::shared_ptr DMLProviderFactoryCreator::Create( + int device_id, + bool skip_software_device_check, + bool disable_metacommands, + bool enable_dynamic_graph_fusion) { ComPtr d3d12_device = CreateD3D12Device(device_id, skip_software_device_check); - return CreateDMLDeviceAndProviderFactory(d3d12_device.Get()); + return CreateDMLDeviceAndProviderFactory(d3d12_device.Get(), disable_metacommands, enable_dynamic_graph_fusion); } std::shared_ptr DMLProviderFactoryCreator::CreateFromAdapterList( - std::vector>&& dxcore_devices) { + std::vector>&& dxcore_devices, + bool disable_metacommands, + bool enable_dynamic_graph_fusion) { // Choose the first device from the list since it's the highest priority auto dxcore_device = dxcore_devices[0]; @@ -467,7 +501,7 @@ std::shared_ptr DMLProviderFactoryCreator::CreateFrom ComPtr d3d12_device; ORT_THROW_IF_FAILED(D3D12CreateDevice(dxcore_device.Get(), D3D_FEATURE_LEVEL_11_0, IID_GRAPHICS_PPV_ARGS(d3d12_device.ReleaseAndGetAddressOf()))); - return CreateDMLDeviceAndProviderFactory(d3d12_device.Get()); + return CreateDMLDeviceAndProviderFactory(d3d12_device.Get(), disable_metacommands, enable_dynamic_graph_fusion); } } // namespace onnxruntime @@ -477,7 +511,7 @@ std::shared_ptr DMLProviderFactoryCreator::CreateFrom // The OrtSessionOptionsAppendExecutionProvider_DML export on the OrtDmlApi should be used instead. 
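The `disable_metacommands` and `enable_dynamic_graph_fusion` keys handled by `ParseBoolean` above are ordinary string provider options, so they can be set from Python when the DirectML EP is registered. A minimal sketch, assuming a placeholder model path; both keys default to "false" and accept "True"/"true"/"False"/"false":

```python
import onnxruntime as ort

sess = ort.InferenceSession(
    "model.onnx",  # placeholder path
    providers=[(
        "DmlExecutionProvider",
        {
            "disable_metacommands": "false",        # metacommands stay enabled (the default)
            "enable_dynamic_graph_fusion": "true",  # opt in to runtime fusion of dynamic-shape graphs
        },
    )],
)
```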
ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_DML, _In_ OrtSessionOptions* options, int device_id) { API_IMPL_BEGIN - options->provider_factories.push_back(onnxruntime::DMLProviderFactoryCreator::Create(device_id)); + options->provider_factories.push_back(onnxruntime::DMLProviderFactoryCreator::Create(device_id, false, false, false)); API_IMPL_END return nullptr; } @@ -489,7 +523,9 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProviderEx_DML, _In_ OrtSess _In_ IDMLDevice* dml_device, _In_ ID3D12CommandQueue* cmd_queue) { API_IMPL_BEGIN options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_DML(dml_device, - cmd_queue)); + cmd_queue, + false, + false)); API_IMPL_END return nullptr; } @@ -517,7 +553,7 @@ ORT_API_STATUS_IMPL(FreeGPUAllocation, _In_ void* ptr) { ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_DML2, _In_ OrtSessionOptions* options, OrtDmlDeviceOptions* device_options) { API_IMPL_BEGIN #ifdef USE_DML - auto factory = onnxruntime::DMLProviderFactoryCreator::CreateFromOptions(device_options); + auto factory = onnxruntime::DMLProviderFactoryCreator::CreateFromOptions(device_options, false, false); // return the create function for a dxcore device options->provider_factories.push_back(factory); #endif // USE_DML diff --git a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h index 4e13330a4cd71..0fab9fe902526 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h +++ b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h @@ -17,15 +17,24 @@ namespace onnxruntime { struct DMLProviderFactoryCreator { - static std::shared_ptr Create(int device_id); - static std::shared_ptr Create(int device_id, bool skip_software_device_check); + static std::shared_ptr Create( + int device_id, + bool skip_software_device_check, + bool disable_metacommands, + bool enable_dynamic_graph_fusion); static std::shared_ptr CreateFromProviderOptions( const ProviderOptions& provider_options_map); - static std::shared_ptr CreateFromOptions(OrtDmlDeviceOptions* device_options); + + static std::shared_ptr CreateFromOptions( + OrtDmlDeviceOptions* device_options, + bool disable_metacommands, + bool enable_dynamic_graph_fusion); static std::shared_ptr CreateFromAdapterList( - std::vector>&& dxcore_devices); + std::vector>&& dxcore_devices, + bool disable_metacommands, + bool enable_dynamic_graph_fusion); static Microsoft::WRL::ComPtr CreateD3D12Device(int device_id, bool skip_software_device_check); static Microsoft::WRL::ComPtr CreateDMLDevice(ID3D12Device* d3d12_device); diff --git a/onnxruntime/core/providers/js/operators/cast.cc b/onnxruntime/core/providers/js/operators/cast.cc index f05e1eac4329c..9b6ac6d7e253b 100644 --- a/onnxruntime/core/providers/js/operators/cast.cc +++ b/onnxruntime/core/providers/js/operators/cast.cc @@ -14,8 +14,7 @@ const std::vector& CastOpTypeConstraints() { // https://gpuweb.github.io/gpuweb/wgsl/#plain-types-section // static std::vector types{ - // TODO(fs-eire): support f16 when it's ready - // DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), diff --git a/onnxruntime/core/providers/js/operators/resize.cc b/onnxruntime/core/providers/js/operators/resize.cc index 7619c33a477aa..5b2e385777a37 100644 --- a/onnxruntime/core/providers/js/operators/resize.cc +++ b/onnxruntime/core/providers/js/operators/resize.cc @@ 
-5,15 +5,15 @@ namespace onnxruntime { namespace js { -#define REGISTER_RESIZE_VERSIONED_10_10_KERNEL(domain) \ - ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ - Resize, \ - domain, \ - 10, 10, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .InputMemoryType(OrtMemTypeCPUInput, 1) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ +#define REGISTER_RESIZE_VERSIONED_10_10_KERNEL(domain) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + Resize, \ + domain, \ + 10, 10, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .InputMemoryType(OrtMemTypeCPUInput, 1) \ + .TypeConstraint("T", JsepSupportedFloatTypes()), \ Resize); #define REGISTER_RESIZE_VERSIONED_KERNEL(domain, sinceVersion, endVerion) \ @@ -26,22 +26,22 @@ namespace js { .InputMemoryType(OrtMemTypeCPUInput, 1) \ .InputMemoryType(OrtMemTypeCPUInput, 2) \ .InputMemoryType(OrtMemTypeCPUInput, 3) \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + .TypeConstraint("T1", JsepSupportedFloatTypes()) \ + .TypeConstraint("T2", JsepSupportedFloatTypes()), \ Resize); -#define REGISTER_RESIZE_KERNEL(domain, sinceVersion) \ - ONNX_OPERATOR_KERNEL_EX( \ - Resize, \ - domain, \ - sinceVersion, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .InputMemoryType(OrtMemTypeCPUInput, 1) \ - .InputMemoryType(OrtMemTypeCPUInput, 2) \ - .InputMemoryType(OrtMemTypeCPUInput, 3) \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ +#define REGISTER_RESIZE_KERNEL(domain, sinceVersion) \ + ONNX_OPERATOR_KERNEL_EX( \ + Resize, \ + domain, \ + sinceVersion, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .InputMemoryType(OrtMemTypeCPUInput, 1) \ + .InputMemoryType(OrtMemTypeCPUInput, 2) \ + .InputMemoryType(OrtMemTypeCPUInput, 3) \ + .TypeConstraint("T1", JsepSupportedFloatTypes()) \ + .TypeConstraint("T2", JsepSupportedFloatTypes()), \ Resize); #define REGISTER_RESIZE_KERNEL_DOMAIN(domain) \ diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 74d237a62f73d..ef1f0bf9f8d0e 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1210,6 +1210,12 @@ Status TensorrtExecutionProvider::OnRunEnd(bool sync_stream) { } void TensorrtExecutionProvider::GetCustomOpDomainList(std::vector& custom_op_domain_list) const { + if (info_.custom_op_domain_list.empty()) { + common::Status status = CreateTensorRTCustomOpDomainList(info_); + if (!status.IsOK()) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Failed to get TRT plugins from TRT plugin registration."; + } + } custom_op_domain_list = info_.custom_op_domain_list; } @@ -1869,6 +1875,7 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, } else if (number_of_trt_nodes == number_of_ort_nodes) { LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider"; } else { + sync_stream_after_enqueue_ = true; LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs; } @@ -2387,7 +2394,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorallocate_func, context->release_func, context->allocator_handle, context->node_name, &parsers_[context->node_name], &engines_[context->node_name], 
&contexts_[context->node_name], &builders_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], - input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, + input_shape_ranges_[context->node_name], sync_stream_after_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, @@ -2415,6 +2422,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& input_indexes = (trt_state->input_info)[0]; const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const std::unordered_map& output_types = (trt_state->output_info)[1]; + bool sync_stream_after_enqueue = trt_state->sync_stream_after_enqueue; auto fused_node_name = trt_state->fused_node_name; auto& shape_ranges = trt_state->input_shape_ranges; auto trt_builder = trt_state->builder->get(); @@ -3022,6 +3030,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector> input_info; std::vector> output_info; std::unordered_map>>> input_shape_ranges; + bool sync_stream_after_enqueue = false; OrtMutex* tensorrt_mu_ptr = nullptr; bool fp16_enable = false; bool int8_enable = false; @@ -196,7 +197,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { Status ReplayGraph() override; private: - TensorrtExecutionProviderInfo info_; + mutable TensorrtExecutionProviderInfo info_; bool external_stream_ = false; cudaStream_t stream_ = nullptr; int max_partition_iterations_ = 1000; @@ -262,6 +263,9 @@ class TensorrtExecutionProvider : public IExecutionProvider { cudnnHandle_t external_cudnn_handle_ = nullptr; cublasHandle_t external_cublas_handle_ = nullptr; + // Call cudaStreamSynchronize() after TRT enqueueV2()/enqueueV3() + mutable bool sync_stream_after_enqueue_ = false; + CUDAGraph cuda_graph_; bool is_graph_captured_ = false; int regular_run_count_before_graph_capture_ = 0; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index b5dbe1ac459b1..d7e13df000272 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -75,11 +75,6 @@ struct Tensorrt_Provider : Provider { info.device_id = device_id; info.has_trt_options = false; - common::Status status = CreateTensorRTCustomOpDomainList(info); - if (!status.IsOK()) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Failed to get TRT plugins from TRT plugin registration."; - } - return std::make_shared(info); } @@ -121,11 +116,6 @@ struct Tensorrt_Provider : Provider { info.profile_opt_shapes = options.trt_profile_opt_shapes == nullptr ? 
"" : options.trt_profile_opt_shapes; info.cuda_graph_enable = options.trt_cuda_graph_enable != 0; - common::Status status = CreateTensorRTCustomOpDomainList(info); - if (!status.IsOK()) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Failed to get TRT plugins from TRT plugin registration."; - } - return std::make_shared(info); } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index b4d47652942b7..1163be27b1685 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -52,8 +52,10 @@ #include "core/providers/cpu/cpu_execution_provider.h" #ifdef USE_DML // TODO: This is necessary for the workaround in TransformGraph #include "core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlRuntimeGraphFusionTransformer.h" #include "core/providers/dml/DmlExecutionProvider/src/GraphTransformer.h" #include "core/providers/dml/dml_session_options_config_keys.h" +#include "core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h" #endif #include "core/session/environment.h" #include "core/session/user_logging_sink.h" @@ -613,9 +615,35 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr } #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) - // Create Custom Op if EP requests it + // Register Custom Op if EP requests it std::vector custom_op_domains; - p_exec_provider->GetCustomOpDomainList(custom_op_domains); + std::vector candidate_custom_op_domains; + p_exec_provider->GetCustomOpDomainList(candidate_custom_op_domains); + + auto registry_kernels = kernel_registry_manager_.GetKernelRegistriesByProviderType(p_exec_provider->Type()); + + // Register the custom op domain only if it has not been registered before + if (registry_kernels.empty()) { + custom_op_domains = candidate_custom_op_domains; + } else { + for (auto candidate_custom_op_domain : candidate_custom_op_domains) { + for (auto registry_kernel : registry_kernels) { + const auto& kernel_map = registry_kernel->GetKernelCreateMap(); + bool need_register = true; + // If the kernel registry is the ep's custom op registry, we only need to check the first kernel, + // because all kernels in one kernel registry should have the same domain name. + for (auto iter = kernel_map.begin(); iter != kernel_map.end(); iter++) { + if (iter->second.kernel_def->Domain() == candidate_custom_op_domain->domain_) { + need_register = false; + break; + } + } + if (need_register) { + custom_op_domains.push_back(candidate_custom_op_domain); + } + } + } + } if (!custom_op_domains.empty()) { if (AddCustomOpDomains(custom_op_domains) != Status::OK()) { @@ -984,14 +1012,25 @@ common::Status InferenceSession::Load() { common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool saving_model_in_ort_format) { // The transformer order: - // 1. ensure potential QDQ node units have unique DQ nodes (required transformer). + // 1. Ensure we inline as many functions as possible. We refer to it as Ahead Of Time (AOT) function inlining. + // 2. ensure potential QDQ node units have unique DQ nodes (required transformer). // - This is a required transformer as the ORT code has a hard requirement there are no overlapping QDQ node units. // - We run it here in case optimizers are disabled. - // 2. run level 1 optimizations. these only use ONNX operators. - // 3. partition nodes based on EP capabilities. 
EPs may fuse nodes during this process. - // 4. run level 2+ optimizations. level 2 and 3 optimizations use contrib ops. - // 5. insert cast nodes (required transformer). - // 6. insert copy nodes (required transformer). + // 3. run level 1 optimizations. these only use ONNX operators. + // 4. partition nodes based on EP capabilities. EPs may fuse nodes during this process. + // 5. run level 2+ optimizations. level 2 and 3 optimizations use contrib ops. + // 6. insert cast nodes (required transformer). + // 7. insert copy nodes (required transformer). + + // Run Ahead Of time function inlining + GraphPartitioner partitioner(kernel_registry_manager_, execution_providers_); + if (const bool disable_aot_function_inlining = + session_options_.config_options.GetConfigOrDefault( + kOrtSessionOptionsDisableAheadOfTimeFunctionInlining, "0") == "1"; + !disable_aot_function_inlining) { + ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.InlineFunctionsAOT(*model_, + execution_providers_, kernel_registry_manager_)); + } auto apply_transformer_once = [](const GraphTransformer& transformer, const logging::Logger& logger, Graph& graph) { @@ -1075,7 +1114,6 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool } // Do partitioning based on execution providers' capabilities. - GraphPartitioner partitioner(kernel_registry_manager_, execution_providers_); ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn, mode, debug_graph_fn)); @@ -1562,7 +1600,9 @@ common::Status InferenceSession::Initialize() { record_runtime_optimization_produced_op_schema)); #ifdef USE_DML - if (execution_providers_.Get(kDmlExecutionProvider)) { + const IExecutionProvider* dmlExecutionProvider = execution_providers_.Get(kDmlExecutionProvider); + + if (dmlExecutionProvider) { // DML graph fusion is an important runtime optimization that cannot be done ahead of time; it must be disabled // when running in "offline mode" and saving an optimized model to disk. 
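As a usage note for the ahead-of-time function inlining added above: it is gated by a session config entry, so it can be switched off from Python. This is a hedged sketch; the string behind `kOrtSessionOptionsDisableAheadOfTimeFunctionInlining` is not shown in this diff, so the key below (and the model path) is an assumption:

```python
import onnxruntime as ort

so = ort.SessionOptions()
# Assumed value of kOrtSessionOptionsDisableAheadOfTimeFunctionInlining; "1" disables AOT inlining.
so.add_session_config_entry("session.disable_aot_function_inlining", "1")
sess = ort.InferenceSession("model.onnx", sess_options=so)  # placeholder model path
```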
To support users that want to optimize // models offline, and then disable graph optimizations when running "online", this transformer ignores the ORT @@ -1572,11 +1612,20 @@ common::Status InferenceSession::Initialize() { if (dml_graph_fusion_enabled) { std::unique_ptr dmlGraphFusionTransformer = std::make_unique("DmlGraphFusionTransformer", - execution_providers_.Get(kDmlExecutionProvider)); + dmlExecutionProvider); if (dmlGraphFusionTransformer == nullptr) { return Status(common::ONNXRUNTIME, common::FAIL, "DmlGraphFusionTransformer is nullptr"); } ORT_RETURN_IF_ERROR_SESSIONID_(graph_transformer_mgr_.Register(std::move(dmlGraphFusionTransformer), onnxruntime::TransformerLevel::Level3)); + + if (static_cast(dmlExecutionProvider)->DynamicGraphFusionEnabled()) { + std::unique_ptr dmlRuntimeGraphFusionTransformer = std::make_unique("DmlRuntimeGraphFusionTransformer", + dmlExecutionProvider); + if (dmlRuntimeGraphFusionTransformer == nullptr) { + return Status(common::ONNXRUNTIME, common::FAIL, "DmlRuntimeGraphFusionTransformer is nullptr"); + } + ORT_RETURN_IF_ERROR_SESSIONID_(graph_transformer_mgr_.Register(std::move(dmlRuntimeGraphFusionTransformer), onnxruntime::TransformerLevel::Level3)); + } } // This transformer applies DML-specific fusions that go beyond what ORT offers by default diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index d950223f2d108..d307f79c372ed 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1625,6 +1625,28 @@ ProviderOptions GetProviderInfo_Cuda(const OrtCUDAProviderOptionsV2* provider_op } // namespace onnxruntime +void AddTensorRTCustomOpDomainToSessionOption(OrtSessionOptions* options, std::string extra_plugin_lib_paths) { + auto is_already_in_domains = [&](std::string& domain_name, std::vector& domains) { + for (auto ptr : domains) { + if (domain_name == ptr->domain_) { + return true; + } + } + return false; + }; + + std::vector custom_op_domains; + onnxruntime::ProviderInfo_TensorRT& provider_info = onnxruntime::GetProviderInfo_TensorRT(); + provider_info.GetTensorRTCustomOpDomainList(custom_op_domains, extra_plugin_lib_paths); + for (auto ptr : custom_op_domains) { + if (!is_already_in_domains(ptr->domain_, options->custom_op_domains_)) { + options->custom_op_domains_.push_back(ptr); + } else { + LOGS_DEFAULT(WARNING) << "The custom op domain name " << ptr->domain_ << " is already in session option."; + } + } +} + ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Dnnl, _In_ OrtSessionOptions* options, int use_arena) { API_IMPL_BEGIN auto factory = onnxruntime::DnnlProviderFactoryCreator::Create(use_arena); @@ -1646,13 +1668,8 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtS options->provider_factories.push_back(factory); - std::vector custom_op_domains; std::string extra_plugin_lib_paths = onnxruntime::Env::Default().GetEnvironmentVar("trt_extra_plugin_lib_paths"); - onnxruntime::ProviderInfo_TensorRT& provider_info = onnxruntime::GetProviderInfo_TensorRT(); - provider_info.GetTensorRTCustomOpDomainList(custom_op_domains, extra_plugin_lib_paths); - for (auto ptr : custom_op_domains) { - options->custom_op_domains_.push_back(ptr); - } + AddTensorRTCustomOpDomainToSessionOption(options, extra_plugin_lib_paths); return nullptr; API_IMPL_END @@ -1679,12 +1696,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, _In 
options->provider_factories.push_back(factory); - std::vector custom_op_domains; - onnxruntime::ProviderInfo_TensorRT& provider_info = onnxruntime::GetProviderInfo_TensorRT(); - provider_info.GetTensorRTCustomOpDomainList(custom_op_domains, ""); - for (auto ptr : custom_op_domains) { - options->custom_op_domains_.push_back(ptr); - } + AddTensorRTCustomOpDomainToSessionOption(options, ""); return nullptr; API_IMPL_END @@ -1788,13 +1800,8 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT_V2, options->provider_factories.push_back(factory); - std::vector custom_op_domains; std::string extra_plugin_lib_paths = (tensorrt_options == nullptr || tensorrt_options->trt_extra_plugin_lib_paths == nullptr) ? "" : tensorrt_options->trt_extra_plugin_lib_paths; - onnxruntime::ProviderInfo_TensorRT& provider_info = onnxruntime::GetProviderInfo_TensorRT(); - provider_info.GetTensorRTCustomOpDomainList(custom_op_domains, extra_plugin_lib_paths); - for (auto ptr : custom_op_domains) { - options->custom_op_domains_.push_back(ptr); - } + AddTensorRTCustomOpDomainToSessionOption(options, extra_plugin_lib_paths); return nullptr; API_IMPL_END diff --git a/onnxruntime/python/onnxruntime_pybind_quant.cc b/onnxruntime/python/onnxruntime_pybind_quant.cc index 52ea677d5141d..04dfa9b51e112 100644 --- a/onnxruntime/python/onnxruntime_pybind_quant.cc +++ b/onnxruntime/python/onnxruntime_pybind_quant.cc @@ -6,6 +6,7 @@ #include #include "contrib_ops/cpu/quantization/dequantize_blockwise.h" +#include "contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h" #include "core/util/thread_utils.h" namespace pybind11 { @@ -64,9 +65,39 @@ void QuantizeMatMul4BitsBlockwise( tp.get()); } +template +void QuantizeMatMulBnb4Blockwise( + py::array_t dst, + py::array_t src, + py::array_t absmax, + int32_t block_size, + int32_t quant_type, + int32_t N, + int32_t K) { + OrtThreadPoolParams to; + auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, + concurrency::ThreadPoolType::INTRA_OP); + + py::buffer_info dst_buf = dst.request(); + py::buffer_info src_buf = src.request(); + py::buffer_info absmax_buf = absmax.request(); + + contrib::QuantizeBlockwiseBnb4( + static_cast(dst_buf.ptr), + static_cast(src_buf.ptr), + static_cast(absmax_buf.ptr), + block_size, + quant_type, + N, + K, + tp.get()); +} + void CreateQuantPybindModule(py::module& m) { m.def("quantize_matmul_4bits", &QuantizeMatMul4BitsBlockwise); m.def("quantize_matmul_4bits", &QuantizeMatMul4BitsBlockwise); + m.def("quantize_matmul_bnb4", &QuantizeMatMulBnb4Blockwise); + m.def("quantize_matmul_bnb4", &QuantizeMatMulBnb4Blockwise); } } // namespace python diff --git a/onnxruntime/python/onnxruntime_pybind_schema.cc b/onnxruntime/python/onnxruntime_pybind_schema.cc index a8c217b0ff1f6..3a977772873f3 100644 --- a/onnxruntime/python/onnxruntime_pybind_schema.cc +++ b/onnxruntime/python/onnxruntime_pybind_schema.cc @@ -59,7 +59,7 @@ void addGlobalSchemaFunctions(pybind11::module& m) { onnxruntime::ArmNNProviderFactoryCreator::Create(0), #endif #ifdef USE_DML - onnxruntime::DMLProviderFactoryCreator::Create(0, /*skip_software_device_check*/ true), + onnxruntime::DMLProviderFactoryCreator::Create(0, false, false, false), #endif #ifdef USE_NNAPI onnxruntime::NnapiProviderFactoryCreator::Create(0, std::optional()), diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 35e03bf9eacd5..a72f563601512 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ 
b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -433,6 +433,15 @@ const ROCMExecutionProviderInfo GetRocmExecutionProviderInfo(ProviderInfo_ROCM* #ifdef USE_TENSORRT void RegisterTensorRTPluginsAsCustomOps(PySessionOptions& so, const ProviderOptions& options) { if (auto* tensorrt_provider_info = TryGetProviderInfo_TensorRT()) { + auto is_already_in_domains = [&](std::string& domain_name, std::vector& domains) { + for (auto ptr : domains) { + if (domain_name == ptr->domain_) { + return true; + } + } + return false; + }; + std::string trt_extra_plugin_lib_paths = ""; const auto it = options.find("trt_extra_plugin_lib_paths"); if (it != options.end()) { @@ -441,7 +450,11 @@ void RegisterTensorRTPluginsAsCustomOps(PySessionOptions& so, const ProviderOpti std::vector domain_list; tensorrt_provider_info->GetTensorRTCustomOpDomainList(domain_list, trt_extra_plugin_lib_paths); for (auto ptr : domain_list) { - so.custom_op_domains_.push_back(ptr); + if (!is_already_in_domains(ptr->domain_, so.custom_op_domains_)) { + so.custom_op_domains_.push_back(ptr); + } else { + LOGS_DEFAULT(WARNING) << "The custom op domain name " << ptr->domain_ << " is already in session option."; + } } } else { ORT_THROW("Please install TensorRT libraries as mentioned in the GPU requirements page, make sure they're in the PATH or LD_LIBRARY_PATH, and that your GPU is supported."); diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/cuda/dequant_blockwise_bnb4.cu b/onnxruntime/python/tools/kernel_explorer/kernels/cuda/dequant_blockwise_bnb4.cu new file mode 100644 index 0000000000000..3504ce1bebe8c --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/cuda/dequant_blockwise_bnb4.cu @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This file serve as a simple example for adding a tunable op to onnxruntime. 
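The `quantize_matmul_bnb4` binding registered a few hunks above in onnxruntime_pybind_quant.cc can be exercised directly from Python. The sketch below mirrors `MatMulBnb4Quantizer.bnb4_block_quant` later in this diff; the matrix sizes are arbitrary:

```python
import numpy as np
from onnxruntime.capi._pybind_state import quantize_matmul_bnb4

rows, cols = 4096, 4096          # K, N of the original MatMul weight
block_size, quant_type = 64, 1   # 64-element blocks, NF4
w = np.random.rand(rows, cols).astype(np.float32)
w_t = w.transpose().copy()       # quantization runs over the transposed, flattened weight

numel = rows * cols
packed = np.zeros((numel + 1) // 2, dtype="uint8")                        # two 4-bit codes per byte
absmax = np.zeros((numel + block_size - 1) // block_size, dtype=w.dtype)  # one scale per block
quantize_matmul_bnb4(packed, w_t, absmax, block_size, quant_type, cols, rows)
```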
+ +#include +#include +#include + +#include + +#include "core/providers/cuda/tunable/cuda_tunable.h" +#include "python/tools/kernel_explorer/kernel_explorer_interface.h" +#include "python/tools/kernel_explorer/device_array.h" +#include "contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cuh" + +namespace py = pybind11; + +namespace onnxruntime { + +// Extend the OpParams so that all specializations have the same parameter passing interface +template +struct DequantizeBnb4Params : cuda::tunable::OpParams { + std::string Signature() const override { return std::to_string(n_); } + + int quant_type_; + T* output_; + const uint8_t* quant_; + const T* absmax_; + T* quant_map_buffer_; + int n_; + int k_; +}; + +template +class DequantizeBnb4 : public IKernelExplorer { + public: + DequantizeBnb4( + int quant_type, + DeviceArray& output, + DeviceArray& quant, + DeviceArray& absmax, + DeviceArray& quant_map_buffer, + int n, int k) { + params_.tuning_ctx = TuningContext(); + params_.stream = Stream(); + params_.quant_type_ = quant_type; + params_.output_ = static_cast(output.ptr()); + params_.quant_ = static_cast(quant.ptr()); + params_.absmax_ = static_cast(absmax.ptr()); + params_.quant_map_buffer_ = static_cast(quant_map_buffer.ptr()); + params_.n_ = n; + params_.k_ = k; + } + + void Run() override { + ORT_THROW_IF_ERROR(contrib::cuda::SetBnbQuantMap( + params_.quant_type_, + params_.quant_map_buffer_, + params_.StreamHandle())); + ORT_THROW_IF_ERROR(contrib::cuda::DequantizeBnb4( + params_.quant_map_buffer_, + params_.output_, + params_.quant_, + params_.absmax_, + 64, + params_.n_ * params_.k_, + params_.StreamHandle())); + } + + private: + // A VectorAddOp is a callable that can process const VectorAddParams* + using ParamsT = DequantizeBnb4Params; + ParamsT params_{}; +}; + +#define REGISTER_OP(name, type) \ + py::class_>(m, #name "_" #type) \ + .def(py::init()) \ + .def("SetRepeats", &name::SetRepeats) \ + .def("Profile", &name::Profile) \ + .def("Run", &name::Run); + +KE_REGISTER(m) { + REGISTER_OP(DequantizeBnb4, half); + REGISTER_OP(DequantizeBnb4, float); +} + +} // namespace onnxruntime diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/cuda/matmul_bnb4.cu b/onnxruntime/python/tools/kernel_explorer/kernels/cuda/matmul_bnb4.cu new file mode 100644 index 0000000000000..e4cd83565357a --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/cuda/matmul_bnb4.cu @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This file serve as a simple example for adding a tunable op to onnxruntime. 
+ +#include +#include +#include + +#include + +#include "core/providers/cuda/tunable/cuda_tunable.h" +#include "python/tools/kernel_explorer/kernel_explorer_interface.h" +#include "python/tools/kernel_explorer/kernels/vector_add_kernel.cuh" +#include "contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cuh" +#include "contrib_ops/cuda/quantization/matmul_bnb4.cuh" + +namespace py = pybind11; + +namespace onnxruntime { + +// Extend the OpParams so that all specializations have the same parameter passing interface +template +struct MatrixFloatBnb4Params : cuda::tunable::OpParams { + std::string Signature() const override { return std::to_string(n_); } + + int quant_type_; + T* output_; + const T* a_; + const uint8_t* b_; + const T* absmax_; + T* quant_map_buffer_; + int m_; + int n_; + int k_; +}; + +template +class MatrixFloatBnb4 : public IKernelExplorer { + public: + MatrixFloatBnb4(DeviceArray& output, + DeviceArray& a, + DeviceArray& b, + DeviceArray& absmax, + DeviceArray& quant_map_buffer, + int quant_type, int m, int n, int k) { + params_.tuning_ctx = TuningContext(); + params_.stream = Stream(); + params_.output_ = static_cast(output.ptr()); + params_.a_ = static_cast(a.ptr()); + params_.b_ = static_cast(b.ptr()); + params_.absmax_ = static_cast(absmax.ptr()); + params_.quant_map_buffer_ = static_cast(quant_map_buffer.ptr()); + params_.quant_type_ = quant_type; + params_.m_ = m; + params_.n_ = n; + params_.k_ = k; + } + + void Run() override { + ORT_THROW_IF_ERROR(contrib::cuda::SetBnbQuantMap( + params_.quant_type_, + params_.quant_map_buffer_, + params_.StreamHandle())); + contrib::cuda::TryMatMulBnb4( + params_.quant_map_buffer_, + params_.output_, + params_.a_, + params_.b_, + params_.absmax_, + params_.m_, + params_.n_, + params_.k_, + 64, + params_.StreamHandle()); + } + + private: + // A VectorAddOp is a callable that can process const VectorAddParams* + using ParamsT = MatrixFloatBnb4Params; + ParamsT params_{}; +}; + +#define REGISTER_OP(name, type) \ + py::class_>(m, #name "_" #type) \ + .def(py::init()) \ + .def("SetRepeats", &name::SetRepeats) \ + .def("Profile", &name::Profile) \ + .def("Run", &name::Run); + +KE_REGISTER(m) { + REGISTER_OP(MatrixFloatBnb4, half); + REGISTER_OP(MatrixFloatBnb4, float); +} + +} // namespace onnxruntime diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/dequantize_blockwise_bnb4.py b/onnxruntime/python/tools/kernel_explorer/kernels/dequantize_blockwise_bnb4.py new file mode 100644 index 0000000000000..140151aadcc0f --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/dequantize_blockwise_bnb4.py @@ -0,0 +1,92 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +import sys +from dataclasses import dataclass + +import kernel_explorer as ke +import numpy as np +from utils import dtype_to_bytes + + +def dtype_to_funcs(dtype): + type_map = { + "float16": list(filter(lambda x: "DequantizeBnb4_half" in x, dir(ke))), + "float32": list(filter(lambda x: "DequantizeBnb4_float" in x, dir(ke))), + } + return type_map[dtype] + + +quant_enums = {"FP4": 0, "NF4": 1} + + +dtypes = ["float16", "float32"] +quant_types = ["FP4", "NF4"] + + +@dataclass +class DequantizeBnb4Metric(ke.BandwidthMetric): + quant_type: str + n: int + k: int + + def report(self): + return ( + f"{self.duration:6.2f} us {self.gbps:5.2f} GB/s" + f" {self.quant_type} {self.dtype} n={self.n} k={self.k} {self.name}" + ) + + +def profile_dequantize_int4_func(qt, n, k, dtype, func): + np.random.seed(0) + block_size = 64 + numel = n * k + output = np.random.rand(n, k).astype(dtype) + quant = np.random.randint(low=0, high=255, size=(numel + 1) // 2).astype("uint8") + absmax = np.random.rand((numel + block_size - 1) // block_size).astype(dtype) + quant_map_buffer = np.zeros(16).astype(dtype) + + output_d = ke.DeviceArray(output) + quant_d = ke.DeviceArray(quant) + absmax_d = ke.DeviceArray(absmax) + quant_map_buffer_d = ke.DeviceArray(quant_map_buffer) + f = getattr(ke, func) + my_op = f(quant_enums[qt], output_d, quant_d, absmax_d, quant_map_buffer_d, n, k) + duration_ms = my_op.Profile() + total_bytes = numel / 2 + (numel + numel / block_size) * dtype_to_bytes(dtype) + + ke.report(DequantizeBnb4Metric(func, dtype, duration_ms, total_bytes, qt, n, k)) + + +def profile_with_args(qt, n, k, dtype, sort): + with ke.benchmark(sort): + for func in dtype_to_funcs(dtype): + profile_dequantize_int4_func(qt, n, k, dtype, func) + + +def profile(): + for qt in quant_types: + for dt in dtypes: + for n, k in ((4096, 4096), (4096, 12288), (12288, 4096)): + profile_with_args(qt, n, k, dt, True) + print() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + group = parser.add_argument_group("profile with args") + group.add_argument("n", type=int) + group.add_argument("k", type=int) + group.add_argument("quant_type", choices=quant_types) + group.add_argument("dtype", choices=dtypes) + group.add_argument("--sort", action="store_true") + + if len(sys.argv) == 1: + profile() + else: + args = parser.parse_args() + profile_with_args(args.quant_type, args.n, args.k, args.dtype, args.sort) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/matmul_bnb4.py b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_bnb4.py new file mode 100644 index 0000000000000..4a9489050fd61 --- /dev/null +++ b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_bnb4.py @@ -0,0 +1,136 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
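For reference, the bandwidth figure reported by `profile_dequantize_int4_func` in the script that ends just above counts the packed weights and per-block scales read plus the dequantized output written. A worked example for n = k = 4096, float16, block_size = 64:

```python
# Same accounting as `total_bytes` in profile_dequantize_int4_func above.
numel = 4096 * 4096                      # 16,777,216 quantized weights
packed_bytes = numel / 2                 # 8,388,608  bytes: two 4-bit codes per byte
fp16_bytes = (numel + numel / 64) * 2    # 34,078,720 bytes: dequantized output + absmax scales
total_bytes = packed_bytes + fp16_bytes  # 42,467,328 bytes, roughly 42.5 MB moved per kernel call
```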
+# -------------------------------------------------------------------------- + +import sys +from dataclasses import dataclass + +import kernel_explorer as ke +import numpy as np +from utils import dtype_to_bytes + + +def dtype_to_funcs(dtype): + type_map = { + "float16": list(filter(lambda x: "MatrixFloatBnb4_half" in x, dir(ke))), + "float32": list(filter(lambda x: "MatrixFloatBnb4_float" in x, dir(ke))), + } + return type_map[dtype] + + +def dtype_to_funcs_cublas(dtype): + type_map = { + "float16": list(filter(lambda x: "GemmBenchmark_half" in x, dir(ke))), + "float32": list(filter(lambda x: "GemmBenchmark_float" in x, dir(ke))), + } + return type_map[dtype] + + +quant_enums = {"FP4": 0, "NF4": 1} + + +dtypes = ["float16", "float32"] +quant_types = ["FP4", "NF4"] + + +@dataclass +class MatrixMulMetric(ke.BandwidthMetric): + m: int + n: int + k: int + + def report(self): + return ( + f"{self.duration:6.2f} us {self.gbps:5.2f} GB/s {self.dtype} m={self.m} n={self.n} k={self.k} {self.name}" + ) + + +@dataclass +class MatrixFpBnb4Metric(MatrixMulMetric): + quant_type: str + + def report(self): + return ( + f"{self.duration:6.2f} us {self.gbps:5.2f} GB/s" + f" {self.quant_type} {self.dtype} m={self.m} n={self.n} k={self.k} {self.name}" + ) + + +def profile_matmul_fp_bnb4_func(qt, m, n, k, dtype, func): + np.random.seed(0) + block_size = 64 + numel = n * k + output = np.random.rand(m, n).astype(dtype) + a = np.random.rand(m, k).astype(dtype) + b = np.random.randint(low=0, high=255, size=(numel + 1) // 2).astype("uint8") + absmax = np.random.rand((numel + block_size - 1) // block_size).astype(dtype) + quant_map_buffer = np.zeros(16).astype(dtype) + + output_d = ke.DeviceArray(output) + a_d = ke.DeviceArray(a) + b_d = ke.DeviceArray(b) + absmax_d = ke.DeviceArray(absmax) + quant_map_buffer_d = ke.DeviceArray(quant_map_buffer) + f = getattr(ke, func) + + my_op = f(output_d, a_d, b_d, absmax_d, quant_map_buffer_d, quant_enums[qt], m, n, k) + duration_ms = my_op.Profile() + total_bytes = (m * k + n * k + m * n) * (dtype_to_bytes(dtype)) + + ke.report(MatrixFpBnb4Metric(func, dtype, duration_ms, total_bytes, m, n, k, qt)) + + +def profile_gemm_func(m, n, k, dtype, func): + np.random.seed(0) + output = np.random.rand(m, n).astype(dtype) + a = np.random.rand(m, k).astype(dtype) + b = np.random.rand(k, n).astype(dtype) + + output_d = ke.DeviceArray(output) + a_d = ke.DeviceArray(a) + b_d = ke.DeviceArray(b) + f = getattr(ke, func) + my_op = f(output_d, a_d, b_d, m, n, k) + duration_ms = my_op.Profile() + total_bytes = (m * k + n * k + m * n) * (dtype_to_bytes(dtype)) + + ke.report(MatrixMulMetric(func, dtype, duration_ms, total_bytes, m, n, k)) + + +def profile_with_args(qt, m, n, k, dtype, sort): + with ke.benchmark(sort): + for func in dtype_to_funcs(dtype): + profile_matmul_fp_bnb4_func(qt, m, n, k, dtype, func) + + for func in dtype_to_funcs_cublas(dtype): + profile_gemm_func(m, n, k, dtype, func) + + +def profile(): + dims_m = [1] + for qt in quant_types: + for dt in dtypes: + for m in dims_m: + for n, k in ((4096, 4096), (4096, 12288), (12288, 4096)): + profile_with_args(qt, m, n, k, dt, False) + print() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + group = parser.add_argument_group("profile with args") + group.add_argument("m", type=int) + group.add_argument("n", type=int) + group.add_argument("k", type=int) + group.add_argument("quant_type", choices=quant_types) + group.add_argument("dtype", choices=dtypes) + group.add_argument("--sort", 
action="store_true") + + if len(sys.argv) == 1: + profile() + else: + args = parser.parse_args() + profile_with_args(args.quant_type, args.m, args.n, args.k, args.dtype, args.sort) diff --git a/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py b/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py new file mode 100644 index 0000000000000..951746a089305 --- /dev/null +++ b/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py @@ -0,0 +1,240 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +import argparse +import logging +import os +from typing import List, Tuple + +import numpy as np +import numpy.typing as npt +import onnx +from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto + +from onnxruntime.capi._pybind_state import quantize_matmul_bnb4 + +from .onnx_model import ONNXModel +from .quant_utils import attribute_to_kwarg + +logger = logging.getLogger(__name__) + + +class MatMulBnb4Quantizer: + """Perform 4b quantization of constant MatMul weights using FP4 or NF4 data type""" + + ################## + # quantization types, must be consistent with native code type + # Bnb_DataType_t defined in blockwise_quant_block_bnb4.h + + # 4b floating point with bias of 3 + FP4 = 0 + + # 4b NormalFloat + NF4 = 1 + + def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to_exclude=None): + nodes_to_exclude = nodes_to_exclude or [] + assert quant_type in [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4] + self.model = ONNXModel(model) + self.quant_type = quant_type + self.block_size = block_size + self.nodes_to_exclude = set(nodes_to_exclude) + + @staticmethod + def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]: + for gid in range(len(graph_path) - 1, -1, -1): + graph = graph_path[gid] + for tensor in graph.initializer: + if tensor.name == name: + return tensor, graph + return None, None + + def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray: + """4b quantize fp32/fp16 weight""" + + if len(fpweight.shape) != 2: + raise ValueError("Current bnb4 block quantization only supports 2D tensors!") + # need to copy since the transposed weight still has the original memory layout + # Linear4bit quantizes its weight data which is the transposed weight + fpweight_t = fpweight.transpose().copy() + + rows, cols = fpweight.shape + numel = rows * cols + block_size = self.block_size + num_blocks = (numel + block_size - 1) // block_size + quantized_numel = (numel + 1) // 2 + + packed = np.zeros(quantized_numel, dtype="uint8") + absmax = np.zeros(num_blocks, dtype=fpweight.dtype) + # block wise quantization, fpweight_t is flattened and divided into blocks + quantize_matmul_bnb4(packed, fpweight_t, absmax, block_size, self.quant_type, cols, rows) + + return (packed, absmax) + + def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto: + """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + + if node.op_type != "MatMul": + return node # only care about MatMul for now + + logger.debug(f"start to quantize {node.name} ...") + if node.name in self.nodes_to_exclude: + logger.debug(f"exclude to quantize {node.name} as specified by 
nodes_to_exclude...") + return node + + inputB = node.input[1] # noqa: N806 + B, Bs_graph = MatMulBnb4Quantizer.__get_initializer(inputB, graph_stack) # noqa: N806 + if B is None: + logger.debug("MatMul doesn't have const weight. Skip to quantize") + return node # only care about constant weight + + B_array = onnx.numpy_helper.to_array(B) # noqa: N806 + if len(B_array.shape) != 2: + logger.debug("MatMul weight is not 2D. Skip to quantize") + return node # can only process 2-D matrix + + packed, absmax = self.bnb4_block_quant(B_array) + B_quant = onnx.numpy_helper.from_array(packed) # noqa: N806 + B_quant.name = B.name + "_Bnb4" + for input in Bs_graph.input: + if input.name == inputB: + Bs_graph.input.remove(input) + break + + absmax_tensor = onnx.numpy_helper.from_array(absmax) + absmax_tensor.name = B.name + "_absmax" + + Bs_graph.initializer.extend([B_quant, absmax_tensor]) + + kwargs = {} + rows, cols = B_array.shape + kwargs["K"] = rows + kwargs["N"] = cols + kwargs["block_size"] = self.block_size + kwargs["quant_type"] = self.quant_type + + matmul_bnb4_node = onnx.helper.make_node( + "MatMulBnb4", + inputs=[node.input[0], B_quant.name, absmax_tensor.name], + outputs=[node.output[0]], + name=node.name + "_Bnb4" if node.name else "", + domain="com.microsoft", + **kwargs, + ) + + logger.debug(f"complete quantization of {node.name} ...") + + return matmul_bnb4_node + + def _process_subgraph(self, graph_stack: List[GraphProto]): + new_nodes = [] + graph = graph_stack[-1] + + for node in graph.node: + graph_attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(graph_attrs): + kwargs = {} + for attr in node.attribute: + if attr.type == onnx.AttributeProto.GRAPH: + # recursive call to take care of sub-graph + graph_stack.append(attr.g) + kv = {attr.name: self._process_subgraph(graph_stack)} + elif attr.type == onnx.AttributeProto.GRAPHS: + value = [] + for subgraph in attr.graphs: + # recursive call to take care of sub-graph + graph_stack.append(subgraph) + value.extend([self._process_subgraph(graph_stack)]) + kv = {attr.name: value} + else: + kv = attribute_to_kwarg(attr) + kwargs.update(kv) + node = onnx.helper.make_node( # noqa: PLW2901 + node.op_type, node.input, node.output, name=node.name, **kwargs + ) + + new_nodes.append(self._bnb4_matmul_node_weight(node, graph_stack)) + + graph.ClearField("node") + graph.node.extend(new_nodes) + graph_stack.pop() + return graph + + def process(self): + # use a stack to keep track of sub-graphs + graph_stack = [self.model.graph()] + opset_import = self.model.opset_import() + + has_ms_domain = False + for opset in opset_import: + if opset.domain == "com.microsoft": + has_ms_domain = True + if not has_ms_domain: + opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) + + self._process_subgraph(graph_stack) + self.model.clean_initializers() + + +def parse_args(): + parser = argparse.ArgumentParser( + description="""Blockwise FP4/NF4 quantization for MatMul 2D weight matrices. + +A weight matrix is partitioned into blocks, where each block is a contiguous +subset inside the flattened transposed weight matrix. Each block is quantized +into a set of 4b integers with an absolute value scaling factor. 
+""" + ) + + parser.add_argument("--input_model", required=True, help="Path to the input model file") + parser.add_argument("--output_model", required=True, help="Path to the output model file") + parser.add_argument( + "--quant_type", + required=False, + type=int, + default=1, + choices=[MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4], + help="Quantization data type. 0: FP4, 1: NF4", + ) + parser.add_argument( + "--block_size", + required=False, + type=int, + default=64, + help="Block size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64", + ) + parser.add_argument("-v", "--verbose", required=False, action="store_true") + parser.set_defaults(verbose=False) + parser.add_argument( + "--nodes_to_exclude", + nargs="+", + type=str, + required=False, + default=[], + help="Specify the nodes to be excluded from quantization with node names", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + if args.verbose: + logger.setLevel(logging.DEBUG) + + input_model_path = args.input_model + output_model_path = args.output_model + + if os.path.exists(output_model_path): + logger.error(f"file {output_model_path} already exists") + raise Exception(f"file {output_model_path} already exists") + + model = onnx.load(input_model_path) + quant = MatMulBnb4Quantizer(model, args.quant_type, args.block_size, nodes_to_exclude=args.nodes_to_exclude) + quant.process() + quant.model.save_model_to_file(output_model_path, True) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 6d954bd540718..272727a9f5375 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -206,9 +206,11 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "PackedAttention": self._infer_PackedAttention, "PackedMultiHeadAttention": self._infer_PackedMultiHeadAttention, "PythonOp": self._infer_PythonOp, + "QuickGelu": self._infer_FastGelu, "RelativePositionBias": self._infer_RelativePositionBias, "RemovePadding": self._infer_RemovePadding, "RestorePadding": self._infer_RestorePadding, + "RotaryEmbedding": self._infer_RotaryEmbedding, "SimplifiedLayerNormalization": self._infer_LayerNormalization, "SkipLayerNormalization": self._infer_SkipLayerNormalization, "SkipSimplifiedLayerNormalization": self._infer_SkipLayerNormalization, @@ -230,7 +232,6 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "upsample_nearest1d": self._infer_aten_upsample, "upsample_nearest2d": self._infer_aten_upsample, "upsample_nearest3d": self._infer_aten_upsample, - "upsample_bilinear2d": self._infer_aten_upsample, } self.run_ = True self.suggested_merge_ = {} @@ -463,6 +464,8 @@ def _onnx_infer_single_node(self, node): "BiasSplitGelu", "BiasAdd", "NhwcConv", + "QuickGelu", + "RotaryEmbedding", ] if not skip_infer: @@ -2308,6 +2311,9 @@ def _infer_FastGelu(self, node): # noqa: N802 def _infer_Gelu(self, node): # noqa: N802 self._propagate_shape_and_type(node) + def _infer_QuickGelu(self, node): # noqa: N802 + self._propagate_shape_and_type(node) + def _infer_GemmFastGelu(self, node): # noqa: N802 self._compute_matmul_shape(node) @@ -2379,6 +2385,19 @@ def _infer_BiasSplitGelu(self, node): # noqa: N802 def _infer_BiasAdd(self, node): # noqa: N802 self._propagate_shape_and_type(node) + def _infer_RotaryEmbedding(self, node): # noqa: N802 + if len(node.output) == 1: + self._propagate_shape_and_type(node) + elif len(node.output) == 2: + # 
Extraneous constant nodes outputted by RotaryEmbedding function made with `export_modules_as_functions` + self._propagate_shape_and_type(node, input_index=1, output_index=0) + self._propagate_shape_and_type(node, input_index=0, output_index=1) # true output + elif len(node.output) == 3: + # Extraneous constant nodes outputted by RotaryEmbedding function made with `export_modules_as_functions` + self._propagate_shape_and_type(node, input_index=1, output_index=0) + self._propagate_shape_and_type(node, input_index=1, output_index=1) + self._propagate_shape_and_type(node, input_index=0, output_index=2) # true output + def _infer_PythonOp(self, node): # noqa: N802 output_tensor_types = get_attribute(node, "output_tensor_types") assert output_tensor_types @@ -2584,12 +2603,19 @@ def get_prereq(node): self._check_merged_dims(in_dims, allow_broadcast=True) for i_o in range(len(node.output)): - # Special case: We do not care about the training related - # outputs of SkipLayerNormalization + # Special cases: + # 1) We do not care about the training related outputs of SkipLayerNormalization + # 2) We do not care about the extraneous constant outputs in RotaryEmbedding because + # the RotaryEmbedding op created during export can be replaced by the RotaryEmbedding + # contrib op if ( node.op_type == "SkipLayerNormalization" or node.op_type == "SkipSimplifiedLayerNormalization" ) and i_o in [1, 2]: continue + if node.op_type == "RotaryEmbedding" and len(node.output) > 1: + # Skip symbolic shape inference for RotaryEmbedding functions that have extraneous outputs + # generated by `export_modules_as_functions` + continue vi = self.known_vi_[node.output[i_o]] out_type = vi.type @@ -2751,13 +2777,13 @@ def get_prereq(node): if i in self.known_vi_: logger.debug(self.known_vi_[i]) else: - logger.debug(f"not in knwon_vi_ for {i}") + logger.debug(f"not in known_vi_ for {i}") logger.debug("node outputs:") for o in node.output: if o in self.known_vi_: logger.debug(self.known_vi_[o]) else: - logger.debug(f"not in knwon_vi_ for {o}") + logger.debug(f"not in known_vi_ for {o}") if self.auto_merge_ and not out_type_undefined: logger.debug("Merging: " + str(self.suggested_merge_)) return False diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index 4f898245d01bd..b6f7a44450c62 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -33,6 +33,7 @@ class Precision(Enum): FLOAT32 = "fp32" FLOAT16 = "fp16" INT8 = "int8" + INT4 = "int4" def __str__(self): return self.value @@ -610,7 +611,7 @@ def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None): return memory_before_test with ThreadPoolExecutor() as executor: - monitor = MemoryMonitor() + monitor = memory_monitor_type() mem_thread = executor.submit(monitor.measure_cpu_usage) try: fn_thread = executor.submit(func) diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index c1c709d6d759b..4228c892d03ae 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -1272,6 +1272,38 @@ def find_past_seq_len_usage(subg: GraphProto): return tensor_names_to_rename, nodes_to_remove +def replace_mha_with_gqa(model: OnnxModel, past_seq_len_input: str, kv_num_heads: int = 0): + past_seq_len = past_seq_len_input + if past_seq_len not in 
model.get_graphs_input_names(): + # Replace model input for past sequence length + new_input = onnx.helper.make_tensor_value_info(past_seq_len, onnx.TensorProto.INT64, shape=[1]) + model.model.graph.input.append(new_input) + + # Replace MultiHeadAttention with GroupQueryAttention + for node in model.model.graph.node: + if node.op_type == "MultiHeadAttention": + gqa_node = onnx.helper.make_node( + "GroupQueryAttention", + inputs=[ + node.input[0], # query + node.input[1], # key + node.input[2], # value + node.input[6], # past_key + node.input[7], # past_value + past_seq_len, # past_sequence_length + ], + outputs=node.output, + name=node.name.replace("MultiHeadAttention", "GroupQueryAttention"), + domain="com.microsoft", + num_heads=node.attribute[0].i, + kv_num_heads=node.attribute[0].i if kv_num_heads == 0 else kv_num_heads, + is_past_bsnh=0, + ) + model.model.graph.node.remove(node) + model.model.graph.node.extend([gqa_node]) + return model + + def update_decoder_subgraph_output_cross_attention(subg: GraphProto): input_self_past_0 = 1 # w/wo attention mask, w/wo hidden_state diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 1dbdf39613cdd..c1b241aa1a5ec 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -111,7 +111,7 @@ def __init__( model: OnnxModel, hidden_size: int, num_heads: int, - attention_mask: AttentionMask, + attention_mask: Optional[AttentionMask] = None, use_multi_head_attention: bool = False, disable_multi_head_attention_bias: bool = False, search_op_types: List[str] = ["SkipLayerNormalization", "LayerNormalization"], # noqa: B006 @@ -120,7 +120,7 @@ def __init__( super().__init__(model, attention_op_name, search_op_types) self.hidden_size = hidden_size self.num_heads = num_heads - self.attention_mask = attention_mask + self.attention_mask = attention_mask if attention_mask else AttentionMask(model) self.use_multi_head_attention = use_multi_head_attention self.disable_multi_head_attention_bias = disable_multi_head_attention_bias self.mask_filter_value = None @@ -219,6 +219,31 @@ def get_add_qk_str(self, add_qk: NodeProto): return add_qk.input[1] + def reshape_add_qk(self, add_qk: str): + # Convert 4D mask from (B,1,S,T) to (B,N,S,T) + # B = batch size, N = num heads, S = source sequence length, T = target sequence length + mask_output_name = add_qk + "_mask" + + # Check if concat node for (B,1,S,T) --> (B,N,S,T) already exists + concat_node = list(filter(lambda node: node.output[0] == mask_output_name, self.nodes_to_add)) + if len(concat_node) == 1: + return mask_output_name + + assert len(concat_node) == 0 + concat_node_name = self.model.create_node_name("Concat") + concat_add_qk_fp32 = helper.make_node( + "Concat", + inputs=[add_qk for _ in range(self.num_heads)], + outputs=[mask_output_name], + name=concat_node_name, + axis=1, + ) + # Add new node to graph + self.nodes_to_add.append(concat_add_qk_fp32) + self.node_name_to_graph_name[concat_node_name] = self.this_graph_name + + return mask_output_name + def concat_kv(self, past_k: str, past_v: str) -> str: """Concatenate past_k and past_v inputs to create past_kv input. 
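To make the mask expansion in `reshape_add_qk` above concrete, here is a small standalone NumPy sketch (illustrative only, not part of this patch; variable names and sizes are made up): the inserted Concat node receives the same (B,1,S,T) additive mask `num_heads` times on axis 1, which is numerically identical to broadcasting the mask across the head dimension.

```python
import numpy as np

# Hypothetical sizes for illustration only
batch, num_heads, src_len, tgt_len = 2, 4, 3, 5
mask_b1st = np.random.randn(batch, 1, src_len, tgt_len).astype(np.float32)

# What the inserted Concat node (inputs = [mask] * num_heads, axis=1) computes at runtime
mask_bnst = np.concatenate([mask_b1st] * num_heads, axis=1)

assert mask_bnst.shape == (batch, num_heads, src_len, tgt_len)
# Same values as a plain broadcast of the (B,1,S,T) mask across the head axis
assert np.array_equal(mask_bnst, np.broadcast_to(mask_b1st, mask_bnst.shape))
```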
@@ -875,21 +900,8 @@ def create_attention_node( past_kv = self.concat_kv(past_k, past_v) attention_inputs.append(past_kv) - if add_qk_str: - # Convert 4d mask from (B,1,M,M) to (B,N,M,M) - # B = batch size, M = max sequence length, N = num heads - concat_node_name = self.model.create_node_name("Concat") - mask_output_name = add_qk_str + "_mask" - concat_add_qk_fp32 = helper.make_node( - "Concat", - inputs=[add_qk_str for _ in range(num_heads)], - outputs=[mask_output_name], - name=concat_node_name, - axis=1, - ) - # Add new nodes to graph - self.nodes_to_add.append(concat_add_qk_fp32) - self.node_name_to_graph_name[concat_node_name] = self.this_graph_name + if add_qk_str is not None: + mask_output_name = self.reshape_add_qk(add_qk_str) # Add attention mask to attention node if not past_exists: diff --git a/onnxruntime/python/tools/transformers/fusion_base.py b/onnxruntime/python/tools/transformers/fusion_base.py index 117468be412fa..c5d7bc16d64f7 100644 --- a/onnxruntime/python/tools/transformers/fusion_base.py +++ b/onnxruntime/python/tools/transformers/fusion_base.py @@ -113,3 +113,20 @@ def add_initializer(self, name: str, data_type: int, dims: Sequence[int], vals: self.model.add_initializer(tensor, self.this_graph_name) return tensor + + def add_nodes_to_remove(self, nodes: List[NodeProto]): + # Some nodes are shared between paths (e.g. rotary embedding nodes in the Q and K paths). + # When path A is fused, its shared nodes are added to `self.nodes_to_remove`. But when path B + # is fused, its shared nodes are also added to `self.nodes_to_remove`. When the nodes are + # iteratively removed from `self.nodes_to_remove`, path A's shared nodes are removed first. + # Since path A's shared nodes are removed, path B's shared nodes are not removed because they + # were previously removed for path A. This causes an error to print in remove_node that a node + # has failed to be removed. + # + # To avoid this error, we pre-emptively check if the shared nodes are already in `self.nodes_to_remove`. + # We could alternatively convert `self.nodes_to_remove` to a set to avoid this issue, but there could + # be scenarios where the nodes need to be removed in a specific order and converting to a set would + # lose this order. + for node in nodes: + if node not in self.nodes_to_remove: + self.nodes_to_remove.append(node) diff --git a/onnxruntime/python/tools/transformers/fusion_options.py b/onnxruntime/python/tools/transformers/fusion_options.py index 69b5cd26f4525..8c80fcad0ab49 100644 --- a/onnxruntime/python/tools/transformers/fusion_options.py +++ b/onnxruntime/python/tools/transformers/fusion_options.py @@ -26,6 +26,7 @@ def __init__(self, model_type): self.enable_gelu = True self.enable_layer_norm = True self.enable_attention = True + self.enable_rotary_embeddings = True # Use MultiHeadAttention instead of Attention operator. The difference: # (1) Attention has merged weights for Q/K/V projection, which might be faster in some cases since 3 MatMul is @@ -81,6 +82,8 @@ def parse(args): options.enable_gelu = False if args.disable_layer_norm: options.enable_layer_norm = False + if args.disable_rotary_embeddings: + options.enable_rotary_embeddings = False if args.disable_attention: options.enable_attention = False if args.use_multi_head_attention: @@ -294,3 +297,10 @@ def add_arguments(parser: ArgumentParser): help="Use channels_first (NCHW) instead of channels_last (NHWC) for GroupNorm. 
Only works for model_type=unet or vae", ) parser.set_defaults(use_group_norm_channels_first=False) + + parser.add_argument( + "--disable_rotary_embeddings", + required=False, + action="store_true", + help="Do not fuse rotary embeddings into RotaryEmbedding op", + ) diff --git a/onnxruntime/python/tools/transformers/fusion_rotary_attention.py b/onnxruntime/python/tools/transformers/fusion_rotary_attention.py new file mode 100644 index 0000000000000..3c5029ac5752f --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_rotary_attention.py @@ -0,0 +1,1044 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import logging +from typing import Optional, Union + +from fusion_attention import FusionAttention +from fusion_base import Fusion +from onnx import FunctionProto, NodeProto, TensorProto, helper, numpy_helper +from onnx_model import OnnxModel + +logger = logging.getLogger(__name__) + + +class FusionRotaryAttention(FusionAttention): + """ + Fuse Attention subgraph with rotary positional embeddings into one MultiHeadAttention node. + """ + + def __init__( + self, + model: OnnxModel, + hidden_size: int, + num_heads: int, + ): + super().__init__( + model, + hidden_size, + num_heads, + use_multi_head_attention=True, + search_op_types=["SimplifiedLayerNormalization", "SkipSimplifiedLayerNormalization", "Add"], + ) + + def create_mha_node( + self, + input: str, + output: str, + q_rotary: NodeProto, + k_rotary: NodeProto, + v_matmul: NodeProto, + attn_mask: str = "", + add_qk: str = "", + past_k: str = "", + past_v: str = "", + present_k: str = "", + present_v: str = "", + scale: Optional[float] = None, + ) -> Union[NodeProto, None]: + assert self.num_heads > 0 + + if self.hidden_size > 0 and (self.hidden_size % self.num_heads) != 0: + logger.debug( + f"fuse_rotary_attention: input hidden size {self.hidden_size} is not a multiple of num of heads {self.num_heads}" + ) + return None + + mha_node_name = self.model.create_node_name("MultiHeadAttention") + mha_inputs = [ + q_rotary.output[0], + k_rotary.output[0], + v_matmul.output[0], + "", # bias + attn_mask, # key_padding_mask + add_qk, # relative_position_bias + past_k, + past_v, + ] + + mha_outputs = [output] + if present_k and present_v: + mha_outputs.extend([present_k, present_v]) + + mha_node = helper.make_node( + "MultiHeadAttention", + inputs=mha_inputs, + outputs=mha_outputs, + name=mha_node_name, + ) + + mha_node.domain = "com.microsoft" + mha_node.attribute.extend([helper.make_attribute("num_heads", self.num_heads)]) + if scale is not None: + mha_node.attribute.extend([helper.make_attribute("scale", scale)]) + if self.mask_filter_value is not None: + mha_node.attribute.extend([helper.make_attribute("mask_filter_value", float(self.mask_filter_value))]) + + self.increase_counter("MultiHeadAttention") + return mha_node + + def check_runtime_shape_paths_for_function( + self, + reshape_qkv_2, # Reshape after Transpose + reshape_qkv_1, # Reshape before Transpose + reshape_q_2, # Reshape after RotaryEmbedding + reshape_k_2, # Reshape after RotaryEmbedding + reshape_v_2, # Reshape after Transpose + reshape_v_1, # Reshape before Transpose + add_qk, # Add before Softmax + root_input, # Root input to attention subgraph + ): + # Check #1: check paths for qkv nodes + concat_qkv_2_path = self.model.match_parent_path(reshape_qkv_2, ["Concat"], 
[1]) + concat_qkv_1_path = self.model.match_parent_path(reshape_qkv_1, ["Concat"], [1]) + if concat_qkv_2_path is None or concat_qkv_1_path is None: + return False + concat_qkv_2, concat_qkv_1 = concat_qkv_2_path[0], concat_qkv_1_path[0] + + reshape_qkv_2_path_1 = self.model.match_parent_path(concat_qkv_2, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) + reshape_qkv_2_path_2 = self.model.match_parent_path(concat_qkv_2, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) + reshape_qkv_1_path_1 = self.model.match_parent_path(concat_qkv_1, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) + reshape_qkv_1_path_2 = self.model.match_parent_path(concat_qkv_1, ["Unsqueeze", "Gather", "Shape"], [2, 0, 0]) + if ( + reshape_qkv_2_path_1 is None + or reshape_qkv_2_path_2 is None + or reshape_qkv_1_path_1 is None + or reshape_qkv_1_path_2 is None + ): + return False + + _, gather_1, shape_1 = reshape_qkv_2_path_1 + _, gather_2, shape_2 = reshape_qkv_2_path_2 + + # Check root_input --> Shape --> Gather connection + if shape_1.input[0] != root_input or shape_2.input[0] != root_input: + return False + + # Check Gather --> Unsqueeze --> Concat --> Reshape connection for reshape_qkv_1_path_1 and reshape_qkv_1_path_2 + if reshape_qkv_1_path_1[1].name != gather_1.name or reshape_qkv_1_path_2[1].name != gather_2.name: + return False + + # Check #2: check paths for v nodes + concat_v_2_path = self.model.match_parent_path(reshape_v_2, ["Concat"], [1]) + concat_v_1_path = self.model.match_parent_path(reshape_v_1, ["Concat"], [1]) + if concat_v_2_path is None or concat_v_1_path is None: + return False + concat_v_2, concat_v_1 = concat_v_2_path[0], concat_v_1_path[0] + + reshape_v_2_path_1 = self.model.match_parent_path( + concat_v_2, ["Unsqueeze", "Mul", "Gather", "Shape"], [0, 0, 0, 0] + ) + reshape_v_2_path_2 = self.model.match_parent_path( + concat_v_2, ["Unsqueeze", "Add", "Gather", "Shape"], [1, 0, 0, 0] + ) + reshape_v_1_path_1 = self.model.match_parent_path(concat_v_1, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) + reshape_v_1_path_2 = self.model.match_parent_path(concat_v_1, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) + if ( + reshape_v_2_path_1 is None + or reshape_v_2_path_2 is None + or reshape_v_1_path_1 is None + or reshape_v_1_path_2 is None + ): + return False + + # Check Gather --> Mul --> Unsqueeze --> Concat --> Reshape connection for reshape_v_2_path_1 + # Check Gather --> Add --> Unsqueeze --> Concat --> Reshape connection for reshape_v_2_path_2 + # Check Gather --> Unsqueeze --> Concat --> Reshape connection for reshape_v_1_path_1 and reshape_v_1_path_2 + if ( + reshape_v_2_path_1[2].name != gather_1.name + or reshape_v_2_path_2[2].name != gather_2.name + or reshape_v_1_path_1[1].name != gather_1.name + or reshape_v_1_path_2[1].name != gather_2.name + ): + return False + + # Check #3: check paths for k nodes + concat_k_2_path = self.model.match_parent_path(reshape_k_2, ["Concat"], [1]) + if concat_k_2_path is None: + return False + concat_k_2 = concat_k_2_path[0] + + reshape_k_2_path_1 = self.model.match_parent_path( + concat_k_2, ["Unsqueeze", "Mul", "Gather", "Shape"], [0, 0, 0, 0] + ) + reshape_k_2_path_2 = self.model.match_parent_path( + concat_k_2, ["Unsqueeze", "Add", "Gather", "Shape"], [2, 0, 0, 0] + ) + if reshape_k_2_path_1 is None or reshape_k_2_path_2 is None: + return False + + # Check Gather --> Mul --> Unsqueeze --> Concat --> Reshape connection for reshape_k_2_path_1 + # Check Gather --> Add --> Unsqueeze --> Concat --> Reshape connection for reshape_k_2_path_2 + if 
reshape_k_2_path_1[2].name != gather_1.name or reshape_k_2_path_2[2].name != gather_2.name: + return False + + # Check #4: check paths for q nodes + concat_q_2_path = self.model.match_parent_path(reshape_q_2, ["Concat"], [1]) + if concat_q_2_path is None: + return False + concat_q_2 = concat_q_2_path[0] + + reshape_q_2_path_1 = self.model.match_parent_path( + concat_q_2, ["Unsqueeze", "Mul", "Gather", "Shape"], [0, 0, 0, 0] + ) + reshape_q_2_path_2 = self.model.match_parent_path(concat_q_2, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) + if reshape_q_2_path_1 is None or reshape_q_2_path_2 is None: + return False + + # Check Gather --> Mul --> Unsqueeze --> Concat --> Reshape connection for reshape_q_2_path_1 + # Check Gather --> Unsqueeze --> Concat --> Reshape connection for reshape_q_2_path_2 + if reshape_q_2_path_1[2].name != gather_1.name or reshape_q_2_path_2[1].name != gather_2.name: + return False + + # Check #5: check Mul nodes are the same for q, k, v + mul_q = reshape_q_2_path_1[1] + mul_k = reshape_k_2_path_1[1] + mul_v = reshape_v_2_path_1[1] + gather_1_out = gather_1.output[0] + if mul_q.input[0] != gather_1_out or mul_k.input[0] != gather_1_out or mul_v.input[0] != gather_1_out: + return False + + # Check #6: check paths for attention mask nodes + attn_mask_path_1 = self.model.match_parent_path(add_qk, ["Concat", "Slice", "Slice"], [1, 0, 0]) + attn_mask_path_2 = self.model.match_parent_path(add_qk, ["Cast", "Concat", "Slice", "Slice"], [1, 0, 0, 0]) + if attn_mask_path_1 is not None: + _, slice_qk_2, slice_qk_1 = attn_mask_path_1 + elif attn_mask_path_2 is not None: + _, _, slice_qk_2, slice_qk_1 = attn_mask_path_2 + else: + return False + # Check first input to Slice #1 is 3D attention mask of shape (B,S,T) + if slice_qk_1.input[0] not in {"attn_mask", "attention_mask"}: + return False + + slice_qk_2_path = self.model.match_parent_path( + slice_qk_2, ["Unsqueeze", "Add", "Gather", "Shape"], [2, 0, 1, 0] + ) + slice_qk_1_path_1 = self.model.match_parent_path( + slice_qk_1, ["Unsqueeze", "Add", "Gather", "Shape"], [2, 0, 1, 0] + ) + slice_qk_1_path_2 = self.model.match_parent_path(slice_qk_1, ["Unsqueeze"], [1]) + if slice_qk_2_path is None or slice_qk_1_path_1 is None or slice_qk_1_path_2 is None: + return False + + # Check Gather --> Add --> Unsqueeze #3 --> Slice #2 connection for slice_qk_2_path + # Check Gather --> Add --> Unsqueeze #2 --> Slice #1 connection for slice_qk_1_path_1 + if slice_qk_2_path[1].name != slice_qk_1_path_1[1].name or slice_qk_2_path[2].name != slice_qk_1_path_1[2].name: + return False + + # Check Unsqueeze #1 --> Slice #1 connection for slice_qk_1_path_2 + # Check if first input to Add and Unsqueeze #1 is position ids + if slice_qk_1_path_1[1].input[0] != slice_qk_1_path_2[0].input[0]: + return False + + return True + + def check_runtime_shape_paths_for_nodes( + self, + reshape_qkv, # Final reshape before o_proj MatMul + reshape_q, # Reshape before q_proj MatMul + reshape_k, # Reshape before k_proj MatMul + reshape_v, # Reshape before v_proj MatMul + root_input, # Root input to attention subgraph + ): + # Check #1: check paths for qkv nodes + concat_qkv_path = self.model.match_parent_path(reshape_qkv, ["Concat"], [1]) + if concat_qkv_path is None: + return False + concat_qkv = concat_qkv_path[0] + + reshape_qkv_path_1 = self.model.match_parent_path(concat_qkv, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) + reshape_qkv_path_2 = self.model.match_parent_path(concat_qkv, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) + if reshape_qkv_path_1 is None or 
reshape_qkv_path_2 is None: + return False + + _, gather_1, shape_1 = reshape_qkv_path_1 + _, gather_2, shape_2 = reshape_qkv_path_2 + + # Check root_input --> Shape --> Gather connection + if shape_1.input[0] != root_input or shape_2.input[0] != root_input: + return False + + # Check #2: check paths for v nodes + concat_v_path = self.model.match_parent_path(reshape_v, ["Concat"], [1]) + if concat_v_path is None: + return False + concat_v = concat_v_path[0] + + reshape_v_path_1 = self.model.match_parent_path(concat_v, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) + reshape_v_path_2 = self.model.match_parent_path(concat_v, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) + if reshape_v_path_1 is None or reshape_v_path_2 is None: + return False + + # Check Gather --> Unsqueeze --> Concat --> Reshape connection + if reshape_v_path_1[1].name != gather_1.name or reshape_v_path_2[1].name != gather_2.name: + return False + + # Check #3: check paths for k nodes + concat_k_path = self.model.match_parent_path(reshape_k, ["Concat"], [1]) + if concat_k_path is None: + return False + concat_k = concat_k_path[0] + + reshape_k_path_1 = self.model.match_parent_path(concat_k, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) + reshape_k_path_2 = self.model.match_parent_path(concat_k, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) + if reshape_k_path_1 is None or reshape_k_path_2 is None: + return False + + # Check Gather --> Unsqueeze --> Concat --> Reshape connection + if reshape_k_path_1[1].name != gather_1.name or reshape_k_path_2[1].name != gather_2.name: + return False + + # Check #4: check paths for q nodes + concat_q_path = self.model.match_parent_path(reshape_q, ["Concat"], [1]) + if concat_q_path is None: + return False + concat_q = concat_q_path[0] + + reshape_q_path_1 = self.model.match_parent_path(concat_q, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) + reshape_q_path_2 = self.model.match_parent_path(concat_q, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) + if reshape_q_path_1 is None or reshape_q_path_2 is None: + return False + + # Check Gather --> Unsqueeze --> Concat --> Reshape connection + if reshape_q_path_1[1].name != gather_1.name or reshape_q_path_2[1].name != gather_2.name: + return False + + return True + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + if normalize_node.op_type != "SkipSimplifiedLayerNormalization" and normalize_node.op_type != "Add": + return + + # qkv_nodes_1 is for LLaMA-2 Microsoft + # qkv_nodes_2 is for LLaMA-2 Hugging Face + qkv_nodes = None + qkv_nodes_1 = self.model.match_parent_path( + normalize_node, + ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [1, 0, 0, 0, 0], + ) + qkv_nodes_2 = self.model.match_parent_path( + normalize_node, + ["MatMul", "Reshape", "Transpose", "MatMul"], + [1, 0, 0, 0], + ) + if qkv_nodes_1 is not None: + _, reshape_qkv_2, _, reshape_qkv_1, matmul_qkv = qkv_nodes_1 + qkv_nodes = qkv_nodes_1 + elif qkv_nodes_2 is not None: + _, reshape_qkv, _, matmul_qkv = qkv_nodes_2 + qkv_nodes = qkv_nodes_2 + else: + logger.debug("fuse_rotary_attention: failed to match qkv nodes") + return + + # v_nodes_1 is for LLaMA-2 Microsoft + # v_nodes_3 is for LLaMA-2 Hugging Face + past_v, present_v, past_seq_len = "", "", "" + v_nodes = None + v_nodes_1 = self.model.match_parent_path( + matmul_qkv, + ["Reshape", "Transpose", "Concat", "Transpose", "Reshape", "MatMul"], + [1, 0, 0, 1, 0, 0], + ) + v_nodes_2 = self.model.match_parent_path( + matmul_qkv, + ["Concat", "Transpose", "Reshape", "MatMul"], + [1, 1, 0, 0], + ) + 
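+ # v_nodes_2 covers the pattern where past and current V are merged by a Concat (past_v/present_v are taken from it); v_nodes_3 covers the pattern without past key/values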
v_nodes_3 = self.model.match_parent_path( + matmul_qkv, + ["Transpose", "Reshape", "MatMul"], + [1, 0, 0], + ) + if v_nodes_1 is not None: + reshape_v_2, _, concat_v, _, reshape_v_1, matmul_v = v_nodes_1 + v_nodes = v_nodes_1 + + concat_v_path = self.model.match_parent_path( + concat_v, + ["Slice", "Unsqueeze"], + [0, 2], + ) + if concat_v_path is None: + logger.debug("fuse_rotary_attention: failed to match past/present concat in v path") + return + + past_v = concat_v_path[0].input[0] + past_seq_len = concat_v_path[-1].input[0] + present_v = concat_v.output[0] + elif v_nodes_2 is not None: + concat_v, transpose_v, reshape_v, matmul_v = v_nodes_2 + v_nodes = v_nodes_2 + past_v = concat_v.input[0] + present_v = concat_v.output[0] + elif v_nodes_3 is not None: + transpose_v, reshape_v, matmul_v = v_nodes_3 + v_nodes = v_nodes_3 + present_v = transpose_v.output[0] + else: + logger.debug("fuse_rotary_attention: failed to match v path") + return + + qk_nodes = self.model.match_parent_path( + matmul_qkv, + ["Softmax", "Add", "Div", "MatMul"], + [0, 0, 0, 0], + ) + add_qk, matmul_qk = None, None + if qk_nodes is not None: + _, add_qk, _, matmul_qk = qk_nodes + else: + logger.debug("fuse_rotary_attention: failed to match qk nodes") + return + + # attn_mask_nodes_1, attn_mask_nodes_2 are for LLaMA-2 Microsoft's 3D attention mask + # attn_mask_nodes_3, attn_mask_nodes_4 are for LLaMA-2 Hugging Face's 2D attention mask + attn_mask, add_qk_str = "", "" + attn_mask_nodes_1 = self.model.match_parent_path( + add_qk, + ["Concat", "Slice", "Slice"], + [1, 0, 0], + ) + attn_mask_nodes_2 = self.model.match_parent_path( + add_qk, + ["Cast", "Concat", "Slice", "Slice"], + [1, 0, 0, 0], + ) + attn_mask_nodes_3 = self.model.match_parent_path( + add_qk, + ["Add", "Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], + [1, 0, 2, 1, 0, 0, 0], + ) + attn_mask_nodes_4 = self.model.match_parent_path( + add_qk, + ["Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], + [1, 2, 1, 0, 0, 0], + ) + if attn_mask_nodes_1 is not None: + _, slice_mask_1, slice_mask_2 = attn_mask_nodes_1 + attn_mask = slice_mask_1.output[0] + elif attn_mask_nodes_2 is not None: + _, _, slice_mask_1, slice_mask_2 = attn_mask_nodes_2 + attn_mask = slice_mask_1.output[0] + elif attn_mask_nodes_3 is not None: + # Reshape from (B,1,S,T) to (B,N,S,T) + add_qk_str = self.reshape_add_qk(attn_mask_nodes_3[0].output[0]) + elif attn_mask_nodes_4 is not None: + # Reshape from (B,1,S,T) to (B,N,S,T) + add_qk_str = self.reshape_add_qk(attn_mask_nodes_4[0].output[0]) + else: + logger.debug("fuse_rotary_attention: failed to match attention mask nodes") + return + + # k_nodes_1 is for LLaMA-2 Microsoft + # k_nodes_2 is for LLaMA-2 Hugging Face + past_k, present_k = "", "" + k_nodes = None + k_nodes_1 = self.model.match_parent_path( + matmul_qk, + ["Reshape", "Transpose", "Concat", "Transpose", "RotaryEmbedding", "MatMul"], + [1, 0, 0, 1, 0, 0], + ) + k_nodes_2 = self.model.match_parent_path( + matmul_qk, + ["Transpose", "RotaryEmbedding", "Transpose", "Reshape", "MatMul"], + [1, 0, 0, 0, 0], + ) + k_nodes_3 = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul"], + [1, 0, 1, 0, 0, 0], + ) + if k_nodes_1 is not None: + reshape_k_2, _, concat_k, _, rotary_k, matmul_k = k_nodes_1 + k_nodes = k_nodes_1 + + concat_k_path = self.model.match_parent_path( + concat_k, + ["Slice", "Unsqueeze"], + [0, 2], + ) + if concat_k_path is None: + logger.debug("fuse_rotary_attention: failed to match 
past/present concat in k path") + return + + past_k = concat_k_path[0].input[0] + shared_past_seq_len = concat_k_path[-1].input[0] + present_k = concat_k.output[0] + + assert past_seq_len == shared_past_seq_len + elif k_nodes_2 is not None: + _, rotary_k, _, reshape_k, matmul_k = k_nodes_2 + k_nodes = k_nodes_2 + present_k = rotary_k.output[0] + elif k_nodes_3 is not None: + _, concat_k, rotary_k, _, reshape_k, matmul_k = k_nodes_3 + k_nodes = k_nodes_3 + past_k = concat_k.input[0] + present_k = concat_k.output[0] + else: + logger.debug("fuse_rotary_attention: failed to match k nodes") + return + + # q_nodes_1 is for LLaMA-2 Microsoft + # q_nodes_2 is for LLaMA-2 Hugging Face + q_nodes = None + q_nodes_1 = self.model.match_parent_path( + matmul_qk, + ["Reshape", "Transpose", "RotaryEmbedding", "MatMul"], + [0, 0, 0, 0], + ) + q_nodes_2 = self.model.match_parent_path( + matmul_qk, + ["RotaryEmbedding", "Transpose", "Reshape", "MatMul"], + [0, 0, 0, 0], + ) + if q_nodes_1 is not None: + reshape_q_2, _, rotary_q, matmul_q = q_nodes_1 + q_nodes = q_nodes_1 + elif q_nodes_2 is not None: + rotary_q, _, reshape_q, matmul_q = q_nodes_2 + q_nodes = q_nodes_2 + else: + logger.debug("fuse_rotary_attention: failed to match q nodes") + return + + if matmul_q.input[0] != matmul_k.input[0] and matmul_k.input[0] != matmul_v.input[0]: + logger.debug("fuse_rotary_attention: failed to find the same root_input for q, k, v paths") + return + + root_output = "" + if qkv_nodes == qkv_nodes_1: + if not self.check_runtime_shape_paths_for_function( + reshape_qkv_2, + reshape_qkv_1, + reshape_q_2, + reshape_k_2, + reshape_v_2, + reshape_v_1, + add_qk, + matmul_q.input[0], + ): + logger.debug("fuse_rotary_attention: failed to verify runtime shape paths") + return + root_output = reshape_qkv_2.output[0] + + elif qkv_nodes == qkv_nodes_2: + if not self.check_runtime_shape_paths_for_nodes( + reshape_qkv, + reshape_q, + reshape_k, + reshape_v, + matmul_q.input[0], + ): + logger.debug("fuse_rotary_attention: failed to verify runtime shape paths") + return + root_output = reshape_qkv.output[0] + + # Rename inputs of rotary_q/k so it connects with output of matmul_q/k + # Before: MatMul --> Reshape --> Transpose --> RotaryEmbedding + # After: MatMul --> RotaryEmbedding + rotary_q.input[0] = matmul_q.output[0] + rotary_k.input[0] = matmul_k.output[0] + + # Rename current output of rotary_k (present_key) so it doesn't match output of MHA (present_key) + rotary_k.output[0] = rotary_k.name + "_output_0" + + new_node = self.create_mha_node( + matmul_q.input[0], + root_output, + rotary_q, + rotary_k, + matmul_v, + attn_mask, + add_qk_str, + past_k, + past_v, + present_k, + present_v, + ) + if new_node is None: + logger.debug("fuse_rotary_attention: failed to create multi-head attention with rotary embeddings") + return + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + self.nodes_to_remove.extend(qkv_nodes[1:]) + self.nodes_to_remove.extend(v_nodes[:-1]) + self.nodes_to_remove.extend(qk_nodes) + + if k_nodes == k_nodes_1: + self.nodes_to_remove.extend(k_nodes[:-2]) + elif k_nodes == k_nodes_2: + self.nodes_to_remove.append(k_nodes[0]) + self.nodes_to_remove.append(k_nodes[2]) + self.nodes_to_remove.append(k_nodes[3]) + elif k_nodes == k_nodes_3: + self.nodes_to_remove.append(k_nodes[0]) + self.nodes_to_remove.append(k_nodes[1]) + self.nodes_to_remove.append(k_nodes[3]) + self.nodes_to_remove.append(k_nodes[4]) + + if q_nodes == q_nodes_1: + 
self.nodes_to_remove.extend(q_nodes[:-2]) + elif q_nodes == q_nodes_2: + self.nodes_to_remove.append(q_nodes[1]) + self.nodes_to_remove.append(q_nodes[2]) + + self.prune_graph = True + + +class FusionRotaryEmbeddings(Fusion): + def __init__(self, model: OnnxModel): + self.base_name = "RotaryEmbedding" + super().__init__(model, self.base_name, [self.base_name, self.base_name + ".1", "Add"]) + + # The RotaryEmbedding function can have multiple extraneous constant outputs even though the function is supposed to produce only one output. + # This is a byproduct of a potential CSE bug when using `export_modules_as_functions` in the TorchScript exporter. + # To work around this issue, we set the extraneous constant values from the RotaryEmbedding function as initializers in the locations where they are actually used. + def reassign_extra_outputs(self, rot_emb_node: NodeProto, function: FunctionProto): + # Find extra outputs and Constant nodes attached to those outputs + extra_constants, extra_outputs = [], [] + for fn_node in function.node: + if fn_node.op_type == "Constant" and fn_node.input == [] and fn_node.output[0] in function.output: + extra_constants.append(fn_node) + output_index = list(function.output).index(fn_node.output[0]) + extra_outputs.append(rot_emb_node.output[output_index]) + + # Set extra Constant node outputs as initializers + extra_initializers = [] + for extra_constant in extra_constants: + constant_tensorproto = extra_constant.attribute[0].t + constant_tensorproto.name = self.model.create_node_name("Constant") + self.model.add_initializer(constant_tensorproto) + extra_initializers.append(constant_tensorproto.name) + + # Update references of Constant node outputs to initializer references + for extra_output, extra_initializer in zip(extra_outputs, extra_initializers): + nodes_to_update = list(filter(lambda entry: extra_output in entry.input, self.model.model.graph.node)) + for node_to_update in nodes_to_update: + OnnxModel.replace_node_input(node_to_update, extra_output, extra_initializer) + + return extra_outputs + + def create_rotary_embeddings_from_function(self, node: NodeProto): + rotary_emb_node_name = self.model.create_node_name(self.base_name) + + matmul_path = self.model.match_parent_path( + node, + ["Reshape", "MatMul"], + [0, 0], + ) + if matmul_path is not None: + reshape_node, matmul_node = matmul_path + else: + logger.debug("fuse_rotary_embeddings: failed to match MatMul") + return + + rotary_emb_inputs = [ + matmul_node.output[0], # x is of shape (B,S,D) instead of (B,S,N,H) + node.input[1], # position_ids + ] + + # Convert cos_cache and sin_cache from node attributes to model initializers + cos_cache_node = list(filter(lambda constant: constant.output[0] == node.input[2], self.model.model.graph.node)) + sin_cache_node = list(filter(lambda constant: constant.output[0] == node.input[3], self.model.model.graph.node)) + cos_cache_name, sin_cache_name = "cos_cache", "sin_cache" + + if ( + len(cos_cache_node) == 1 + and len(sin_cache_node) == 1 + and self.model.get_initializer(cos_cache_name) is None + and self.model.get_initializer(sin_cache_name) is None + ): + cos_cache = numpy_helper.to_array(cos_cache_node[0].attribute[0].t).squeeze() + sin_cache = numpy_helper.to_array(sin_cache_node[0].attribute[0].t).squeeze() + + cos_cache_tensor = helper.make_tensor( + name=cos_cache_name, + data_type=TensorProto.FLOAT, + dims=list(cos_cache.shape), + vals=cos_cache.flatten().tolist(), + ) + self.model.add_initializer(cos_cache_tensor, self.this_graph_name) + 
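+ # Create the matching sin cache initializer in the same way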
sin_cache_tensor = helper.make_tensor( + name=sin_cache_name, + data_type=TensorProto.FLOAT, + dims=list(sin_cache.shape), + vals=sin_cache.flatten().tolist(), + ) + self.model.add_initializer(sin_cache_tensor, self.this_graph_name) + + self.nodes_to_remove.extend([cos_cache_node[0], sin_cache_node[0]]) + + rotary_emb_inputs.extend([cos_cache_name, sin_cache_name]) + + rotary_emb_outputs = node.output + if len(rotary_emb_outputs) > 1: + # Re-assign extraneous constant outputs in RotaryEmbedding functions as initializers + func = list(filter(lambda fn: fn.name == node.op_type, self.model.model.functions)) + assert len(func) == 1 + extra_outputs = self.reassign_extra_outputs(node, func[0]) + rotary_emb_outputs = list(filter(lambda output_name: output_name not in extra_outputs, rotary_emb_outputs)) + assert len(rotary_emb_outputs) == 1 + + rotary_emb_node = helper.make_node( + self.base_name, + inputs=rotary_emb_inputs, + outputs=rotary_emb_outputs, + name=rotary_emb_node_name, + interleaved=1, + ) + rotary_emb_node.domain = "com.microsoft" + + self.nodes_to_remove.append(reshape_node) + + return rotary_emb_node + + def create_rotary_embeddings_from_nodes( + self, + root_input: str, + position_ids: str, + cos_slice: str, + sin_slice: str, + output: str, + ): + rotary_emb_node_name = self.model.create_node_name(self.base_name) + + # Convert cos_cache and sin_cache from node attributes to model initializers + cos_cache_node = list(filter(lambda constant: constant.output[0] == cos_slice, self.model.model.graph.node)) + sin_cache_node = list(filter(lambda constant: constant.output[0] == sin_slice, self.model.model.graph.node)) + cos_cache_name, sin_cache_name = "cos_cache", "sin_cache" + + if ( + len(cos_cache_node) == 1 + and len(sin_cache_node) == 1 + and self.model.get_initializer(cos_cache_name) is None + and self.model.get_initializer(sin_cache_name) is None + ): + cos_cache = numpy_helper.to_array(cos_cache_node[0].attribute[0].t).squeeze() + sin_cache = numpy_helper.to_array(sin_cache_node[0].attribute[0].t).squeeze() + + # Reshape cos/sin cache from (M, H) to (M, H/2) + head_size = cos_cache.shape[1] + cos_cache = cos_cache[:, : (head_size // 2)] + sin_cache = sin_cache[:, : (head_size // 2)] + + cos_cache_tensor = helper.make_tensor( + name=cos_cache_name, + data_type=TensorProto.FLOAT, + dims=list(cos_cache.shape), + vals=cos_cache.flatten().tolist(), + ) + self.model.add_initializer(cos_cache_tensor, self.this_graph_name) + sin_cache_tensor = helper.make_tensor( + name=sin_cache_name, + data_type=TensorProto.FLOAT, + dims=list(sin_cache.shape), + vals=sin_cache.flatten().tolist(), + ) + self.model.add_initializer(sin_cache_tensor, self.this_graph_name) + + self.nodes_to_remove.extend([cos_cache_node[0], sin_cache_node[0]]) + + rotary_emb_node = helper.make_node( + self.base_name, + inputs=[root_input, position_ids, cos_cache_name, sin_cache_name], + outputs=[output], + name=rotary_emb_node_name, + interleaved=0, + ) + rotary_emb_node.domain = "com.microsoft" + return rotary_emb_node + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + # Node is either RotaryEmbedding function or Add + if self.base_name not in node.op_type and node.op_type != "Add": + return + + # Check if node is "RotaryEmbedding nn.Module" exported as a function + # (e.g. 
export_modules_as_functions={RotaryEmbedding} in torch.onnx.export) + rotary_emb_node = None + if node.op_type != "Add": + # Verify that function has the correct inputs + if len(node.input) not in {4, 5} or node.input[1] not in { + "pos", + "pos_id", + "position_id", + "pos_ids", + "position_ids", + }: + logger.debug("fuse_rotary_embeddings: failed to verify inputs for RotaryEmbedding function") + return + + rotary_emb_node = self.create_rotary_embeddings_from_function(node) + if rotary_emb_node is None: + logger.debug("fuse_rotary_embeddings: failed to create RotaryEmbedding node") + return + + # Remove RotaryEmbedding function + self.nodes_to_remove.append(node) + + # Remove RotaryEmbedding function's shape inference stored in value_info + # The new shape will be calculated during symbolic shape inference + old_shape_infer = list( + filter(lambda node: node.name == rotary_emb_node.output[0], self.model.model.graph.value_info) + ) + assert len(old_shape_infer) == 1 + self.model.model.graph.value_info.remove(old_shape_infer[0]) + + else: + # Rotary embeddings are defined using the below functions: + # + # def rotate_half(x): + # """Rotates half the hidden dims of the input.""" + # x1 = x[..., : x.shape[-1] // 2] + # x2 = x[..., x.shape[-1] // 2 :] + # return torch.cat((-x2, x1), dim=-1) + # + # def apply_rope(x, cos, sin, position_ids): + # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + # x_embed = (x * cos) + (rotate_half(x) * sin) + # return x_embed + + # Check paths for rotate_half(x) + rotate_half_x2_path_1 = self.model.match_parent_path( + node, + ["Mul", "Concat", "Neg", "Slice", "Transpose"], + [1, 0, 0, 0, 0], + ) + rotate_half_x2_path_2 = self.model.match_parent_path( + node, + ["Mul", "Concat", "Neg", "Slice", "Unsqueeze", "Div", "Gather", "Shape", "Transpose"], + [1, 0, 0, 0, 1, 0, 0, 0, 0], + ) + if rotate_half_x2_path_1 is None or rotate_half_x2_path_2 is None: + logger.debug("fuse_rotary_embeddings: failed to match x2 in rotate_half") + return + + rotate_half_x1_path_1 = self.model.match_parent_path( + node, + ["Mul", "Concat", "Slice", "Transpose"], + [1, 0, 1, 0], + ) + rotate_half_x1_path_2 = self.model.match_parent_path( + node, + ["Mul", "Concat", "Slice", "Unsqueeze", "Div", "Gather", "Shape", "Transpose"], + [1, 0, 1, 2, 0, 0, 0, 0], + ) + if rotate_half_x1_path_1 is None or rotate_half_x1_path_2 is None: + logger.debug("fuse_rotary_embeddings: failed to match x1 in rotate_half") + return + + if ( + rotate_half_x1_path_1[-1].name != rotate_half_x1_path_2[-1].name + or rotate_half_x2_path_1[-1].name != rotate_half_x2_path_2[-1].name + or rotate_half_x1_path_1[-1].name != rotate_half_x2_path_1[-1].name + or rotate_half_x1_path_2[-1].name != rotate_half_x2_path_2[-1].name + ): + logger.debug("fuse_rotary_embeddings: failed to match common input in rotate_half") + return + + # Check path for x + x_path = self.model.match_parent_path( + node, + ["Mul", "Transpose"], + [0, 0], + ) + if x_path is None: + logger.debug("fuse_rotary_embeddings: failed to match x in rotate_half") + return + + # Check path for sin + sin_path, sin_cache, position_ids = None, "", "" + sin_path_1 = self.model.match_parent_path( + node, + ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Gather", "Shape"], + [1, 1, 0, 0, 0, 0, 2, 0, 0], + ) + sin_path_2 = self.model.match_parent_path( + node, 
+ ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Add"], + [1, 1, 0, 0, 0, 0, 2, 0], + ) + sin_path_3 = self.model.match_parent_path( + node, + ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Gather", "Shape"], + [1, 1, 0, 0, 2, 0, 0], + ) + sin_path_4 = self.model.match_parent_path( + node, + ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Add"], + [1, 1, 0, 0, 2, 0], + ) + if sin_path_1 is not None: + sin_path = sin_path_1 + sin_cache = sin_path[-4].input[0] + elif sin_path_2 is not None: + sin_path = sin_path_2 + sin_cache = sin_path[-3].input[0] + elif sin_path_3 is not None: + sin_path = sin_path_3 + sin_cache = sin_path[-4].input[0] + position_ids = sin_path[2].input[1] + elif sin_path_4 is not None: + sin_path = sin_path_4 + sin_cache = sin_path[-3].input[0] + position_ids = sin_path[2].input[1] + else: + logger.debug("fuse_rotary_embeddings: failed to match sin path in apply_rope") + return + + # Check path for cos + cos_path, cos_cache = None, "" + cos_path_1 = self.model.match_parent_path( + node, + ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Gather", "Shape"], + [0, 1, 0, 0, 0, 0, 2, 0, 0], + ) + cos_path_2 = self.model.match_parent_path( + node, + ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Add"], + [0, 1, 0, 0, 0, 0, 2, 0], + ) + cos_path_3 = self.model.match_parent_path( + node, + ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Gather", "Shape"], + [0, 1, 0, 0, 2, 0, 0], + ) + cos_path_4 = self.model.match_parent_path( + node, + ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Add"], + [0, 1, 0, 0, 2, 0], + ) + if cos_path_1 is not None: + cos_path = cos_path_1 + cos_cache = cos_path[-4].input[0] + elif cos_path_2 is not None: + cos_path = cos_path_2 + cos_cache = cos_path[-3].input[0] + elif cos_path_3 is not None: + cos_path = cos_path_3 + cos_cache = cos_path[-4].input[0] + position_ids = cos_path[2].input[1] + elif cos_path_4 is not None: + cos_path = cos_path_4 + cos_cache = cos_path[-3].input[0] + position_ids = cos_path[2].input[1] + else: + logger.debug("fuse_rotary_embeddings: failed to match sin path in apply_rope") + return + + # Check path for position ids + if position_ids == "": + position_ids_from_sin_path = self.model.match_parent_path( + sin_path[2], + ["Reshape"], + [1], + ) + position_ids_from_cos_path = self.model.match_parent_path( + cos_path[2], + ["Reshape"], + [1], + ) + if ( + position_ids_from_sin_path is None + or position_ids_from_cos_path is None + or position_ids_from_sin_path[0].name != position_ids_from_cos_path[0].name + ): + logger.debug("fuse_rotary_embeddings: failed to match position ids path in apply_rope") + return + position_ids = position_ids_from_cos_path[0].input[0] + else: + position_ids_from_sin_path = [] + position_ids_from_cos_path = [] + + past_seq_len_path, curr_seq_len_path = None, None + if (sin_path == sin_path_1 and cos_path == cos_path_1) or ( + sin_path == sin_path_3 and cos_path == cos_path_3 + ): + if sin_path[-2].name != cos_path[-2].name or sin_path[-1].name != cos_path[-1].name: + logger.debug( + "fuse_rotary_embeddings: failed to match common Gather node and Shape node in sin cache and cos cache" + ) + return + elif (sin_path == sin_path_2 and cos_path == cos_path_2) or ( + sin_path == sin_path_4 and cos_path == cos_path_4 + ): + if sin_path[-1].name != cos_path[-1].name: + logger.debug("fuse_rotary_embeddings: failed to match common Add node in sin cache and cos cache") + return + # Match past sequence 
length path: past_key --> Shape --> Gather --> Add + past_seq_len_path = self.model.match_parent_path( + sin_path[-1], + ["Gather", "Shape"], + [1, 0], + ) + # Match current sequence length path: transpose_k --> Shape --> Gather --> Add + curr_seq_len_path = self.model.match_parent_path( + sin_path[-1], + ["Gather", "Shape", "Transpose"], + [0, 0, 0], + ) + if ( + past_seq_len_path is None + or curr_seq_len_path is None + or self.model.find_graph_input(past_seq_len_path[-1].input[0]) is None + or curr_seq_len_path[-1].op_type != "Transpose" + ): + logger.debug("fuse_rotary_embeddings: failed to match past_seq_len and curr_seq_len paths") + return + else: + logger.debug("fuse_rotary_embeddings: failed to match common cache paths") + + rotary_emb_node = self.create_rotary_embeddings_from_nodes( + rotate_half_x1_path_1[-1].output[0], + position_ids, + cos_cache, + sin_cache, + node.output[0], + ) + if rotary_emb_node is None: + logger.debug("fuse_rotary_embeddings: failed to create RotaryEmbedding node") + return + + # Remove rotary embedding nodes + self.add_nodes_to_remove([node]) + self.add_nodes_to_remove(rotate_half_x1_path_1[:-1]) + self.add_nodes_to_remove(rotate_half_x1_path_2[:-1]) + self.add_nodes_to_remove(rotate_half_x2_path_1[:-1]) + self.add_nodes_to_remove(rotate_half_x2_path_2[:-1]) + self.add_nodes_to_remove(x_path[:-1]) + self.add_nodes_to_remove(sin_path) + self.add_nodes_to_remove(cos_path) + self.add_nodes_to_remove(position_ids_from_sin_path[:-1]) + self.add_nodes_to_remove(position_ids_from_cos_path[:-1]) + + if past_seq_len_path is not None and len(self.model.get_children(past_seq_len_path[0])) == 1: + # In merged HF model, output of Gather in past_seq_len_path is used twice + # for past_key_values.0.key and once for other past_key_values + self.add_nodes_to_remove(past_seq_len_path) + if curr_seq_len_path is not None: + self.add_nodes_to_remove(curr_seq_len_path[:-1]) + + self.increase_counter(self.base_name) + self.node_name_to_graph_name[rotary_emb_node.name] = self.this_graph_name + self.nodes_to_add.append(rotary_emb_node) + self.prune_graph = True diff --git a/onnxruntime/python/tools/transformers/fusion_shape.py b/onnxruntime/python/tools/transformers/fusion_shape.py index 11d6b7a8d3cf4..bc32d78eda66c 100644 --- a/onnxruntime/python/tools/transformers/fusion_shape.py +++ b/onnxruntime/python/tools/transformers/fusion_shape.py @@ -48,22 +48,22 @@ def fuse( input_name_to_nodes: Dict[str, List[NodeProto]], output_name_to_node: Dict[str, NodeProto], ): - """ - Smplify subgraph like - - (2d_input) - / \ - Shape shape - / \ - Gather(indices=0) Gather(indices=1) - | | - Unsqueeze(axes=0) Unsqueeze(axes=0) - \\ / - Concat - | - - into (2d_input) --> Shape --> - """ + # + # Simplify subgraph like + # + # (2d_input) + # / \ + # Shape shape + # / \ + # Gather(indices=0) Gather(indices=1) + # | | + # Unsqueeze(axes=0) Unsqueeze(axes=0) + # \ / + # Concat + # | + # + # into (2d_input) --> Shape --> + # opset_version = self.model.get_opset_version() inputs = len(concat_node.input) diff --git a/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py new file mode 100644 index 0000000000000..6f35fa5617a39 --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py @@ -0,0 +1,141 @@ +import logging +from typing import Dict + +from fusion_base import Fusion +from fusion_skiplayernorm import FusionSkipLayerNormalization +from onnx import helper +from onnx_model import 
OnnxModel + +logger = logging.getLogger(__name__) + + +class FusionSimplifiedLayerNormalization(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "SimplifiedLayerNormalization", "Mul") + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + if node.op_type != "Mul": + return + + sim_ln_nodes = None + # SimplifiedLayerNorm calculation (notation from https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary): + # DD = Pow(D, 2) + # Var = ReduceMean(DD) + # VarEps = Add(Var, epsilon) + # StdDev = Sqrt(VarEps) + # InvStdDev = Div(1, StdDev) + # Normalized = Mul(D, InvStdDev) + # NormalizedScaled = Mul(Normalized, Scale) + + # SimplifiedLayerNorm + # +-------------------------------------------------------+ + # | | + # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul + # | + # node + sim_ln_nodes_1 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], + [1, 1, 1, 0, 0, 0, 0], + ) + # SimplifiedLayerNorm + # +-------------------------------------------------------+ + # | | + # Gather --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul + # | + # node + sim_ln_nodes_2 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"], + [1, 1, 1, 0, 0, 0, 0], + ) + + # For LLaMA from Microsoft custom export: + # sim_ln_nodes_3 uses a different start parent index than sim_ln_nodes_1 + # + # SimplifiedLayerNorm + # +-------------------------------------------------------+ + # | | + # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul + # | + # node + sim_ln_nodes_3 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], + [0, 1, 1, 0, 0, 0, 0], + ) + + # sim_ln_nodes_4 starts with a graph input instead of an Add node like sim_ln_nodes_3 + # + # SimplifiedLayerNorm + # +-----------------------------------------------+ + # | | + # graph_input --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul + # | + # node + sim_ln_nodes_4 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow"], + [0, 1, 1, 0, 0, 0], + ) + + add_node, pow_node = None, None + if sim_ln_nodes_1 is not None: + sim_ln_nodes = sim_ln_nodes_1 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-2] + elif sim_ln_nodes_2 is not None: + sim_ln_nodes = sim_ln_nodes_2 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-2] + elif sim_ln_nodes_3 is not None: + sim_ln_nodes = sim_ln_nodes_3 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-2] + elif sim_ln_nodes_4 is not None: + sim_ln_nodes = sim_ln_nodes_4 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-1] + # Verify that parent input to Pow node is graph_input + if pow_node.input[0] not in self.model.get_graphs_input_names(): + return + else: + return + + layernorm_weight_index = 1 if sim_ln_nodes in (sim_ln_nodes_3, sim_ln_nodes_4) else 0 + starts_with_graph_input = sim_ln_nodes == sim_ln_nodes_4 + + if self.model.find_constant_input(pow_node, 2.0) != 1: + return + + root_input = pow_node.input[0] + if root_input != sim_ln_nodes[0].input[0]: + return + + i, add_weight = self.model.get_constant_input(add_node) + if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: + logger.warning(f"epsilon value is not expected: {add_weight}") + return + + self.nodes_to_remove.extend(sim_ln_nodes[:-1] if not starts_with_graph_input else sim_ln_nodes) + self.nodes_to_remove.append(node) + + 
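+ # Replace the matched subgraph with a single SimplifiedLayerNormalization node; epsilon comes from the constant on the Add node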
normalize_node = helper.make_node( + "SimplifiedLayerNormalization", + inputs=[root_input, node.input[layernorm_weight_index]], + outputs=[node.output[0]], + name=self.model.create_node_name("SimplifiedLayerNormalization", name_prefix="LayerNorm"), + ) + normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))]) + normalize_node.attribute.extend([helper.make_attribute("axis", -1)]) + normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)]) + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name + + +class FusionSkipSimplifiedLayerNormalization(FusionSkipLayerNormalization): + def __init__(self, model: OnnxModel): + super().__init__(model, "SkipSimplifiedLayerNormalization", "SimplifiedLayerNormalization") + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + super().fuse(node, input_name_to_nodes, output_name_to_node) diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py new file mode 100644 index 0000000000000..3b344d6dc9342 --- /dev/null +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -0,0 +1,385 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +""" +Export LLM to ONNX +""" +import argparse +import inspect +import math +import os +import tempfile +from pathlib import Path +from typing import Optional + +import onnx +import torch +import transformers +from torch import nn + + +def disable_huggingface_init(): + """Do not initialize the model weights twice; skipping random init speeds up model creation.""" + + torch.nn.init.kaiming_uniform_ = lambda x, *args, **kwargs: x + torch.nn.init.uniform_ = lambda x, *args, **kwargs: x + torch.nn.init.normal_ = lambda x, *args, **kwargs: x + torch.nn.init.constant_ = lambda x, *args, **kwargs: x + torch.nn.init.xavier_uniform_ = lambda x, *args, **kwargs: x + torch.nn.init.xavier_normal_ = lambda x, *args, **kwargs: x + torch.nn.init.kaiming_normal_ = lambda x, *args, **kwargs: x + torch.nn.init.orthogonal_ = lambda x, *args, **kwargs: x + + +def get_model_parameter_size(model: nn.Module): + """Calculate how much memory this model needs (in MB).""" + param_size = 0 + param_sum = 0 + for param in model.parameters(): + param_size += param.nelement() * param.element_size() + param_sum += param.nelement() + buffer_size = 0 + buffer_sum = 0 + for buffer in model.buffers(): + buffer_size += buffer.nelement() * buffer.element_size() + buffer_sum += buffer.nelement() + all_size = (param_size + buffer_size) / 1024 / 1024 + return all_size + + +def initialize_model_and_sample_inputs(hf_model: str, cache_dir: Optional[str], tokenizer=None): + """ + Get the pretrained torch model from Hugging Face, + and sample model inputs. + """ + + disable_huggingface_init() + + model = transformers.AutoModelForCausalLM.from_pretrained( # type: ignore + hf_model, torch_dtype=torch.float16, cache_dir=cache_dir, trust_remote_code=True + ) + if tokenizer is None: + tokenizer = hf_model + tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer) # type: ignore + + sample_inputs = tuple(tokenizer("Hello, my dog is cute", return_tensors="pt").values()) + return model, sample_inputs + + +def auto_pipeline_parallel(model: nn.Module, gpulist: list, sample_inputs: tuple): + """Make the model
executable across multiple GPUs.""" + + def input_gpu_device_hook(mod, inputs, kwargs): + modifyed_inputs = [] + first_dev = None + for layer_input in inputs: + if type(layer_input) is not torch.Tensor: + modifyed_inputs.append(layer_input) + elif hasattr(mod, "weight"): + modifyed_inputs.append(layer_input.to(mod.weight.device)) + elif hasattr(mod, "parameters"): + device = next(mod.parameters(), layer_input).device + modifyed_inputs.append(layer_input.to(device)) + elif hasattr(next(mod.children(), None), "weight"): + modifyed_inputs.append(layer_input.to(next(mod.children()).weight.device)) + elif first_dev is not None and layer_input.device != first_dev: + modifyed_inputs.append(layer_input.to(first_dev)) + else: + modifyed_inputs.append(layer_input) + if first_dev is None: + first_dev = modifyed_inputs[0].device + for key, value in kwargs.items(): + if type(value) is torch.Tensor: + kwargs[key] = value.to(first_dev) + + return (tuple(modifyed_inputs), kwargs) + + def move_layer_to_device_rurc(mod, dev): + mod.to(dev) + for layer in mod.named_children(): + move_layer_to_device_rurc(layer[1], dev) + + model = model.half() + all_hooks = [] + all_hooks.append(model.register_forward_pre_hook(input_gpu_device_hook, with_kwargs=True)) + pre_fix = next(iter(model.named_children()))[0] + for top_name, top_module in model.named_children(): + for name, module in top_module.named_children(): + all_hooks.append(module.register_forward_pre_hook(input_gpu_device_hook, with_kwargs=True)) + if type(module) in [torch.nn.ModuleList]: + num_layers_on_each_gpu = math.floor(len(module) / len(gpulist)) + for idx, attn_layer in enumerate(module): + all_hooks.append(attn_layer.register_forward_pre_hook(input_gpu_device_hook, with_kwargs=True)) + + to_dev = gpulist[min(idx // num_layers_on_each_gpu, len(gpulist))] + attn_layer.to(to_dev) + move_layer_to_device_rurc(attn_layer, to_dev) + print(f"move {pre_fix}.{name}.{idx} to {to_dev}") + else: + module.to(gpulist[0]) + print(f"move {pre_fix}.{name} to {gpulist[0]}") + if len(list(top_module.named_children())) == 0: + top_module.to(gpulist[0]) + print(f"move {top_name} to {gpulist[0]}") + + with torch.no_grad(): + model(sample_inputs[0], attention_mask=sample_inputs[1]) + return model + + +def retrieve_onnx_inputs(model: nn.Module, sample_inputs: tuple, with_past: bool): + """ + auto retrieve onnx inputs from torch model as we can't enumlate all possibilities + for all models + """ + user_inputs = [] + + def hook_for_inputs(_, inputs, kwargs): + user_inputs.append((inputs, kwargs)) + return user_inputs[0] + + hook_handle = model.register_forward_pre_hook(hook_for_inputs, with_kwargs=True) + + forward_params = inspect.signature(model.forward).parameters + input_keys = list(forward_params.keys()) + default_values = [forward_params.get(key).default for key in input_keys] + out = model(sample_inputs[0], attention_mask=sample_inputs[1]) + hook_handle.remove() + user_inputs = user_inputs[0] + onnx_inputs = default_values + for idx, _val in enumerate(user_inputs[0]): + onnx_inputs[idx] = user_inputs[0][idx] + for key, value in user_inputs[1].items(): + idx = input_keys.index(key) + onnx_inputs[idx] = value + for idx, (key, value) in enumerate(zip(input_keys, onnx_inputs)): + if type(value) is torch.Tensor: + value.to(model.device) + # Didn't touch past_key_value now, please change it if you want + if "use_cache" in key: + onnx_inputs[idx] = with_past + + return input_keys, onnx_inputs, out.past_key_values + + +def move_to_approprate_device(model: nn.Module, 
sample_inputs_tp: tuple) -> nn.Module: + """ + According to the model size, we will upload it to + CPU if has no GPU or enough GPU memory, + Single GPU if has only one GPU in local or model size is enough to fit one GPU + Multiple GPU if there is more than one gpu in local and model is too large + """ + total_mem_per_cpu = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 + + print(f"Model_Size = {get_model_parameter_size(model)/1024} GB") + print(f"total_mem_per_cpu = {total_mem_per_cpu/1024} GB") + if get_model_parameter_size(model) > total_mem_per_cpu * 0.45: + device_collection = [torch.device(i) for i in range(torch.cuda.device_count())] + if len(device_collection) > 1: + print( + f"{len(device_collection)} GPUs are used to export onnx, \ + Please set CUDA_VISIBLE_DEVICES to use specific GPU group" + ) + model = auto_pipeline_parallel(model, device_collection, sample_inputs_tp) + else: + print("!!!! convert model to float and export onnx using CPU") + model = model.cpu().float() + else: + print("Export model on a single GPU") + model = model.cuda().half() + return model + + +def adapt_inputs_to_device(sample_inputs: tuple, device: torch.device) -> tuple: + """move inputs to device""" + sample_inputs_ = [] + for sample_int in sample_inputs: + if isinstance(sample_int, torch.Tensor): + sample_inputs_.append(sample_int.to(device)) + else: + sample_inputs_.append(sample_int) + return tuple(sample_inputs_) + + +def fetch_onnx_inputs_outputs_name( + model: nn.Module, + onnx_inputs: list, + torch_input_names: tuple, + past_key_values: tuple, + with_past: bool, + input_with_past: bool, +): + """fetch onnx inputs and outputs name""" + num_of_past_key = 0 + kv_cache_axis = {0: "batch_size"} + # try get num_of_past_key and shape of past_key_value + if past_key_values is not None: + num_of_past_key = len(past_key_values) + seq_index = (torch.tensor(past_key_values[0][0].shape) == onnx_inputs[0].shape[-1]).nonzero().view(-1) + assert seq_index.numel() == 1 + kv_cache_axis = {0: "batch_size", seq_index.item(): "seq_len"} + + if not num_of_past_key: + num_of_past_key = model.config.num_hidden_layers + + onnx_inp_names = ("input_ids", "attention_mask") + onnx_out_names = ("logits",) + onnx_dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "attention_mask": {0: "batch_size", 1: "seq_len"}, + } + if input_with_past: + for i in range(num_of_past_key): + onnx_inp_names += (f"present_key.{i}",) + onnx_inp_names += (f"present_values.{i}",) + + onnx_dynamic_axes[onnx_inp_names[-1]] = kv_cache_axis + onnx_dynamic_axes[onnx_inp_names[-2]] = kv_cache_axis + + if with_past or input_with_past: + for i in range(num_of_past_key): + onnx_out_names += (f"past_key.{i}",) + onnx_out_names += (f"past_values.{i}",) + onnx_dynamic_axes[onnx_out_names[-1]] = kv_cache_axis + onnx_dynamic_axes[onnx_out_names[-2]] = kv_cache_axis + + for idx, name in enumerate(torch_input_names): + if input_with_past: + if name == "past_key_values": + onnx_inputs[idx] = past_key_values + elif name == "attention_mask": + attn_mask = onnx_inputs[idx] + onnx_inputs[idx] = torch.cat( + (attn_mask, torch.ones((attn_mask.shape[0], 1), device=attn_mask.device)), dim=1 + ) + elif name == "input_ids": + input_ids = onnx_inputs[idx] + onnx_inputs[idx] = input_ids[:, -1:] + + return onnx_inp_names, onnx_out_names, onnx_dynamic_axes + + +def do_export_internal(model: nn.Module, onnx_io_tuple: tuple, onnx_inputs: tuple, onnx_path: Path, opset: int): + """do export with torch.onnx.export""" + onnx_model_name = onnx_path.name 
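+ # onnx_io_tuple carries the input names, output names, and dynamic axes prepared by fetch_onnx_inputs_outputs_name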
+    onnx_inp_names, onnx_out_names, onnx_dynamic_axes = onnx_io_tuple
+    # Export in two steps:
+    # 1. export the ONNX model, which scatters the weights across many small files
+    # 2. re-save the model with all weights gathered into a single external data file
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tmp_onnx = os.path.join(tmpdirname, "tmp.onnx")
+
+        torch.onnx.export(
+            model=model,
+            args=tuple(onnx_inputs),
+            f=tmp_onnx,
+            verbose=False,
+            opset_version=opset,
+            input_names=onnx_inp_names,
+            output_names=onnx_out_names,
+            dynamic_axes=onnx_dynamic_axes,
+        )
+
+        onnx_path.unlink(missing_ok=True)
+        (onnx_path.parent / f"{onnx_model_name}_ext.data").unlink(missing_ok=True)
+
+        onnx_model = onnx.load(str(tmp_onnx))
+        onnx.save_model(
+            onnx_model,
+            str(onnx_path),
+            save_as_external_data=(len(os.listdir(tmpdirname)) > 1),
+            all_tensors_to_one_file=True,
+            location=f"{onnx_model_name}_ext.data",
+            size_threshold=1024,
+            convert_attribute=False,
+        )
+
+
+@torch.no_grad()
+def export_onnx(hf_model: str, cache_dir: Optional[str], onnx_path_str: str, with_past: bool, opset: int):
+    """
+    Export the model to ONNX.
+    hf_model: name or local path of the Hugging Face model
+    cache_dir: Hugging Face cache directory
+    onnx_path_str: where the ONNX model is saved to
+    with_past: also export a variant that consumes past key/values
+    opset: ONNX opset version to export with
+    """
+    model, sample_inputs_tp = initialize_model_and_sample_inputs(hf_model, cache_dir)
+
+    model = move_to_approprate_device(model, sample_inputs_tp)
+
+    sample_inputs = adapt_inputs_to_device(sample_inputs_tp, next(model.parameters()).device)
+
+    # input_keys would be useful if the model has some special inputs
+    input_keys, onnx_inputs, past_key_value = retrieve_onnx_inputs(model, sample_inputs, with_past)
+
+    onnx_io_tuple = fetch_onnx_inputs_outputs_name(model, onnx_inputs, input_keys, past_key_value, with_past, False)
+
+    onnx_model_name = "model.onnx"
+    onnx_path: Path = Path(onnx_path_str).absolute()
+    if onnx_path.suffix != ".onnx":
+        onnx_path = onnx_path / onnx_model_name
+
+    do_export_internal(model, onnx_io_tuple, onnx_inputs, onnx_path, opset)
+    if not with_past:
+        return
+
+    onnx_io_tuple = fetch_onnx_inputs_outputs_name(model, onnx_inputs, input_keys, past_key_value, with_past, True)
+
+    onnx_model_name = "model_with_past.onnx"
+    onnx_path = onnx_path.parent / onnx_model_name
+
+    do_export_internal(model, onnx_io_tuple, onnx_inputs, onnx_path, opset)
+
+
+def parse_arguments():
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-m",
+        "--model",
+        required=True,
+        type=str,
+        default="meta-llama/Llama-2-70b-hf",
+        help="Pre-trained model name in the Hugging Face model hub",
+    )
+    parser.add_argument(
+        "-s",
+        "--saved_path",
+        required=False,
+        type=str,
+        default="./onnx_models/",
+        help="where the ONNX model will be saved",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        required=False,
+        type=str,
+        default=None,
+        help=("Hugging Face cache directory; set this to avoid re-downloading models you already have"),
+    )
+    parser.add_argument(
+        "--with_past",
+        action="store_true",
+        default=False,
+        help=("Also export a model that takes past key/values; by default only the model without them is exported"),
+    )
+    parser.add_argument(
+        "--opset",
+        required=False,
+        type=int,
+        default=17,
+        help=("The ONNX opset version to export with; try a newer opset if this one lacks the features you need"),
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+
+    export_onnx(args.model, args.cache_dir, args.saved_path, args.with_past, args.opset)
diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index
b4461a2eadb8c..6057b46667fe6 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -17,12 +17,31 @@ $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama To make this option compatible with [Hugging Face's Optimum](https://github.com/huggingface/optimum), you will need to create `config.json` and `generation_config.json` for your model and store them in the same directory as your ONNX models. For example, you can find those JSON files for LLaMA-2 7B on Hugging Face [here](https://huggingface.co/meta-llama/Llama-2-7b-hf). +As indicated in `requirements.txt`, you will also need to install Optimum from source. Once installed, you will need to modify `ORTModelForCausalLM.forward` in `optimum/optimum/onnxruntime/modeling_decoder.py` as follows: + +``` +# Before +if self.use_cache: + if past_key_values is not None: + input_ids = input_ids[:, -1:] + # Flatten the past_key_values (no need to flatten for models using multi-query attn) + + +# After +if self.use_cache: + if past_key_values is not None: + input_ids = input_ids[:, -1:] if past_key_values[0][0].shape[2] != 0 else input_ids + # Flatten the past_key_values (no need to flatten for models using multi-query attn) +``` + ### Option 2: from [Microsoft's custom export](https://github.com/microsoft/Llama-2-Onnx) Please follow the [README instructions](https://github.com/microsoft/Llama-2-Onnx#before-you-start) in the custom export of LLaMA-2. ### Option 3: from [Hugging Face Optimum](https://github.com/huggingface/optimum) +Note that this will produce two ONNX models whereas the above two options produce one ONNX model. + First, log into the Hugging Face CLI in your terminal: ``` @@ -56,38 +75,81 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input ./ $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input ./Llama-2-7b-hf --output ./llama2-7b ``` -Export for FP16 +Export for FP32 CUDA +``` +# From source: +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda + +# From wheel: +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp32 --precision fp32 --execution_provider cuda +``` + +Export for FP32 CPU ``` # From source: -$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp32-cpu --precision fp32 --execution_provider cpu # From wheel: -$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp32 --precision fp32 --execution_provider cpu ``` -Export for INT8 +Export for FP16 CUDA ``` # From source: -$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int8 --precision int8 --quantization_method smooth_quant +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda # From wheel: -$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int8 --precision int8 --quantization_method smooth_quant +$ python3 
-m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda +``` + +Export for INT8 CPU (SmoothQuant) +``` +# From source: +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int8 --precision int8 --quantization_method smooth_quant --execution_provider cpu + +# From wheel: +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int8 --precision int8 --quantization_method smooth_quant --execution_provider cpu ``` Note: [Intel's Neural Compressor](https://github.com/intel/neural-compressor) takes time to run the SmoothQuant quantization algorithm on LLMs. On an [Azure Standard_NC24s_v3 VM](https://learn.microsoft.com/en-us/azure/virtual-machines/ncv3-series), it takes about ~30-45 min for each of the exported ONNX models. +Export for INT8 CPU (DynamicQuant) +``` +# From source: +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int8 --precision int8 --quantization_method quantize_dynamic --execution_provider cpu + +# From wheel: +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int8 --precision int8 --quantization_method quantize_dynamic --execution_provider cpu +``` + +Export for INT4 CUDA +``` +# From source: +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-gpu --precision int4 --quantization_method blockwise --execution_provider cuda + +# From wheel: +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4 --precision int4 --quantization_method blockwise --execution_provider cuda +``` + +Export for INT4 CPU +``` +# From source: +$ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-cpu --precision int4 --quantization_method blockwise --execution_provider cpu + +# From wheel: +$ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4 --precision int4 --quantization_method blockwise --execution_provider cpu +``` + ## Benchmark LLaMA-2 Here are some examples of how you can benchmark LLaMA-2. -Note: In the below examples, `PyTorch` refers to running in PyTorch without `torch.compile` and `PyTorch 2.0` refers to running in PyTorch with `torch.compile`. - ### Variants -1. PyTorch (without `torch.compile`), FP32 +1. PyTorch without `torch.compile`, FP32 ``` python3 -m models.llama.benchmark \ - --benchmark-type hf-pt \ + --benchmark-type hf-pt-eager \ --model-name meta-llama/Llama-2-7b-hf \ --precision fp32 \ --batch-sizes "1 2" \ @@ -96,10 +158,10 @@ python3 -m models.llama.benchmark \ --auth ``` -2. PyTorch 2.0 (with `torch.compile`), FP16 +2. 
PyTorch with `torch.compile`, FP16 ``` python3 -m models.llama.benchmark \ - --benchmark-type hf-pt2 \ + --benchmark-type hf-pt-compile \ --model-name meta-llama/Llama-2-7b-hf \ --precision fp16 \ --batch-sizes "1 2" \ @@ -112,7 +174,7 @@ python3 -m models.llama.benchmark \ ``` python3 -m models.llama.benchmark \ --benchmark-type hf-ort \ - --hf-ort-model-path ./Llama-2-7b-hf-onnx/ \ + --hf-ort-dir-path ./Llama-2-7b-hf-onnx/ \ --model-name meta-llama/Llama-2-7b-hf \ --precision fp32 \ --batch-sizes "1 2" \ @@ -125,7 +187,7 @@ python3 -m models.llama.benchmark \ ``` python3 -m models.llama.benchmark \ --benchmark-type hf-ort \ - --hf-ort-model-path ./llama2-7b-fp16/ \ + --hf-ort-dir-path ./llama2-7b-fp16/ \ --model-name meta-llama/Llama-2-7b-hf \ --precision fp16 \ --batch-sizes "1 2" \ @@ -134,24 +196,35 @@ python3 -m models.llama.benchmark \ --auth ``` -5. Optimum + ONNX Runtime, INT8, export via convert_to_onnx +5. ONNX Runtime, FP32, Microsoft custom export ``` python3 -m models.llama.benchmark \ - --benchmark-type hf-ort \ - --hf-ort-model-path ./llama2-7b-int8/ \ + --benchmark-type ort-msft \ + --ort-model-path ./llama-2-onnx/7B_float32/ONNX/LlamaV2_7B_float32.onnx \ --model-name meta-llama/Llama-2-7b-hf \ - --precision int8 \ + --precision fp32 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ - --device cpu \ - --auth + --device cpu ``` -6. ONNX Runtime, FP32, Microsoft custom export +6. ONNX Runtime, FP16, Microsoft custom export ``` python3 -m models.llama.benchmark \ - --benchmark-type ort \ - --ort-model-path llama-2-onnx/7B_float32/ONNX/LlamaV2_7B_float32.onnx \ + --benchmark-type ort-msft \ + --ort-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \ + --model-name meta-llama/Llama-2-7b-hf \ + --precision fp16 \ + --batch-sizes "1 2" \ + --sequence-lengths "8 16" \ + --device cuda +``` + +7. ONNX Runtime, FP32, convert_to_onnx +``` +python3 -m models.llama.benchmark \ + --benchmark-type ort-convert-to-onnx \ + --ort-model-path ./llama2-7b/Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ --model-name meta-llama/Llama-2-7b-hf \ --precision fp32 \ --batch-sizes "1 2" \ @@ -159,11 +232,11 @@ python3 -m models.llama.benchmark \ --device cpu ``` -7. ONNX Runtime, FP16, Microsoft custom export +8. ONNX Runtime, FP16, convert_to_onnx ``` python3 -m models.llama.benchmark \ - --benchmark-type ort \ - --ort-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \ + --benchmark-type ort-convert-to-onnx \ + --ort-model-path ./llama2-7b/Llama-2-7b-hf_decoder_merged_model_fp16.onnx \ --model-name meta-llama/Llama-2-7b-hf \ --precision fp16 \ --batch-sizes "1 2" \ @@ -174,11 +247,14 @@ python3 -m models.llama.benchmark \ You can profile a variant by adding the `--profile` flag and providing one batch size and sequence length combination. ### Benchmark All -You can use `benchmark_all.py` to benchmark across various platforms and automatically store the results in a CSV file. Here is an example. +You can use `benchmark_all.py` to benchmark across various options and automatically store the results in a CSV file. Here is an example. 
``` python3 -m models.llama.benchmark_all \ - --hf-ort-model-path ./llama2-7b-fp16/ \ - --ort-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \ + --hf-pt-eager \ + --hf-pt-compile \ + --hf-ort-dir-path ./llama2-7b-fp16/ \ + --ort-convert-to-onnx-model-path ./llama2-7b-fp16/Llama-2-7b-hf_decoder_merged_model_fp16.onnx \ + --ort-msft-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \ --model-name meta-llama/Llama-2-7b-hf \ --precision fp16 \ --batch-sizes "1 2" \ diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark.py b/onnxruntime/python/tools/transformers/models/llama/benchmark.py index d19ed5cc28fed..976de2abc7c57 100644 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark.py @@ -8,10 +8,17 @@ import time import numpy as np +import onnx import psutil import torch from benchmark_helper import setup_logger -from llama_inputs import get_msft_sample_inputs, get_sample_inputs, get_sample_with_past_kv_inputs +from llama_inputs import ( + convert_inputs_for_ort, + get_merged_sample_with_past_kv_inputs, + get_msft_sample_inputs, + get_sample_inputs, + get_sample_with_past_kv_inputs, +) from optimum.onnxruntime import ORTModelForCausalLM from torch.profiler import ProfilerActivity, profile, record_function from tqdm import trange @@ -23,8 +30,29 @@ logger = logging.getLogger(__name__) -def get_inputs(args: argparse.Namespace): - if args.benchmark_type in {"hf-pt", "hf-pt2", "hf-ort"}: +# For determining whether the ONNX model can do both prompt generation and token generation or only one of the two +def get_ort_model_inputs_len(args, model): + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: + return 0 + if args.benchmark_type == "hf-ort": + try: + # New Optimum export (https://github.com/huggingface/optimum/blob/888332364c2e0091da1fc974737c7e277af168bf/optimum/onnxruntime/modeling_ort.py#L268) + return len(model.inputs_names) + except Exception: + # Old Optimum export (https://github.com/huggingface/optimum/blob/c5ad7f971cb0a494eac03dc0909f146725f999c5/optimum/onnxruntime/base.py#L54) + return len(model.decoder.input_names) + return len(model.get_inputs()) + + +def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int): + init_inputs, iter_inputs = None, None + + # For past_present_share_buffer: + # Set max_seq_len to 2048 for Hugging Face model since that is the default value + # Set max_seq_len to 2048 for Microsoft model since that is the max value currently supported + max_seq_len = 2048 + + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: init_inputs = get_sample_inputs( args.config, args.target_device, @@ -41,14 +69,95 @@ def get_inputs(args: argparse.Namespace): return_dict=True, ) - elif args.benchmark_type == "ort": + elif args.benchmark_type == "hf-ort": + if ort_model_inputs_len == 3: # [input_ids, attention_mask, position_ids] + # Using split models in Optimum (e.g. created by Optimum export) + init_inputs = get_sample_inputs( + args.config, + args.target_device, + args.batch_size, + args.sequence_length, + return_dict=True, + ) + iter_inputs = get_sample_with_past_kv_inputs( + args.config, + args.target_device, + args.batch_size, + args.sequence_length, + use_fp16=args.use_fp16, + return_dict=True, + ) + else: + # Using merged model in Optimum (e.g. 
created by convert_to_onnx export) + init_inputs = get_merged_sample_with_past_kv_inputs( + args.config, + args.target_device, + args.batch_size, + seq_len=args.sequence_length, + past_seq_len=0, + use_fp16=args.use_fp16, + return_dict=True, + ) + iter_inputs = get_merged_sample_with_past_kv_inputs( + args.config, + args.target_device, + args.batch_size, + seq_len=1, + past_seq_len=args.sequence_length, + use_fp16=args.use_fp16, + return_dict=True, + ) + + elif args.benchmark_type == "ort-convert-to-onnx": + # Microsoft export from convert_to_onnx + init_inputs = get_merged_sample_with_past_kv_inputs( + args.config, + args.target_device, + args.batch_size, + seq_len=args.sequence_length, + past_seq_len=0, + use_fp16=args.use_fp16, + return_dict=True, + ) + iter_inputs = get_merged_sample_with_past_kv_inputs( + args.config, + args.target_device, + args.batch_size, + seq_len=1, + past_seq_len=args.sequence_length, + use_fp16=args.use_fp16, + return_dict=True, + ) + init_inputs = convert_inputs_for_ort( + init_inputs, + use_fp16=args.use_fp16, + use_buffer_share=args.past_present_share_buffer, + past_seq_len=0, + max_seq_len=max_seq_len, + device=args.device, + device_id=args.device_id, + ) + iter_inputs = convert_inputs_for_ort( + iter_inputs, + use_fp16=args.use_fp16, + use_buffer_share=args.past_present_share_buffer, + past_seq_len=args.sequence_length, + max_seq_len=max_seq_len, + device=args.device, + device_id=args.device_id, + ) + + elif args.benchmark_type == "ort-msft": # Microsoft export from https://github.com/microsoft/Llama-2-Onnx + split_kv = ort_model_inputs_len > 5 # original inputs: [x, attn_mask, k_cache, v_cache, pos] + init_inputs = get_msft_sample_inputs( args.config, args.batch_size, past_seq_len=0, seq_len=args.sequence_length, use_fp16=args.use_fp16, + split_kv=split_kv, ) iter_inputs = get_msft_sample_inputs( args.config, @@ -56,6 +165,25 @@ def get_inputs(args: argparse.Namespace): past_seq_len=args.sequence_length, seq_len=1, use_fp16=args.use_fp16, + split_kv=split_kv, + ) + init_inputs = convert_inputs_for_ort( + init_inputs, + use_fp16=args.use_fp16, + use_buffer_share=args.past_present_share_buffer, + past_seq_len=0, + max_seq_len=max_seq_len, + device=args.device, + device_id=args.device_id, + ) + iter_inputs = convert_inputs_for_ort( + iter_inputs, + use_fp16=args.use_fp16, + use_buffer_share=args.past_present_share_buffer, + past_seq_len=args.sequence_length, + max_seq_len=max_seq_len, + device=args.device, + device_id=args.device_id, ) else: @@ -69,12 +197,14 @@ def get_model(args: argparse.Namespace): start_time, end_time = None, None # There are multiple sources that the model could come from: - # 1) Benchmark LLaMA from unofficial source on Hugging Face - # 2) Benchmark LLaMA from official source on Hugging Face, which requires an authentication token - # 3) Benchmark LLaMA from local download of model - - if args.benchmark_type in {"hf-pt", "hf-pt2"}: - source = args.hf_pt_model_path if args.hf_pt_model_path else args.model_name + # 1) Benchmark LLaMA-2 from unofficial source on Hugging Face + # 2) Benchmark LLaMA-2 from official source on Hugging Face, which requires an authentication token + # 3) Benchmark LLaMA-2 from local download of model + # 4) Benchmark LLaMA-2 from Microsoft (already optimized, available at https://github.com/microsoft/Llama-2-Onnx) + # 5) Benchmark LLaMA-2 from convert_to_onnx + + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: + source = args.hf_pt_dir_path if args.hf_pt_dir_path else args.model_name start_time = 
time.time() model = LlamaForCausalLM.from_pretrained( source, @@ -84,10 +214,10 @@ def get_model(args: argparse.Namespace): ).to(args.target_device) end_time = time.time() - if args.benchmark_type == "hf-pt2": + if args.benchmark_type == "hf-pt-compile": model = torch.compile(model) - elif args.benchmark_type in {"hf-ort", "ort"}: + elif args.benchmark_type in {"hf-ort", "ort-msft", "ort-convert-to-onnx"}: sess_options = ort.SessionOptions() sess_options.enable_profiling = args.profile if args.verbose: @@ -104,32 +234,33 @@ def get_model(args: argparse.Namespace): decoder_file_name = None decoder_with_past_file_name = None - for filename in os.listdir(args.hf_ort_model_path): + for filename in os.listdir(args.hf_ort_dir_path): if ".onnx" not in filename or ".onnx_data" in filename or ".onnx.data" in filename: continue - if "decoder_model.onnx" in filename or f"decoder_model_{args.precision}.onnx" in filename: + if "decoder_model" in filename or filename == "model.onnx": + decoder_file_name = filename + if "decoder_with_past_model" in filename: + decoder_with_past_file_name = filename + if "decoder_merged_model" in filename: decoder_file_name = filename - if ( - "decoder_with_past_model.onnx" in filename - or f"decoder_with_past_model_{args.precision}.onnx" in filename - ): decoder_with_past_file_name = filename start_time = time.time() model = ORTModelForCausalLM.from_pretrained( - args.hf_ort_model_path, + args.hf_ort_dir_path, decoder_file_name=decoder_file_name, decoder_with_past_file_name=decoder_with_past_file_name, use_auth_token=args.auth, use_io_binding=(args.device != "cpu"), + use_merged=(True if decoder_file_name == "model.onnx" else None), provider=provider, provider_options=provider_options, session_options=sess_options, ) end_time = time.time() - if args.benchmark_type == "ort": - # Microsoft export from https://github.com/microsoft/Llama-2-Onnx + if args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}: + # Ex: Microsoft export from https://github.com/microsoft/Llama-2-Onnx logger.info(f"Loading model from {args.ort_model_path}") start_time = time.time() model = ort.InferenceSession( @@ -140,7 +271,6 @@ def get_model(args: argparse.Namespace): end_time = time.time() logger.info(f"Loaded model in {end_time - start_time} s") - return model @@ -148,7 +278,7 @@ def time_fn(args, fn, inputs): # Warm up warmup_range = ( range(args.warmup_runs) - if args.benchmark_type == "ort" + if args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"} else trange(args.warmup_runs, file=sys.stdout, desc="Warm up") ) @@ -166,7 +296,7 @@ def time_fn(args, fn, inputs): bench_range = ( range(args.num_runs) - if args.benchmark_type == "ort" + if args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"} else trange(args.num_runs, file=sys.stdout, desc="Benchmark") ) for _ in bench_range: @@ -177,7 +307,7 @@ def time_fn(args, fn, inputs): end_time = time.time() # Newline print after trange in order to print metrics on new lines without progress bar on same line - if args.benchmark_type != "ort": + if args.benchmark_type not in {"ort-msft", "ort-convert-to-onnx"}: logger.info("") latency = (end_time - start_time) / args.num_runs @@ -186,7 +316,7 @@ def time_fn(args, fn, inputs): logger.info(f"Batch Size: {args.batch_size}") logger.info(f"Sequence Length: {args.sequence_length}") logger.info(f"Latency: {latency} s") - logger.info(f"Throughput: {throughput} qps") + logger.info(f"Throughput: {throughput} tps") return @@ -196,7 +326,7 @@ def profile_fn(args, fn, inputs, inputs_type): prefix = 
f"b{args.batch_size}_s{args.sequence_length}_{args.benchmark_type.lower()}-{args.precision}-{args.device}_{fn.__name__.replace('_', '-')}_{inputs_type}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}" filename = None - if args.benchmark_type in {"hf-pt", "hf-pt2"}: + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: # Profile PyTorch kernels with profile( # noqa: SIM117 activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True @@ -267,7 +397,7 @@ def get_logits(inputs): generate_fn = get_logits - if args.benchmark_type == "hf-pt2": + if args.benchmark_type == "hf-pt-compile": # Run forward pass once with each set of inputs to process through Dynamo generate_fn(init_inputs) generate_fn(iter_inputs) @@ -280,7 +410,7 @@ def get_logits(inputs): logger.warning(f"Renaming {old_logname} to {new_logname}") os.rename(old_logname, os.path.join(args.log_folder, new_logname)) - new_logname = profile_fn(args, generate_fn, iter_inputs, "per-token") + new_logname = profile_fn(args, generate_fn, iter_inputs, "token") if args.benchmark_type == "hf-ort": # Turn profiling off to stop appending to log old_logname = model.decoder_with_past.session.end_profiling() @@ -319,10 +449,24 @@ def prepare_ort_inputs(inputs): # Add IO bindings for non-CPU execution providers if args.device != "cpu": io_binding = model.io_binding() + for k, v in inputs.items(): - io_binding.bind_cpu_input(k, v) + if args.past_present_share_buffer: + # Bind all OrtValue inputs to device + io_binding.bind_ortvalue_input(k, v) + else: + io_binding.bind_cpu_input(k, v) + for output in model.get_outputs(): - io_binding.bind_output(output.name) + name = output.name + if args.past_present_share_buffer and ("out" in name or "present" in name): + # Bind present KV cache outputs to OrtValue with buffer sharing + io_binding.bind_ortvalue_output( + name, inputs[name.replace("out", "cache").replace("present", "past_key_values")] + ) + else: + io_binding.bind_output(name, device_type=args.device, device_id=args.device_id) + return io_binding return inputs @@ -350,7 +494,7 @@ def without_io_binding(inputs): # Re-initialize model for new log file instead of appending to old log file model = get_model(args) ort_iter_inputs = prepare_ort_inputs(iter_inputs) - new_logname = profile_fn(args, generate_fn, ort_iter_inputs, "per-token") + new_logname = profile_fn(args, generate_fn, ort_iter_inputs, "token") # Turn profiling off to stop appending to log old_logname = model.end_profiling() @@ -371,9 +515,9 @@ def without_io_binding(inputs): def run_inference(args, init_inputs, iter_inputs, model): - if args.benchmark_type in {"hf-pt", "hf-pt2", "hf-ort"}: + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile", "hf-ort"}: run_hf_inference(args, init_inputs, iter_inputs, model) - elif args.benchmark_type == "ort": + elif args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}: run_ort_inference(args, init_inputs, iter_inputs, model) else: raise Exception(f"Cannot recognize {args.benchmark_type}") @@ -382,7 +526,11 @@ def run_inference(args, init_inputs, iter_inputs, model): def get_args(): parser = argparse.ArgumentParser() parser.add_argument( - "-bt", "--benchmark-type", type=str, required=True, choices=["hf-pt", "hf-pt2", "hf-ort", "ort"] + "-bt", + "--benchmark-type", + type=str, + required=True, + choices=["hf-pt-eager", "hf-pt-compile", "hf-ort", "ort-msft", "ort-convert-to-onnx"], ) parser.add_argument( "-m", @@ -402,20 +550,20 @@ def get_args(): required=True, type=str, default="fp32", - 
choices=["int8", "fp16", "fp32"], + choices=["int4", "int8", "fp16", "fp32"], help="Precision for model. For ONNX models, the model's precision should be set before running this script.", ) parser.add_argument( - "--hf-pt-model-path", + "--hf-pt-dir-path", type=str, default="", help="Path to directory containing all PyTorch files (e.g. tokenizer, PyTorch model)", ) parser.add_argument( - "--hf-ort-model-path", + "--hf-ort-dir-path", type=str, default="", - help="Path to directory containing all ONNX files (e.g. tokenizer, encoder, decoder, decoder_with_past)", + help="Path to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)", ) parser.add_argument( "--ort-model-path", @@ -475,15 +623,20 @@ def get_args(): args.execution_provider = (args.execution_provider, {"device_id": args.device_id}) args.device = "cuda" - # Check that model paths have been specified for any benchmarking with ORT + # Check that paths have been specified for any benchmarking with ORT if args.benchmark_type == "hf-ort": - assert args.hf_ort_model_path, "Please specify a path to `--hf-ort-model-path`" - if args.benchmark_type == "ort": + assert args.hf_ort_dir_path, "Please specify a path to `--hf-ort-dir-path`" + if args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}: assert args.ort_model_path, "Please specify a path to `--ort-model-path`" args.batch_sizes = args.batch_sizes.split(" ") args.sequence_lengths = args.sequence_lengths.split(" ") + # Use FP32 precision for FP32, INT8, INT4 CPU models, use FP16 precision for FP16 and INT4 GPU models + args.precision = ( + "fp32" if args.precision in {"int8", "fp32"} or (args.precision == "int4" and args.device == "cpu") else "fp16" + ) + # Check that only one (batch_size, sequence_length) combination is set for profiling if args.profile: assert ( @@ -509,14 +662,27 @@ def main(): setattr(args, "target_device", target_device) # noqa: B010 setattr(args, "use_fp16", use_fp16) # noqa: B010 - # Measure prompt cost (init_inputs) and generated token cost (iter_inputs) + # Get model and model info model = get_model(args) + ort_model_inputs_len = get_ort_model_inputs_len(args, model) + + # Check if past_present_share_buffer can be enabled (only for FP16 models with GQA) + if args.benchmark_type in {"ort-convert-to-onnx", "ort-msft"}: + onnx_model = onnx.load_model(args.ort_model_path, load_external_data=False) + gqa_nodes = list(filter(lambda node: node.op_type == "GroupQueryAttention", onnx_model.graph.node)) + + use_buffer_share = use_fp16 and len(gqa_nodes) > 0 and args.device != "cpu" + setattr(args, "past_present_share_buffer", use_buffer_share) # noqa: B010 + else: + setattr(args, "past_present_share_buffer", False) # noqa: B010 + + # Measure prompt cost (init_inputs) and generated token cost (iter_inputs) for batch_size, sequence_length in itertools.product(args.batch_sizes, args.sequence_lengths): logger.info(f"\nBatch size = {batch_size} and sequence length = {sequence_length}...") setattr(args, "batch_size", int(batch_size)) # noqa: B010 setattr(args, "sequence_length", int(sequence_length)) # noqa: B010 - init_inputs, iter_inputs = get_inputs(args) + init_inputs, iter_inputs = get_inputs(args, ort_model_inputs_len) run_inference(args, init_inputs, iter_inputs, model) diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py b/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py index 7199c945fe6ba..951b2549368f7 100644 --- 
a/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py @@ -43,15 +43,38 @@ def get_args(): ) parser.add_argument( - "--hf-ort-model-path", + "--hf-pt-eager", + default=False, + action="store_true", + help="Benchmark in PyTorch without `torch.compile`", + ) + + parser.add_argument( + "--hf-pt-compile", + default=False, + action="store_true", + help="Benchmark in PyTorch with `torch.compile`", + ) + + parser.add_argument( + "--hf-ort-dir-path", type=str, + default="", help="Path to folder containing ONNX models for Optimum + ORT benchmarking", ) parser.add_argument( - "--ort-model-path", + "--ort-msft-model-path", + type=str, + default="", + help="Path to ONNX model from https://github.com/microsoft/Llama-2-Onnx", + ) + + parser.add_argument( + "--ort-convert-to-onnx-model-path", type=str, - help="Path to ONNX model for ORT benchmarking", + default="", + help="Path to ONNX model from convert_to_onnx", ) parser.add_argument( @@ -65,7 +88,7 @@ def get_args(): "--precision", type=str, required=True, - choices=["int8", "fp16", "fp32"], + choices=["int4", "int8", "fp16", "fp32"], help="Precision to run model", ) @@ -138,8 +161,6 @@ def process_log_file(device_id, log_file, base_results): step = "per-token" elif latency_pattern in line: latency_s = float(line[len(latency_pattern) : line.rfind(" ")]) - if step == "prompt": - latency_s /= sequence_length latency_ms = latency_s * 1000 elif throughput_pattern in line: throughput = float(line[len(throughput_pattern) : line.rfind(" ")]) @@ -184,7 +205,7 @@ def save_results(results, filename): "Step", "Latency (s)", "Latency (ms)", - "Throughput (qps)", + "Throughput (tps)", "Memory (GB)", ], ) @@ -194,7 +215,7 @@ def save_results(results, filename): df["Sequence Length"] = df["Sequence Length"].astype("int") df["Latency (s)"] = df["Latency (s)"].astype("float") df["Latency (ms)"] = df["Latency (ms)"].astype("float") - df["Throughput (qps)"] = df["Throughput (qps)"].astype("float") + df["Throughput (tps)"] = df["Throughput (tps)"].astype("float") df["Memory (GB)"] = df["Memory (GB)"].astype("float") df.to_csv(filename, index=False) @@ -226,75 +247,81 @@ def main(): torch.backends.cudnn.benchmark = True all_results = [] + # Benchmark PyTorch without torch.compile - benchmark_cmd = [ - "python3", - "benchmark.py", - "--benchmark-type", - "hf-pt", - "--model-name", - args.model_name, - "--precision", - args.precision, - "--batch-sizes", - args.batch_sizes, - "--sequence-lengths", - args.sequence_lengths, - "--device", - args.device, - "--device-id", - str(args.device_id), - "--warmup-runs", - str(args.warmup_runs), - "--num-runs", - str(args.num_runs), - "--log-folder", - args.log_folder, - "--auth", - ] - logger.info("Benchmark PyTorch without torch.compile") - results = benchmark(args, benchmark_cmd, "pytorch") - all_results.extend(results) + if args.hf_pt_eager: + benchmark_cmd = [ + "python", + "-m", + "models.llama.benchmark", + "--benchmark-type", + "hf-pt-eager", + "--model-name", + args.model_name, + "--precision", + args.precision, + "--batch-sizes", + args.batch_sizes, + "--sequence-lengths", + args.sequence_lengths, + "--device", + args.device, + "--device-id", + str(args.device_id), + "--warmup-runs", + str(args.warmup_runs), + "--num-runs", + str(args.num_runs), + "--log-folder", + args.log_folder, + "--auth", + ] + logger.info("Benchmark PyTorch without torch.compile") + results = benchmark(args, benchmark_cmd, "pytorch-eager") + 
all_results.extend(results) # Benchmark PyTorch with torch.compile - benchmark_cmd = [ - "python3", - "benchmark.py", - "--benchmark-type", - "hf-pt2", - "--model-name", - args.model_name, - "--precision", - args.precision, - "--batch-sizes", - args.batch_sizes, - "--sequence-lengths", - args.sequence_lengths, - "--device", - args.device, - "--device-id", - str(args.device_id), - "--warmup-runs", - str(args.warmup_runs), - "--num-runs", - str(args.num_runs), - "--log-folder", - args.log_folder, - "--auth", - ] - logger.info("Benchmark PyTorch with torch.compile") - results = benchmark(args, benchmark_cmd, "pytorch-2") - all_results.extend(results) + if args.hf_pt_compile: + benchmark_cmd = [ + "python", + "-m", + "models.llama.benchmark", + "--benchmark-type", + "hf-pt-compile", + "--model-name", + args.model_name, + "--precision", + args.precision, + "--batch-sizes", + args.batch_sizes, + "--sequence-lengths", + args.sequence_lengths, + "--device", + args.device, + "--device-id", + str(args.device_id), + "--warmup-runs", + str(args.warmup_runs), + "--num-runs", + str(args.num_runs), + "--log-folder", + args.log_folder, + "--auth", + ] + logger.info("Benchmark PyTorch with torch.compile") + results = benchmark(args, benchmark_cmd, "pytorch-compile") + all_results.extend(results) # Benchmark Optimum + ONNX Runtime - if args.hf_ort_model_path: + if args.hf_ort_dir_path: benchmark_cmd = [ - "python3", - "benchmark.py", + "python", + "-m", + "models.llama.benchmark", "--benchmark-type", "hf-ort", - "--hf-ort-model-path", - args.hf_ort_model_path, + "--hf-ort-dir-path", + args.hf_ort_dir_path, "--model-name", args.model_name, "--precision", @@ -316,18 +343,52 @@ def main(): "--auth", ] logger.info("Benchmark Optimum + ONNX Runtime") - results = benchmark(args, benchmark_cmd, "pytorch-ort") + results = benchmark(args, benchmark_cmd, "optimum-ort") + all_results.extend(results) + + # Benchmark Microsoft model in ONNX Runtime + if args.ort_msft_model_path: + benchmark_cmd = [ + "python", + "-m", + "models.llama.benchmark", + "--benchmark-type", + "ort-msft", + "--ort-model-path", + args.ort_msft_model_path, + "--model-name", + args.model_name, + "--precision", + args.precision, + "--batch-sizes", + args.batch_sizes, + "--sequence-lengths", + args.sequence_lengths, + "--device", + args.device, + "--device-id", + str(args.device_id), + "--warmup-runs", + str(args.warmup_runs), + "--num-runs", + str(args.num_runs), + "--log-folder", + args.log_folder, + ] + logger.info("Benchmark Microsoft model in ONNX Runtime") + results = benchmark(args, benchmark_cmd, "ort-msft") all_results.extend(results) - # Benchmark ONNX Runtime - if args.ort_model_path: + # Benchmark convert_to_onnx model in ONNX Runtime + if args.ort_convert_to_onnx_model_path: benchmark_cmd = [ - "python3", - "benchmark.py", + "python", + "-m", + "models.llama.benchmark", "--benchmark-type", - "ort", + "ort-convert-to-onnx", "--ort-model-path", - args.ort_model_path, + args.ort_convert_to_onnx_model_path, "--model-name", args.model_name, "--precision", @@ -347,7 +408,7 @@ def main(): "--log-folder", args.log_folder, ] - logger.info("Benchmark ONNX Runtime") + logger.info("Benchmark convert_to_onnx model in ONNX Runtime") results = benchmark(args, benchmark_cmd, "onnxruntime") all_results.extend(results) diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index f96347ba67aa6..61d71bc38f4e9 100644 --- 
a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -8,12 +8,16 @@ import onnx import torch from benchmark_helper import Precision, prepare_environment, setup_logger -from llama_inputs import get_sample_inputs, get_sample_with_past_kv_inputs +from convert_generation import replace_mha_with_gqa +from llama_inputs import get_merged_sample_with_past_kv_inputs, get_sample_inputs, get_sample_with_past_kv_inputs from llama_parity import main as parity_check from onnx_model import OnnxModel +from optimizer import optimize_model +from packaging import version from transformers import LlamaConfig, LlamaForCausalLM from onnxruntime import quantization as ort_quantization +from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer logger = logging.getLogger("") @@ -58,6 +62,33 @@ def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: Li return dynamic_axes +def get_merged_model_dynamic_axes(input_names: List[str], output_names: List[str]): + dynamic_axes = {} + for name in input_names + output_names: + if name in {"input_ids", "position_ids"}: + # shape is (batch_size, sequence_length) + dynamic_axes[name] = {0: "batch_size", 1: "sequence_length"} + elif name == "attention_mask": + # shape is (batch_size, past_sequence_length + sequence_length) = (batch_size, total_sequence_length) + # for prompt generation, past_sequence_length = 0 + # for token generation, sequence_length = 1 + dynamic_axes[name] = {0: "batch_size", 1: "total_sequence_length"} + elif "past" in name: + # shape is (batch_size, num_heads, past_sequence_length, head_size) + dynamic_axes[name] = {0: "batch_size", 2: "past_sequence_length"} + elif name == "logits": + # shape is (batch_size, sequence_length, vocab_size) + dynamic_axes[name] = {0: "batch_size", 1: "sequence_length"} + elif "present" in name: + # shape is (batch_size, num_heads, past_sequence_length + sequence_length, head_size) = (batch_size, num_heads, total_sequence_length, head_size) + # for prompt generation, past_sequence_length = 0 + # for token generation, sequence_length = 1 + dynamic_axes[name] = {0: "batch_size", 2: "total_sequence_length"} + else: + raise Exception("Unknown input or output name found") + return dynamic_axes + + def save_onnx_model(onnx_model: onnx.ModelProto, output_path: str, data_path: str): onnx.save( onnx_model, @@ -152,7 +183,7 @@ def run_dynamo_export(args: argparse.Namespace, l_config: LlamaConfig, llama: Ll logger.info(f"The {args.model_name} ONNX model has been successfully created with the Dynamo exporter!") -def run_torchscript_export(args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM): +def run_torchscript_separate_export(args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM): # Dummy values for export batch_size, sequence_length = 2, 8 device = torch.device("cpu") @@ -248,12 +279,206 @@ def run_torchscript_export(args: argparse.Namespace, l_config: LlamaConfig, llam logger.info(f"The {args.model_name} ONNX model has been successfully created with the TorchScript exporter!") +def run_torchscript_merged_export(args: argparse.Namespace, l_config: LlamaConfig, llama: LlamaForCausalLM): + # Dummy values for export + batch_size, sequence_length, past_sequence_length = 2, 8, 0 + device = torch.device("cpu") + + # Export decoder_merged_model.onnx + decoder_merged_inputs = get_merged_sample_with_past_kv_inputs( + l_config, device, batch_size, sequence_length, 
past_sequence_length + ) + input_names = [ + "input_ids", + "attention_mask", + "position_ids", + *list( + chain.from_iterable( + (f"past_key_values.{i}.key", f"past_key_values.{i}.value") for i in range(l_config.num_hidden_layers) + ) + ), + ] + output_names = [ + "logits", + *list( + chain.from_iterable((f"present.{i}.key", f"present.{i}.value") for i in range(l_config.num_hidden_layers)) + ), + ] + dynamic_axes = get_merged_model_dynamic_axes(input_names, output_names) + temp_dir = tempfile.TemporaryDirectory() + temp_path = os.path.join(temp_dir.name, "temp.onnx") + torch.onnx.export( + llama, + args=decoder_merged_inputs, + f=temp_path, + export_params=True, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + opset_version=13, + do_constant_folding=True, + verbose=args.verbose, + ) + + # Check decoder_merged_model.onnx and save all external data to one file + onnx.checker.check_model(temp_path) + onnx.shape_inference.infer_shapes_path(temp_path) + + output_path = os.path.join(args.output, f"{args.model_name}_decoder_merged_model_fp32.onnx") + onnx_model = onnx.load_model(temp_path, load_external_data=True) + save_onnx_model( + onnx_model, + output_path, + f"{args.model_name}_decoder_merged_model_fp32.onnx.data", + ) + del onnx_model + temp_dir.cleanup() + + logger.info(f"The {args.model_name} ONNX model has been successfully created with the TorchScript exporter!") + + +# Optimize the model as FP32 +def optimize_export(config: LlamaConfig, input_path: str, output_path: str): + from fusion_options import FusionOptions + + optimization_options = FusionOptions("gpt2") + + model_opt = optimize_model( + input_path, + model_type="gpt2", + num_heads=config.num_attention_heads, + hidden_size=config.hidden_size, + opt_level=0, + optimization_options=optimization_options, + only_onnxruntime=False, + ) + model_opt.save_model_to_file(output_path, use_external_data_format=True) + logger.info(f"The ONNX model at {input_path} has been successfully optimized and saved at {output_path}!") + remove_existing_model(input_path) + + +def convert_to_float16(args: argparse.Namespace, config: LlamaConfig, old_paths: List[str]): + decoder_model_fp16_path = os.path.join(args.output, f"{args.model_name}_decoder_model_fp16.onnx") + decoder_with_past_model_fp16_path = os.path.join( + args.output, f"{args.model_name}_decoder_with_past_model_fp16.onnx" + ) + decoder_merged_model_fp16_path = os.path.join(args.output, f"{args.model_name}_decoder_merged_model_fp16.onnx") + new_paths = [decoder_model_fp16_path, decoder_with_past_model_fp16_path, decoder_merged_model_fp16_path] + + logger.info("Converting to float16...") + for fp32_path, fp16_path in zip(old_paths, new_paths): + if os.path.exists(fp32_path): + model = OnnxModel(onnx.load_model(fp32_path, load_external_data=True)) + model.convert_float_to_float16(keep_io_types=False) + model = use_group_query_attention(config, model) + model.save_model_to_file(fp16_path, use_external_data_format=True) + del model + logger.info(f"The ONNX model at {fp32_path} has been converted to float16 and saved at {fp16_path}!") + remove_existing_model(fp32_path) + + logger.info(f"The {args.model_name} ONNX model has been successfully converted to float16!") + return new_paths + + +def use_group_query_attention(config: LlamaConfig, fp16_model_opt: OnnxModel): + # Replace MultiHeadAttention with GroupQueryAttention and remove attention mask nodes + fp16_model_opt = replace_mha_with_gqa(fp16_model_opt, "past_sequence_length", config.num_key_value_heads) 
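+    # After the MHA -> GQA swap, prune the now-dangling attention-mask subgraph and drop unused graph inputs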
+ fp16_model_opt.prune_graph() + fp16_model_opt.update_graph(allow_remove_graph_inputs=True) + return fp16_model_opt + + +def smooth_quant( + args: argparse.Namespace, + decoder_model_fp32_path: str, + decoder_with_past_model_fp32_path: str, + decoder_model_int8_path: str, + decoder_with_past_model_int8_path: str, +): + from neural_compressor import PostTrainingQuantConfig + from neural_compressor import quantization as intel_quantization + from neural_compressor import set_workspace + from onnx.external_data_helper import load_external_data_for_model + from quant_kv_dataloader import QuantKVDataLoader + + set_workspace(args.nc_workspace) + quantization_config = PostTrainingQuantConfig( + calibration_sampling_size=[args.calibration_sampling_size], + recipes={ + "optypes_to_exclude_output_quant": ["MatMul"], + "smooth_quant": args.smooth_quant, + "smooth_quant_args": {"alpha": args.smooth_quant_alpha}, + }, + op_type_dict={ + "^((?!(MatMul|Gather|Conv)).)*$": { + "weight": {"dtype": ["fp32"]}, + "activation": {"dtype": ["fp32"]}, + } + }, + ) + + # Convert decoder_model.onnx to INT8 + decoder_model_int8 = intel_quantization.fit( + decoder_model_fp32_path, + quantization_config, + calib_dataloader=QuantKVDataLoader(args), + ) + load_external_data_for_model( + decoder_model_int8._model, + os.path.split(decoder_model_int8._model_path)[0], + ) + save_onnx_model( + decoder_model_int8._model, + decoder_model_int8_path, + f"{args.model_name}_decoder_model_int8.onnx.data", + ) + del decoder_model_int8 + logger.info( + f"The ONNX model at {decoder_model_fp32_path} has been quantized to int8 and saved at {decoder_model_int8_path}!" + ) + remove_existing_model(decoder_model_fp32_path) + + # Convert decoder_with_past_model.onnx to INT8 + decoder_with_past_model_int8 = intel_quantization.fit( + decoder_with_past_model_fp32_path, + quantization_config, + calib_dataloader=QuantKVDataLoader(args, onnx_model_path=decoder_model_fp32_path), + ) + load_external_data_for_model( + decoder_with_past_model_int8._model, + os.path.split(decoder_with_past_model_int8._model_path)[0], + ) + save_onnx_model( + decoder_with_past_model_int8._model, + decoder_with_past_model_int8_path, + f"{args.model_name}_decoder_with_past_model_int8.onnx.data", + ) + del decoder_with_past_model_int8 + logger.info( + f"The ONNX model at {decoder_with_past_model_fp32_path} has been quantized to int8 and saved at {decoder_with_past_model_int8_path}!" 
+ ) + remove_existing_model(decoder_with_past_model_fp32_path) + + logger.info(f"The {args.model_name} ONNX model has been successfully quantized to int8!") + + logger.info(f"Removing {args.nc_workspace}") + os.system(f"rm -R {args.nc_workspace}") + + +def remove_existing_model(model_path: str): + # Remove ONNX model and its external data + data_path = os.path.join(model_path + ".data") + os.remove(model_path) + os.remove(data_path) + logger.warning(f"Removed {model_path} and {data_path}") + + def remove_existing_files(output_path: str): for filename in os.listdir(output_path): filepath = os.path.join(output_path, filename) if ".onnx" in filename or ".onnx.data" in filename: os.remove(filepath) - logger.warning(f"Removing {filepath}") + logger.warning(f"Removed {filepath}") def get_args(): @@ -288,7 +513,7 @@ def get_args(): required=False, type=Precision, default=Precision.FLOAT32, - choices=[Precision.FLOAT32, Precision.FLOAT16, Precision.INT8], + choices=[Precision.FLOAT32, Precision.FLOAT16, Precision.INT8, Precision.INT4], help="Precision to export model in", ) @@ -301,15 +526,51 @@ def get_args(): help="Execution provider to verify parity with", ) + parser.add_argument( + "-id", + "--device-id", + required=False, + type=str, + default="0", + help="Device ID for GPUs", + ) + + parser.add_argument( + "-r", + "--reexport", + required=False, + action="store_true", + help="Re-export models and overwrite existing models in output folder", + ) + parser.set_defaults(reexport=False) + + parser.add_argument( + "--no_merged", + required=False, + action="store_true", + help="Export models into 2 ONNX files instead of 1. Deprecated in favor of exporting into 1 ONNX file.", + ) + parser.set_defaults(no_merged=False) + parser.add_argument( "-q", "--quantization_method", default="", - choices=["smooth_quant", "quantize_dynamic"], - help="Run a specific quantization algorithm. Need to install extra packages in `requirements-quant.txt` for SmoothQuant.", + choices=["blockwise", "smooth_quant", "quantize_dynamic"], + help="Run a specific quantization algorithm (blockwise for int4, smooth_quant for int8, quantize_dynamic for int8). Blockwise is recommended. Need to install extra packages in `requirements-quant.txt` for SmoothQuant.", ) - smooth_quant_group = parser.add_argument_group("smooth_quant") + blockwise_group = parser.add_argument_group("4-bit quantization") + + blockwise_group.add_argument( + "--block_size", + required=False, + default=32, + type=int, + help="Block size to quantize with. See https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py for details.", + ) + + smooth_quant_group = parser.add_argument_group("smooth_quant (8-bit quantization)") smooth_quant_group.add_argument( "--smooth_quant_alpha", @@ -352,7 +613,7 @@ def get_args(): help="Workspace to save intermediate files generated by Intel's Neural Compressor package.", ) - quantize_dynamic_group = parser.add_argument_group("quantize_dynamic") + quantize_dynamic_group = parser.add_argument_group("quantize_dynamic (8-bit quantization)") quantize_dynamic_group.add_argument( "--quantize_embedding_layer", @@ -399,177 +660,193 @@ def get_args(): def main(): + if version.parse(torch.__version__) < version.parse("2.2.0") and "2.2.0.dev" not in torch.__version__: + # Second predicate is for comparing nightly (ex: 2.2.0.dev20230920 vs 2.2.0) since first predicate is false + # in that scenario. It can be removed when torch v2.2.0 is released in stable. 
+ logger.error(f"Detected PyTorch version {torch.__version__}. Please upgrade and use v2.2.0 or newer.") + return + args = get_args() setup_logger(args.verbose) prepare_environment(args.input, args.output, args.execution_provider != "cpu") - remove_existing_files(args.output) + if args.reexport: + remove_existing_files(args.output) logger.info(f"Arguments: {args}") # Load model and config use_auth_token = args.input == os.path.join(".") setattr(args, "use_auth_token", use_auth_token) # noqa: B010 - l_config = LlamaConfig.from_pretrained( - args.model_name if use_auth_token else args.input, use_auth_token=use_auth_token - ) - llama = LlamaForCausalLM.from_pretrained( - args.model_name if use_auth_token else args.input, use_auth_token=use_auth_token, use_cache=True - ) + + location = args.model_name if use_auth_token else args.input + l_config = LlamaConfig.from_pretrained(location, use_auth_token=use_auth_token) + llama = LlamaForCausalLM.from_pretrained(location, use_auth_token=use_auth_token, use_cache=True) original_model_name = args.model_name setattr(args, "original_model_name", original_model_name) # noqa: B010 args.model_name = args.model_name.split("/")[-1] - # Export to ONNX - if args.use_dynamo_export: - logger.warning("Please ensure you have installed PyTorch, ONNX, and ONNX Script as follows.") - logger.warning("Step 1 - PyTorch nightly: https://pytorch.org/get-started/locally/") - logger.warning("Step 2 - ONNX weekly: https://pypi.org/project/onnx-weekly/") - logger.warning( - "Step 3 - ONNX Script from source: https://github.com/microsoft/onnxscript#installing-onnx-script" - ) - logger.warning( - "Note: After you install ONNX weekly, omit `onnx` when running the first line for installing ONNX Script. This is because you already installed `onnx-weekly` in the previous step." - ) - run_dynamo_export(args, l_config, llama) - else: - run_torchscript_export(args, l_config, llama) - - # Change precision of exported models if not FP32 + # Set model paths for FP32 model decoder_model_fp32_path = os.path.join(args.output, f"{args.model_name}_decoder_model_fp32.onnx") decoder_with_past_model_fp32_path = os.path.join( args.output, f"{args.model_name}_decoder_with_past_model_fp32.onnx" ) + decoder_merged_model_fp32_path = os.path.join(args.output, f"{args.model_name}_decoder_merged_model_fp32.onnx") + old_paths = [decoder_model_fp32_path, decoder_with_past_model_fp32_path, decoder_merged_model_fp32_path] + + missing_separate_exports = ( + args.no_merged + and not os.path.exists(decoder_model_fp32_path) + and not os.path.exists(decoder_with_past_model_fp32_path) + ) + missing_merged_export = not args.no_merged and not os.path.exists(decoder_merged_model_fp32_path) + + # Export to ONNX + if missing_separate_exports or missing_merged_export: + if args.use_dynamo_export and missing_separate_exports: + logger.warning("Please ensure you have installed PyTorch, ONNX, and ONNX Script as follows.") + logger.warning("Step 1 - PyTorch nightly: https://pytorch.org/get-started/locally/") + logger.warning("Step 2 - ONNX weekly: https://pypi.org/project/onnx-weekly/") + logger.warning( + "Step 3 - ONNX Script from source: https://github.com/microsoft/onnxscript#installing-onnx-script" + ) + logger.warning( + "Note: After you install ONNX weekly, omit `onnx` when running the first line for installing ONNX Script. This is because you already installed `onnx-weekly` in the previous step." 
+ ) + run_dynamo_export(args, l_config, llama) + elif args.no_merged: + run_torchscript_separate_export(args, l_config, llama) + else: + run_torchscript_merged_export(args, l_config, llama) + + # Set model paths to store FP32 optimized model + decoder_model_fp32_opt_path = os.path.join(args.output, f"{args.model_name}_decoder_model_fp32_opt.onnx") + decoder_with_past_model_fp32_opt_path = os.path.join( + args.output, f"{args.model_name}_decoder_with_past_model_fp32_opt.onnx" + ) + decoder_merged_model_fp32_opt_path = os.path.join( + args.output, f"{args.model_name}_decoder_merged_model_fp32_opt.onnx" + ) + new_paths = [decoder_model_fp32_opt_path, decoder_with_past_model_fp32_opt_path, decoder_merged_model_fp32_opt_path] + + # Run the optimizer script + logger.info("Optimizing models...") + for orig_path, opt_path in zip(old_paths, new_paths): + if os.path.exists(orig_path): + optimize_export(l_config, input_path=orig_path, output_path=opt_path) + + # Re-assign default FP32 model paths as their optimized versions + decoder_model_fp32_path = decoder_model_fp32_opt_path + decoder_with_past_model_fp32_path = decoder_with_past_model_fp32_opt_path + decoder_merged_model_fp32_path = decoder_merged_model_fp32_opt_path + old_paths = [decoder_model_fp32_path, decoder_with_past_model_fp32_path, decoder_merged_model_fp32_path] + + logger.info( + f"The {args.model_name} ONNX model has been successfully optimized with the ORT transformer optimizer script!" + ) + # Change precision of exported models from FP32 if args.precision == Precision.FLOAT16: - # Convert decoder_model.onnx to FP16 - decoder_model_fp16_path = os.path.join(args.output, f"{args.model_name}_decoder_model_fp16.onnx") - model = OnnxModel(onnx.load_model(decoder_model_fp32_path, load_external_data=True)) - model.convert_float_to_float16(keep_io_types=False, op_block_list=["If"]) - model.save_model_to_file(decoder_model_fp16_path, use_external_data_format=True, all_tensors_to_one_file=True) - del model - - # Convert decoder_with_past_model.onnx to FP16 - decoder_with_past_model_fp16_path = os.path.join( - args.output, f"{args.model_name}_decoder_with_past_model_fp16.onnx" - ) - model = OnnxModel(onnx.load_model(decoder_with_past_model_fp32_path, load_external_data=True)) - model.convert_float_to_float16(keep_io_types=False, op_block_list=["If"]) - model.save_model_to_file( - decoder_with_past_model_fp16_path, use_external_data_format=True, all_tensors_to_one_file=True - ) - del model + new_paths = convert_to_float16(args, l_config, old_paths) elif args.precision == Precision.INT8: decoder_model_int8_path = os.path.join(args.output, f"{args.model_name}_decoder_model_int8.onnx") decoder_with_past_model_int8_path = os.path.join( args.output, f"{args.model_name}_decoder_with_past_model_int8.onnx" ) + decoder_merged_model_int8_path = os.path.join(args.output, f"{args.model_name}_decoder_merged_model_int8.onnx") + new_paths = [decoder_model_int8_path, decoder_with_past_model_int8_path, decoder_merged_model_int8_path] if args.quantization_method == "smooth_quant": - from neural_compressor import PostTrainingQuantConfig - from neural_compressor import quantization as intel_quantization - from neural_compressor import set_workspace - from onnx.external_data_helper import load_external_data_for_model - from quant_kv_dataloader import QuantKVDataLoader - - set_workspace(args.nc_workspace) - quantization_config = PostTrainingQuantConfig( - calibration_sampling_size=[args.calibration_sampling_size], - recipes={ - "optypes_to_exclude_output_quant": 
["MatMul"], - "smooth_quant": args.smooth_quant, - "smooth_quant_args": {"alpha": args.smooth_quant_alpha}, - }, - op_type_dict={ - "^((?!(MatMul|Gather|Conv)).)*$": { - "weight": {"dtype": ["fp32"]}, - "activation": {"dtype": ["fp32"]}, - } - }, - ) - - # Convert decoder_model.onnx to INT8 - decoder_model_int8 = intel_quantization.fit( - decoder_model_fp32_path, - quantization_config, - calib_dataloader=QuantKVDataLoader(args), - ) - load_external_data_for_model( - decoder_model_int8._model, - os.path.split(decoder_model_int8._model_path)[0], - ) - save_onnx_model( - decoder_model_int8._model, - decoder_model_int8_path, - f"{args.model_name}_decoder_model_int8.onnx.data", - ) - del decoder_model_int8 - - # Convert decoder_with_past_model.onnx to INT8 - decoder_with_past_model_int8 = intel_quantization.fit( - decoder_with_past_model_fp32_path, - quantization_config, - calib_dataloader=QuantKVDataLoader(args, onnx_model_path=decoder_model_fp32_path), - ) - load_external_data_for_model( - decoder_with_past_model_int8._model, - os.path.split(decoder_with_past_model_int8._model_path)[0], - ) - save_onnx_model( - decoder_with_past_model_int8._model, - decoder_with_past_model_int8_path, - f"{args.model_name}_decoder_with_past_model_int8.onnx.data", - ) - del decoder_with_past_model_int8 - - logger.info(f"Removing {args.nc_workspace}") - os.system(f"rm -R {args.nc_workspace}") + if not args.no_merged: + logger.error("SmoothQuant must be used on separately exported models") + else: + logger.info(f"Quantizing {decoder_model_fp32_path} and {decoder_with_past_model_fp32_path} to int8") + smooth_quant(args, old_paths[0], old_paths[1], new_paths[0], new_paths[1]) elif args.quantization_method == "quantize_dynamic": logger.warning( "The `quantize_dynamic` method is deprecated in favor of `smooth_quant` instead. Precision loss may be high with `quantize_dynamic`." 
) - # Convert decoder_model.onnx to INT8 - ort_quantization.quantize_dynamic( - decoder_model_fp32_path, - decoder_model_int8_path, - op_types_to_quantize=["MatMul", "Gemm", "Gather"] - if args.quantize_embedding_layer - else ["MatMul", "Gemm"], - per_channel=args.quantize_per_channel, - reduce_range=args.quantize_reduce_range, - use_external_data_format=True, - extra_options={"MatMulConstBOnly": True}, - ) - - # Convert decoder_with_past_model.onnx to INT8 - ort_quantization.quantize_dynamic( - decoder_with_past_model_fp32_path, - decoder_with_past_model_int8_path, - op_types_to_quantize=["MatMul", "Gemm", "Gather"] - if args.quantize_embedding_layer - else ["MatMul", "Gemm"], - per_channel=args.quantize_per_channel, - reduce_range=args.quantize_reduce_range, - use_external_data_format=True, - extra_options={"MatMulConstBOnly": True}, - ) + logger.info("Quantizing to int8...") + for fp32_path, int8_path in zip(old_paths, new_paths): + if os.path.exists(fp32_path): + ort_quantization.quantize_dynamic( + fp32_path, + int8_path, + op_types_to_quantize=["MatMul", "Gemm", "Gather"] + if args.quantize_embedding_layer + else ["MatMul", "Gemm"], + per_channel=args.quantize_per_channel, + reduce_range=args.quantize_reduce_range, + use_external_data_format=True, + extra_options={"MatMulConstBOnly": True}, + ) + logger.info(f"The ONNX model at {fp32_path} has been quantized to int8 and saved at {int8_path}!") + remove_existing_model(decoder_model_fp32_path) + + logger.info(f"The {args.model_name} ONNX model has been successfully quantized to int8!") else: raise Exception(f"Could not recognize {args.quantization_method} as a quantization method") - # Verify parity on all saved ONNX models + elif args.precision == Precision.INT4: + if args.execution_provider != "cpu": + old_paths = convert_to_float16(args, l_config, old_paths) + + decoder_model_int4_path = os.path.join(args.output, f"{args.model_name}_decoder_model_int4.onnx") + decoder_with_past_model_int4_path = os.path.join( + args.output, f"{args.model_name}_decoder_with_past_model_int4.onnx" + ) + decoder_merged_model_int4_path = os.path.join(args.output, f"{args.model_name}_decoder_merged_model_int4.onnx") + new_paths = [decoder_model_int4_path, decoder_with_past_model_int4_path, decoder_merged_model_int4_path] + + for fp_path, int4_path in zip(old_paths, new_paths): + if os.path.exists(fp_path): + model = onnx.load_model(fp_path, load_external_data=True) + quant = MatMul4BitsQuantizer(model, args.block_size, is_symmetric=True, nodes_to_exclude=[]) + quant.process() + quant.model.save_model_to_file(int4_path, use_external_data_format=True) + del model + del quant + logger.info(f"The ONNX model at {fp_path} has been quantized to int4 and saved at {int4_path}!") + remove_existing_model(fp_path) + del llama # Delete LLaMA model from memory since it will be loaded again during parity check logger.info("Verifying parity on all ONNX models created") + + # Use FP32 precision for FP32, INT8, INT4 CPU models, use FP16 precision for FP16 and INT4 GPU models + args.precision = ( + "fp32" + if args.precision in {"int8", "fp32"} or (args.precision == Precision.INT4 and args.execution_provider == "cpu") + else "fp16" + ) + + # Verify parity on all saved ONNX models for filename in os.listdir(args.output): if ".data" in filename or ".onnx" not in filename: continue - precision = filename[filename.rfind("_") + 1 : filename.find(".onnx")] - parity_cmd = ["-m", f"{original_model_name}", "-o", f"{os.path.join(args.output, filename)}", "-fp", precision] + 
parity_cmd = [ + "-m", + original_model_name, + "-o", + os.path.join(args.output, filename), + "-ep", + args.execution_provider, + "-id", + args.device_id, + "-fp", + args.precision, + ] if "with_past" in filename: parity_cmd.append("--use_past_kv") - parity_check(parity_cmd) + if "merged" in filename: + parity_cmd.append("--merged") + + try: + parity_check(parity_cmd) + except Exception as e: + logger.warning(f"An error occurred while verifying parity: {e}", exc_info=True) if __name__ == "__main__": diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index 6a28498a9ffc9..2652e9f0ca64e 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -4,10 +4,13 @@ import torch from transformers import LlamaConfig +from onnxruntime import OrtValue + # Get position_ids from attention_mask def get_position_ids(attention_mask: torch.Tensor, use_past_kv: bool): position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) if use_past_kv: position_ids = position_ids[:, -1].unsqueeze(-1) return position_ids @@ -62,11 +65,41 @@ def get_sample_with_past_kv_inputs( return inputs +# Inputs for all passes with past_key_values +def get_merged_sample_with_past_kv_inputs( + config: LlamaConfig, + device: torch.device, + batch_size: int, + seq_len: int, + past_seq_len: int, + use_fp16: bool = False, + return_dict: bool = False, +): + input_ids = torch.randint( + low=0, high=config.vocab_size, size=(batch_size, seq_len), device=device, dtype=torch.int64 + ) + attention_mask = torch.ones(batch_size, past_seq_len + seq_len, device=device, dtype=torch.int64) + # position_ids is of shape (batch_size, seq_len) for prompt generation, (batch_size, 1) for token generation + position_ids = get_position_ids(attention_mask, use_past_kv=(past_seq_len != 0)) + past_kv = get_sample_past_kv_inputs(config, device, batch_size, past_seq_len, use_fp16) + + if not return_dict: + return (input_ids, attention_mask, position_ids, past_kv) + + inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_kv, + } + return inputs + + # Create past_key_values def get_sample_past_kv_inputs( config: LlamaConfig, device: torch.device, batch_size: int, past_seq_len: int, use_fp16: bool ): - num_heads, head_size = config.num_attention_heads, config.hidden_size // config.num_attention_heads + num_heads, head_size = config.num_key_value_heads, config.hidden_size // config.num_key_value_heads torch_dtype = torch.float16 if use_fp16 else torch.float32 past_kv = [ ( @@ -89,31 +122,83 @@ def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tenso # Format PyTorch inputs to ONNX Runtime inputs -def convert_inputs_for_ort(pt_inputs: dict, use_fp16: bool): +def convert_inputs_for_ort( + pt_inputs: dict, + use_fp16: bool, + use_buffer_share: bool = False, + past_seq_len: int = 0, + max_seq_len: int = 2048, + device: str = "", + device_id: int = -1, +): ort_inputs = {} for k, v in pt_inputs.items(): - if k == "past_key_values": + if isinstance(v, np.ndarray): + ort_inputs[k] = v + elif k == "past_key_values": ort_inputs.update(flatten_past_kv_inputs(v, use_fp16)) + elif k == "attention_mask" and use_fp16 and use_buffer_share: + # Skip because FP16 model has GroupQueryAttention, uses buffer sharing, + # and GQA supports a causal mask 
by default + + # Instead, add the past sequence length input for GQA + ort_inputs["past_sequence_length"] = np.array([past_seq_len], dtype=np.int64) else: ort_inputs[k] = v.detach().cpu().numpy() + + # Enable past-present-share-buffer by using device memory directly + if use_buffer_share and device != "" and device != "cpu" and device_id > -1: + for k, v in ort_inputs.items(): + new_v = v + # Allocate new buffers with max_sequence_length for GQA + if "cache" in k or "past_key_values" in k: + # Copy v (BxSxPxH) into new_v (BxSxMxH) + batch_size, num_heads, _, head_size = v.shape + new_v = np.zeros((batch_size, num_heads, max_seq_len, head_size), dtype=v.dtype) + new_v[:batch_size, :num_heads, :past_seq_len, :head_size] = v + ort_inputs[k] = OrtValue.ortvalue_from_numpy(new_v, device_type=device, device_id=device_id) + return ort_inputs # Inputs for Microsoft export from https://github.com/microsoft/Llama-2-Onnx -def get_msft_sample_inputs(config: LlamaConfig, batch_size: int, past_seq_len: int, seq_len: int, use_fp16: bool): +def get_msft_sample_inputs( + config: LlamaConfig, batch_size: int, past_seq_len: int, seq_len: int, use_fp16: bool, split_kv: bool +): np_dtype = np.float16 if use_fp16 else np.float32 head_size = config.hidden_size // config.num_attention_heads max_seq_len = 2048 - ort_inputs = { - "x": np.random.rand(batch_size, seq_len, config.hidden_size).astype(np_dtype), - "attn_mask": (-10000.0 * np.triu(np.ones((batch_size, max_seq_len, max_seq_len)), k=1)).astype(np_dtype), - "k_cache": np.random.rand( - batch_size, config.num_hidden_layers, past_seq_len, config.num_attention_heads, head_size - ).astype(np_dtype), - "v_cache": np.random.rand( - batch_size, config.num_hidden_layers, past_seq_len, config.num_attention_heads, head_size - ).astype(np_dtype), - "pos": np.array(past_seq_len, dtype=np.int64), - } + if not split_kv: + ort_inputs = { + "x": np.random.rand(batch_size, seq_len, config.hidden_size).astype(np_dtype), + "attn_mask": (-10000.0 * np.triu(np.ones((batch_size, max_seq_len, max_seq_len)), k=1)).astype(np_dtype), + "k_cache": np.random.rand( + batch_size, config.num_hidden_layers, past_seq_len, config.num_attention_heads, head_size + ).astype(np_dtype), + "v_cache": np.random.rand( + batch_size, config.num_hidden_layers, past_seq_len, config.num_attention_heads, head_size + ).astype(np_dtype), + "pos": np.array(past_seq_len, dtype=np.int64), + } + else: + ort_inputs = { + "x": np.random.rand(batch_size, seq_len, config.hidden_size).astype(np_dtype), + "attn_mask": (np.triu(np.ones((batch_size, max_seq_len, max_seq_len), dtype=np.int32), k=1) - 1).astype( + np.int32 + ), + "pos": np.array(past_seq_len, dtype=np.int64), + } + for i in range(config.num_hidden_layers): + ort_inputs.update( + { + f"k_{i}_cache": np.random.rand( + batch_size, config.num_attention_heads, past_seq_len, head_size + ).astype(np_dtype), + f"v_{i}_cache": np.random.rand( + batch_size, config.num_attention_heads, past_seq_len, head_size + ).astype(np_dtype), + } + ) + return ort_inputs diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index dadf394440c9a..6bfcb9b4f290d 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -1,44 +1,143 @@ import argparse import logging import os +import time from typing import List import numpy as np import torch -from benchmark_helper import 
create_onnxruntime_session, setup_logger -from llama_inputs import convert_inputs_for_ort, get_sample_inputs, get_sample_with_past_kv_inputs +from benchmark_helper import setup_logger +from llama_inputs import ( + convert_inputs_for_ort, + get_merged_sample_with_past_kv_inputs, + get_sample_inputs, + get_sample_with_past_kv_inputs, +) from transformers import LlamaConfig, LlamaForCausalLM +import onnxruntime as ort + logger = logging.getLogger("") -def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: LlamaForCausalLM): +def get_sequence_lengths(args: argparse.Namespace): + past_sequence_length, curr_sequence_length = (8, 1) if args.use_past_kv else (0, 8) + max_sequence_length = 2048 + return past_sequence_length, curr_sequence_length, max_sequence_length + + +def get_inputs(args: argparse.Namespace, config: LlamaConfig): # Dummy values for parity - batch_size, sequence_length = 2, 8 - device = torch.device("cpu") + batch_size = 2 + past_sequence_length, sequence_length, _ = get_sequence_lengths(args) - # Run inference with PyTorch - inputs = ( - get_sample_inputs(config, device, batch_size, sequence_length, return_dict=True) - if not args.use_past_kv - else get_sample_with_past_kv_inputs( - config, device, batch_size, sequence_length, use_fp16=(args.precision == "fp16"), return_dict=True + if args.merged: + inputs = get_merged_sample_with_past_kv_inputs( + config, + args.device, + batch_size, + sequence_length, + past_sequence_length, + use_fp16=args.use_fp16, + return_dict=True, ) - ) + elif args.use_past_kv: + inputs = get_sample_with_past_kv_inputs( + config, args.device, batch_size, sequence_length, use_fp16=args.use_fp16, return_dict=True + ) + else: + inputs = get_sample_inputs(config, args.device, batch_size, sequence_length, return_dict=True) + + return inputs + + +def add_io_bindings(args: argparse.Namespace, model: ort.InferenceSession, inputs: dict): + # Add IO bindings for non-CPU execution providers + io_binding = model.io_binding() + + for k, v in inputs.items(): + if args.use_fp16: + # Bind all OrtValue inputs to device + io_binding.bind_ortvalue_input(k, v) + else: + io_binding.bind_cpu_input(k, v) + + for output in model.get_outputs(): + name = output.name + if args.use_fp16 and ("out" in name or "present" in name): + # Bind present KV cache outputs to OrtValue with buffer sharing + io_binding.bind_ortvalue_output( + name, inputs[name.replace("out", "cache").replace("present", "past_key_values")] + ) + else: + io_binding.bind_output(name, device_type=args.execution_provider, device_id=int(args.device_id)) + + return io_binding + + +def verify_parity(args: argparse.Namespace, config: LlamaConfig, pt_model: LlamaForCausalLM): + inputs = get_inputs(args, config) + + # Run inference with PyTorch + if args.execution_provider != "cpu": + torch.cuda.synchronize() + start_time = time.time() pt_outputs = pt_model(**inputs).logits.detach().cpu().numpy() + if args.execution_provider != "cpu": + torch.cuda.synchronize() + end_time = time.time() + logger.info(f"PyTorch took {end_time - start_time} s") # Run inference with ORT - inputs = convert_inputs_for_ort(inputs, use_fp16=(args.precision == "fp16")) - ort_model = create_onnxruntime_session( + past_sequence_length, _, max_sequence_length = get_sequence_lengths(args) + inputs = convert_inputs_for_ort( + inputs, + use_fp16=args.use_fp16, + use_buffer_share=args.use_fp16, + past_seq_len=past_sequence_length, + max_seq_len=max_sequence_length, + device=args.execution_provider, + device_id=int(args.device_id), + ) 
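+    # Note on the flow here (descriptive comment, not part of the original patch):
+    # convert_inputs_for_ort pre-allocates each past KV tensor on the device at
+    # max_sequence_length and returns it as an OrtValue, so add_io_bindings below
+    # can bind the same buffer to both the past input and its matching present
+    # output. With past-present buffer sharing, GroupQueryAttention updates the KV
+    # cache in place rather than materializing a separate present output each run.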
+ + ep = f"{args.execution_provider.upper()}ExecutionProvider" + if ep == "CUDAExecutionProvider": + ep = (ep, {"device_id": args.device_id}) + ort_model = ort.InferenceSession( args.onnx_model_path, - args.execution_provider != "cpu", # use_gpu - provider=args.execution_provider, - verbose=args.verbose, + sess_options=ort.SessionOptions(), + providers=[ep], ) - ort_outputs = ort_model.run(None, inputs)[0] + + # Add IO bindings for non-CPU execution providers + if args.execution_provider != "cpu": + io_binding = add_io_bindings(args, ort_model, inputs) + + torch.cuda.synchronize() + start_time = time.time() + ort_model.run_with_iobinding(io_binding) + torch.cuda.synchronize() + end_time = time.time() + + ort_outputs = io_binding.copy_outputs_to_cpu()[0] # Get logits + + else: + start_time = time.time() + ort_outputs = ort_model.run(None, inputs) + end_time = time.time() + + ort_outputs = ort_outputs[0] # Get logits + + logger.info(f"ONNX Runtime took {end_time - start_time} s") # Compare PyTorch and ONNX Runtime accuracy - tol = 1e-3 if args.precision == "fp32" else 1e-2 if args.precision == "fp16" else 1e2 + tol = ( + 2e1 + if "int4" in args.onnx_model_path or "int8" in args.onnx_model_path + else 1e-3 + if args.precision == "fp32" + else 5e-1 + ) parity = np.allclose(pt_outputs, ort_outputs, rtol=tol, atol=tol) logger.warning(f"Are PyTorch and ONNX Runtime results close? {parity}") if not parity: @@ -80,6 +179,15 @@ def get_args(argv: List[str]): help="Execution provider to verify parity with", ) + parser.add_argument( + "-id", + "--device-id", + required=False, + type=str, + default="0", + help="Device ID for GPUs", + ) + parser.add_argument( "-v", "--verbose", @@ -96,15 +204,29 @@ def get_args(argv: List[str]): ) parser.set_defaults(use_past_kv=False) + parser.add_argument( + "--merged", + action="store_true", + help="Use merged model (i.e. 
decoder_merged_model.onnx).", + ) + parser.set_defaults(merged=False) + parser.add_argument( "-fp", "--precision", required=True, - choices=["int8", "fp16", "fp32"], + choices=["int4", "int8", "fp16", "fp32"], help="Precision of model", ) args = parser.parse_args() if argv == [] else parser.parse_args(argv) + + # Use FP32 precision for FP32, INT8, INT4 CPU models, use FP16 precision for FP16 and INT4 GPU models + args.precision = ( + "fp32" + if args.precision in {"int8", "fp32"} or (args.precision == "int4" and args.execution_provider == "cpu") + else "fp16" + ) return args @@ -114,19 +236,34 @@ def main(argv: List[str] = []): # noqa: B006 logger.info(f"Arguments: {args}") # Load model and config + setattr(args, "use_fp16", args.precision == "fp16") # noqa: B010 + setattr(args, "device_name", "cpu" if args.execution_provider == "cpu" else f"cuda:{args.device_id}") # noqa: B010 + setattr(args, "device", torch.device(args.device_name)) # noqa: B010 use_auth_token = args.torch_model_directory == os.path.join(".") location = args.model_name if use_auth_token else args.torch_model_directory config = LlamaConfig.from_pretrained(location, use_auth_token=use_auth_token) llama = LlamaForCausalLM.from_pretrained( location, - torch_dtype=(torch.float16 if args.precision == "fp16" else torch.float32), + torch_dtype=(torch.float16 if args.use_fp16 else torch.float32), use_auth_token=use_auth_token, use_cache=True, - ) + ).to(args.device) + + if not args.merged: + verify_parity(args, config, llama) + else: + # Verify prompt generation in merged model (decoder_model.onnx) + args.use_past_kv = False + verify_parity(args, config, llama) - verify_parity(args, config, llama) + # Verify token generation in merged model (decoder_with_past_model.onnx) + args.use_past_kv = True + verify_parity(args, config, llama) if __name__ == "__main__": + seed = 2 + np.random.seed(seed) + torch.manual_seed(seed) main() diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-cpu.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-cpu.txt index e9ad937cf14e7..e06c3ada834b0 100644 --- a/onnxruntime/python/tools/transformers/models/llama/requirements-cpu.txt +++ b/onnxruntime/python/tools/transformers/models/llama/requirements-cpu.txt @@ -1,3 +1,2 @@ -r requirements.txt -torch>=2.0.1 -onnxruntime>=1.16.0 \ No newline at end of file +onnxruntime>=1.17.0 \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt index 5544abcaa1228..773680937bd21 100644 --- a/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt +++ b/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt @@ -1,4 +1,4 @@ -r requirements.txt -# Please manually install torch>=2.0.1 with CUDA enabled for the CUDA version installed in your system. +# Please manually install torch>=2.2.0.dev20230920 with CUDA enabled for the CUDA version installed in your system. 
# Instructions can be found here: https://pytorch.org/get-started/locally/ -onnxruntime-gpu>=1.16.0 \ No newline at end of file +onnxruntime-gpu>=1.17.0 \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements.txt b/onnxruntime/python/tools/transformers/models/llama/requirements.txt index f843ef4dc5568..4210f36982aef 100644 --- a/onnxruntime/python/tools/transformers/models/llama/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/llama/requirements.txt @@ -1,5 +1,6 @@ -git+https://github.com/kunal-vaishnavi/optimum.git@kvaishnavi/llama-add-position-ids -transformers>=4.28.1 +git+https://github.com/huggingface/optimum.git +transformers>=4.33.2 +torch>=2.2.0.dev20230920 onnx>=1.14.0 datasets>=2.8.0 protobuf==3.20.2 \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/whisper/README.md b/onnxruntime/python/tools/transformers/models/whisper/README.md index e9365becd2cd1..8ff5c8a6e1de0 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/README.md +++ b/onnxruntime/python/tools/transformers/models/whisper/README.md @@ -79,24 +79,22 @@ $ python3 -m onnxruntime.transformers.models.whisper.convert_to_onnx -m openai/w Here are some examples of how you can benchmark Whisper across various end-to-end (E2E) implementations. -Note: In the below examples, `PyTorch` refers to running in PyTorch without `torch.compile` and `PyTorch 2.0` refers to running in PyTorch with `torch.compile`. - ### Variants -1. PyTorch (without `torch.compile`), FP32 +1. PyTorch without `torch.compile`, FP32 ``` python3 -m models.whisper.benchmark \ - --benchmark-type hf-pt \ + --benchmark-type hf-pt-eager \ --audio-path 1272-141231-0002.mp3 \ --model-name openai/whisper-large-v2 \ --precision fp32 \ --device cpu ``` -2. PyTorch 2.0 (with `torch.compile`), FP16 +2. PyTorch with `torch.compile`, FP16 ``` python3 -m models.whisper.benchmark \ - --benchmark-type hf-pt2 \ + --benchmark-type hf-pt-compile \ --audio-path 1272-141231-0002.mp3 \ --model-name openai/whisper-large-v2 \ --precision fp16 \ @@ -109,7 +107,7 @@ python3 -m models.whisper.benchmark \ --benchmark-type hf-ort \ --audio-path 1272-141231-0002.mp3 \ --model-name openai/whisper-large-v2 \ - --hf-ort-model-path ./whisper-large-v2-onnx/ \ + --hf-ort-dir-path ./whisper-large-v2-onnx/ \ --precision fp32 \ --device cpu ``` @@ -156,7 +154,9 @@ You can use `benchmark_all.py` to benchmark across various platforms and automat ``` python3 -m models.whisper.benchmark_all \ --audio-path ./whisper-test-audios/ \ - --hf-ort-model-path ./whisper-large-v2-onnx/ \ + --hf-pt-eager \ + --hf-pt-compile \ + --hf-ort-dir-path ./whisper-large-v2-onnx/ \ --ort-model-path ./wlarge-fp32/whisper-large-v2_all.onnx \ --model-name openai/whisper-large-v2 \ --precision fp32 \ @@ -169,28 +169,28 @@ Here is a benchmark for an MP3 file with 20.7s of audio. 
#### FP16 -| Engine | Size | Per-Token Latency | Real-Time Factor | -| ------------- | -------- | ----------------- | ---------------- | -| PyTorch | Tiny | 4.697 ms/token | 0.004697 | -| PyTorch 2.0 | Tiny | 3.406 ms/token | 0.003406 | -| ONNX Runtime | Tiny | 0.746 ms/token | 0.000746 | -| PyTorch | Medium | 17.837 ms/token | 0.017387 | -| PyTorch 2.0 | Medium | 18.124 ms/token | 0.018124 | -| ONNX Runtime | Medium | 3.894 ms/token | 0.003894 | -| PyTorch | Large v2 | 23.470 ms/token | 0.023470 | -| PyTorch 2.0 | Large v2 | 23.146 ms/token | 0.023146 | -| ONNX Runtime | Large v2 | 6.262 ms/token | 0.006262 | +| Engine | Size | Per-Token Latency | Real-Time Factor | +| --------------- | -------- | ----------------- | ---------------- | +| PyTorch eager | Tiny | 4.697 ms/token | 0.004697 | +| PyTorch compile | Tiny | 3.406 ms/token | 0.003406 | +| ONNX Runtime | Tiny | 0.746 ms/token | 0.000746 | +| PyTorch eager | Medium | 17.837 ms/token | 0.017387 | +| PyTorch compile | Medium | 18.124 ms/token | 0.018124 | +| ONNX Runtime | Medium | 3.894 ms/token | 0.003894 | +| PyTorch eager | Large v2 | 23.470 ms/token | 0.023470 | +| PyTorch compile | Large v2 | 23.146 ms/token | 0.023146 | +| ONNX Runtime | Large v2 | 6.262 ms/token | 0.006262 | #### FP32 -| Engine | Size | Per-Token Latency | Real-Time Factor | -| ------------- | -------- | ----------------- | ---------------- | -| PyTorch | Tiny | 6.220 ms/token | 0.006220 | -| PyTorch 2.0 | Tiny | 3.944 ms/token | 0.003944 | -| ONNX Runtime | Tiny | 1.545 ms/token | 0.001545 | -| PyTorch | Medium | 19.093 ms/token | 0.019093 | -| PyTorch 2.0 | Medium | 20.459 ms/token | 0.020459 | -| ONNX Runtime | Medium | 9.440 ms/token | 0.009440 | -| PyTorch | Large v2 | 25.844 ms/token | 0.025844 | -| PyTorch 2.0 | Large v2 | 26.397 ms/token | 0.026397 | -| ONNX Runtime | Large v2 | 7.492 ms/token | 0.007492 | +| Engine | Size | Per-Token Latency | Real-Time Factor | +| --------------- | -------- | ----------------- | ---------------- | +| PyTorch eager | Tiny | 6.220 ms/token | 0.006220 | +| PyTorch compile | Tiny | 3.944 ms/token | 0.003944 | +| ONNX Runtime | Tiny | 1.545 ms/token | 0.001545 | +| PyTorch eager | Medium | 19.093 ms/token | 0.019093 | +| PyTorch compile | Medium | 20.459 ms/token | 0.020459 | +| ONNX Runtime | Medium | 9.440 ms/token | 0.009440 | +| PyTorch eager | Large v2 | 25.844 ms/token | 0.025844 | +| PyTorch compile | Large v2 | 26.397 ms/token | 0.026397 | +| ONNX Runtime | Large v2 | 7.492 ms/token | 0.007492 | diff --git a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py index 283528bea7465..759ae6d14f184 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py @@ -24,7 +24,7 @@ def get_inputs(args: argparse.Namespace): - if args.benchmark_type not in {"hf-pt", "hf-pt2", "hf-ort", "ort"}: + if args.benchmark_type not in {"hf-pt-eager", "hf-pt-compile", "hf-ort", "ort"}: raise Exception("Unable to auto-detect inputs for provided model") def load_via_ffmpeg(): @@ -102,7 +102,7 @@ def get_model(args: argparse.Namespace): # 2) Benchmark Whisper ONNX model from Optimum export (without pre/post processing) # 3) Benchmark Whisper ONNX E2E model from Olive (with pre/post processing) - if args.benchmark_type in {"hf-pt", "hf-pt2"}: + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: source = args.hf_pt_model_path if args.hf_pt_model_path else 
args.model_name start_time = time.time() model = AutoModelForSpeechSeq2Seq.from_pretrained( @@ -112,7 +112,7 @@ def get_model(args: argparse.Namespace): ).to(args.target_device) end_time = time.time() - if args.benchmark_type == "hf-pt2": + if args.benchmark_type == "hf-pt-compile": model = torch.compile(model) elif args.benchmark_type in {"hf-ort", "ort"}: @@ -136,7 +136,7 @@ def get_model(args: argparse.Namespace): start_time = time.time() model = ORTModelForSpeechSeq2Seq.from_pretrained( - args.hf_ort_model_path, + args.hf_ort_dir_path, use_io_binding=(args.device != "cpu"), provider=provider, provider_options=provider_options, @@ -214,7 +214,7 @@ def profile_fn(args, fn, inputs, inputs_type): prefix = f"{args.benchmark_type.lower()}-{args.precision}-{args.device}_{fn.__name__.replace('_', '-')}_{inputs_type}_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}" filename = None - if args.benchmark_type in {"hf-pt", "hf-pt2"}: + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: # Profile PyTorch kernels with profile( # noqa: SIM117 activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True @@ -280,7 +280,7 @@ def gen_and_dec(inputs): generate_fn = gen_and_dec - if args.benchmark_type == "hf-pt2": + if args.benchmark_type == "hf-pt-compile": # Run forward pass once with each set of inputs to process through Dynamo generate_fn(inputs) @@ -345,7 +345,7 @@ def prepare_ort_inputs(inputs, warmup=False): for k, v in inputs.items(): io_binding.bind_cpu_input(k, v) for output in model.get_outputs(): - io_binding.bind_output(output.name) + io_binding.bind_output(output.name, device_type=args.device, device_id=args.device_id) return io_binding return inputs @@ -407,7 +407,7 @@ def handle_output(output): def run_inference(args, inputs, model): - if args.benchmark_type in {"hf-pt", "hf-pt2", "hf-ort"}: + if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile", "hf-ort"}: run_hf_inference(args, inputs, model) elif args.benchmark_type == "ort": run_ort_inference(args, inputs, model) @@ -419,8 +419,13 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - "-bt", "--benchmark-type", type=str, required=True, choices=["hf-pt", "hf-pt2", "hf-ort", "ort"] + "-bt", + "--benchmark-type", + type=str, + required=True, + choices=["hf-pt-eager", "hf-pt-compile", "hf-ort", "ort"], ) + parser.add_argument( "-m", "--model-name", @@ -445,7 +450,7 @@ def parse_args(): help="Path to directory containing all PyTorch files (e.g. tokenizer, PyTorch model)", ) parser.add_argument( - "--hf-ort-model-path", + "--hf-ort-dir-path", type=str, default="", help="Path to directory containing all ONNX files (e.g. 
tokenizer, encoder, decoder, decoder_with_past)", @@ -538,7 +543,7 @@ def parse_args(): # Check that model paths have been specified for any benchmarking with ORT if args.benchmark_type == "hf-ort": - assert args.hf_ort_model_path, "Please specify a path to `--hf-ort-model-path`" + assert args.hf_ort_dir_path, "Please specify a path to `--hf-ort-dir-path`" if args.benchmark_type == "ort": assert args.ort_model_path, "Please specify a path to `--ort-model-path`" diff --git a/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py b/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py index 08d7befec3cfd..071b539ac1899 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py +++ b/onnxruntime/python/tools/transformers/models/whisper/benchmark_all.py @@ -54,7 +54,21 @@ def get_args(): ) parser.add_argument( - "--hf-ort-model-path", + "--hf-pt-eager", + default=False, + action="store_true", + help="Benchmark in PyTorch without `torch.compile`", + ) + + parser.add_argument( + "--hf-pt-compile", + default=False, + action="store_true", + help="Benchmark in PyTorch with `torch.compile`", + ) + + parser.add_argument( + "--hf-ort-dir-path", type=str, help="Path to folder containing ONNX models for Optimum + ORT benchmarking", ) @@ -136,7 +150,7 @@ def process_log_file(device_id, log_file, base_results): load_audio_latency_s, load_audio_throughput_s = None, None feat_ext_latency_s, feat_ext_throughput_s = None, None - latency_s, per_token_latency_s, per_token_latency_ms = None, None, None + token_length, latency_s, per_token_latency_s, per_token_latency_ms = None, None, None, None throughput, memory = None, None # Detect metrics @@ -310,73 +324,75 @@ def main(): logger.info(f"Testing {audio_path}...") # Benchmark PyTorch without torch.compile - benchmark_cmd = [ # noqa: RUF005 - "python3", - "-m", - "models.whisper.benchmark", - "--audio-path", - audio_path, - "--benchmark-type", - "hf-pt", - "--model-name", - args.model_name, - "--precision", - args.precision, - "--device", - args.device, - "--device-id", - str(args.device_id), - "--warmup-runs", - str(args.warmup_runs), - "--num-runs", - str(args.num_runs), - "--log-folder", - args.log_folder, - ] + hf_decoder_input_ids_cmd - logger.info("Benchmark PyTorch without torch.compile") - results = benchmark(args, benchmark_cmd, "pytorch", audio_file, duration) - all_results.extend(results) + if args.hf_pt_eager: + benchmark_cmd = [ # noqa: RUF005 + "python", + "-m", + "models.whisper.benchmark", + "--audio-path", + audio_path, + "--benchmark-type", + "hf-pt-eager", + "--model-name", + args.model_name, + "--precision", + args.precision, + "--device", + args.device, + "--device-id", + str(args.device_id), + "--warmup-runs", + str(args.warmup_runs), + "--num-runs", + str(args.num_runs), + "--log-folder", + args.log_folder, + ] + hf_decoder_input_ids_cmd + logger.info("Benchmark PyTorch without torch.compile") + results = benchmark(args, benchmark_cmd, "pytorch-eager", audio_file, duration) + all_results.extend(results) # Benchmark PyTorch with torch.compile - benchmark_cmd = [ # noqa: RUF005 - "python3", - "-m", - "models.whisper.benchmark", - "--audio-path", - audio_path, - "--benchmark-type", - "hf-pt2", - "--model-name", - args.model_name, - "--precision", - args.precision, - "--device", - args.device, - "--device-id", - str(args.device_id), - "--warmup-runs", - str(args.warmup_runs), - "--num-runs", - str(args.num_runs), - "--log-folder", - args.log_folder, - ] + hf_decoder_input_ids_cmd - 
logger.info("Benchmark PyTorch with torch.compile") - results = benchmark(args, benchmark_cmd, "pytorch-2", audio_file, duration) - all_results.extend(results) + if args.hf_pt_compile: + benchmark_cmd = [ # noqa: RUF005 + "python", + "-m", + "models.whisper.benchmark", + "--audio-path", + audio_path, + "--benchmark-type", + "hf-pt-compile", + "--model-name", + args.model_name, + "--precision", + args.precision, + "--device", + args.device, + "--device-id", + str(args.device_id), + "--warmup-runs", + str(args.warmup_runs), + "--num-runs", + str(args.num_runs), + "--log-folder", + args.log_folder, + ] + hf_decoder_input_ids_cmd + logger.info("Benchmark PyTorch with torch.compile") + results = benchmark(args, benchmark_cmd, "pytorch-compile", audio_file, duration) + all_results.extend(results) # Benchmark Optimum + ONNX Runtime - if args.hf_ort_model_path: + if args.hf_ort_dir_path: benchmark_cmd = [ # noqa: RUF005 - "python3", + "python", "-m", "models.whisper.benchmark", "--audio-path", audio_path, "--benchmark-type", "hf-ort", - "--hf-ort-model-path", - args.hf_ort_model_path, + "--hf-ort-dir-path", + args.hf_ort_dir_path, "--model-name", args.model_name, "--precision", @@ -393,14 +409,14 @@ def main(): args.log_folder, ] + hf_decoder_input_ids_cmd logger.info("Benchmark Optimum + ONNX Runtime") - results = benchmark(args, benchmark_cmd, "pytorch-ort", audio_file, duration) + results = benchmark(args, benchmark_cmd, "optimum-ort", audio_file, duration) all_results.extend(results) # Benchmark ONNX Runtime if args.ort_model_path: benchmark_cmd = ( [ # noqa: RUF005 - "python3", + "python", "-m", "models.whisper.benchmark", "--audio-path", diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert.py b/onnxruntime/python/tools/transformers/onnx_model_bert.py index 995f8c6541b4c..7a69922e67072 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert.py @@ -22,7 +22,9 @@ from fusion_qordered_layernorm import FusionQOrderedLayerNormalization from fusion_qordered_matmul import FusionQOrderedMatMul from fusion_reshape import FusionReshape +from fusion_rotary_attention import FusionRotaryEmbeddings from fusion_shape import FusionShape +from fusion_simplified_layernorm import FusionSimplifiedLayerNormalization, FusionSkipSimplifiedLayerNormalization from fusion_skiplayernorm import FusionBiasSkipLayerNormalization, FusionSkipLayerNormalization from fusion_utils import FusionUtils from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper @@ -106,10 +108,36 @@ def fuse_layer_norm(self): fusion = FusionQOrderedLayerNormalization(self) fusion.apply() + def fuse_simplified_layer_norm(self): + fusion = FusionSimplifiedLayerNormalization(self) + fusion.apply() + def fuse_skip_layer_norm(self): fusion = FusionSkipLayerNormalization(self) fusion.apply() + def fuse_skip_simplified_layer_norm(self): + fusion = FusionSkipSimplifiedLayerNormalization(self) + fusion.apply() + + def fuse_rotary_embeddings(self): + fusion = FusionRotaryEmbeddings(self) + fusion.apply() + # Remove non-MS domain functions + rot_emb_nodes = list( + filter( + lambda node: node.op_type == "RotaryEmbedding" and node.domain != "com.microsoft", self.model.graph.node + ) + ) + non_ms_domains_to_keep = set(map(lambda node: node.domain, rot_emb_nodes)) + i = 0 + while i < len(self.model.functions): + fn = self.model.functions[i] + if "RotaryEmbedding" in fn.name and fn.domain not in non_ms_domains_to_keep: + self.model.functions.remove(fn) + 
else: + i += 1 + # Only relevant in models with Q-DQ nodes def fuse_qordered_mamtul(self): fusion = FusionQOrderedMatMul(self) @@ -367,6 +395,7 @@ def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bo if (options is None) or options.enable_layer_norm: self.fuse_layer_norm() + self.fuse_simplified_layer_norm() if (options is None) or options.enable_gelu: self.fuse_gelu() @@ -377,6 +406,10 @@ def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bo if (options is None) or options.enable_skip_layer_norm: self.fuse_skip_layer_norm() + self.fuse_skip_simplified_layer_norm() + + if (options is None) or options.enable_rotary_embeddings: + self.fuse_rotary_embeddings() if options is not None: self.attention_mask.set_mask_format(options.attention_mask_format) @@ -442,14 +475,17 @@ def get_fused_operator_statistics(self): "BiasGelu", "GemmFastGelu", "LayerNormalization", + "SimplifiedLayerNormalization", "SkipLayerNormalization", + "SkipSimplifiedLayerNormalization", + "RotaryEmbedding", ] q_ops = ["QOrderedAttention", "QOrderedGelu", "QOrderedLayerNormalization", "QOrderedMatMul"] for op in ops + q_ops: nodes = self.get_nodes_by_op_type(op) op_count[op] = len(nodes) - logger.info(f"Optimized operators:{op_count}") + logger.info(f"Optimized operators: {op_count}") return op_count def is_fully_optimized(self): @@ -461,11 +497,20 @@ def is_fully_optimized(self): attention = op_count["Attention"] + op_count["MultiHeadAttention"] + op_count["QOrderedAttention"] gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] - is_perfect = (embed > 0) and (attention > 0) and (attention == gelu) and (layer_norm >= 2 * attention) + simple_layer_norm = op_count["SimplifiedLayerNormalization"] + op_count["SkipSimplifiedLayerNormalization"] + is_perfect = ( + (embed > 0) + and (attention > 0) + and (attention == gelu) + and ((layer_norm >= 2 * attention) or (simple_layer_norm >= 2 * attention)) + ) if layer_norm == 0: logger.debug("Layer Normalization not fused") + if simple_layer_norm == 0: + logger.debug("Simple Layer Normalization not fused") + if gelu == 0: logger.debug("Gelu/FastGelu not fused") diff --git a/onnxruntime/python/tools/transformers/onnx_model_gpt2.py b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py index 263857ffbc130..6545bb08cdd5e 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_gpt2.py +++ b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py @@ -8,6 +8,7 @@ from fusion_gpt_attention import FusionGptAttention from fusion_gpt_attention_megatron import FusionGptAttentionMegatron from fusion_gpt_attention_no_past import FusionGptAttentionNoPast +from fusion_rotary_attention import FusionRotaryAttention from onnx_model_bert import BertOnnxModel logger = logging.getLogger(__name__) @@ -27,6 +28,9 @@ def fuse_attention(self): fusion = FusionGptAttentionMegatron(self, self.num_heads) fusion.apply() + fusion = FusionRotaryAttention(self, self.hidden_size, self.num_heads) + fusion.apply() + def postprocess(self): """ Remove extra reshape nodes. 
@@ -94,4 +98,4 @@ def postprocess(self): reshape_count += 2 self.prune_graph() - logger.info(f"postprocess: remove Reshape count:{reshape_count}") + logger.info(f"postprocess: remove Reshape count: {reshape_count}") diff --git a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py index e9f98e956b760..95f40af3fd746 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_t5.py +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -3,12 +3,12 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- import logging -from typing import Dict, Optional, Union +from typing import Optional, Union import numpy as np from fusion_attention import AttentionMask, FusionAttention from fusion_base import Fusion -from fusion_skiplayernorm import FusionSkipLayerNormalization +from fusion_simplified_layernorm import FusionSimplifiedLayerNormalization, FusionSkipSimplifiedLayerNormalization from fusion_utils import NumpyHelper from onnx import NodeProto, TensorProto, helper from onnx_model import OnnxModel @@ -56,8 +56,8 @@ def create_attention_node( Args: mask_index (str): mask input q_matmul (NodeProto): MatMul node in fully connection for Q - k_matmul (NodeProto): MatMul node in fully connection for K - v_matmul (NodeProto): MatMul node in fully connection for V + k_matmul (NodeProto): MatMul node in fully connection for K + v_matmul (NodeProto): MatMul node in fully connection for V num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. input (str): input name @@ -687,67 +687,6 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): self.node_name_to_graph_name[rpb_node.name] = self.this_graph_name -class FusionSimplifiedLayerNormalization(Fusion): - def __init__(self, model: OnnxModel): - super().__init__(model, "SimplifiedLayerNormalization", "Mul") - - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): - if node.op_type != "Mul": - return - - sim_ln_nodes = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], - [1, 1, 1, 0, 0, 0, 0], - ) - if sim_ln_nodes is None: - sim_ln_nodes = self.model.match_parent_path( - node, - ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"], - [1, 1, 1, 0, 0, 0, 0], - ) - if sim_ln_nodes is None: - return - - pow_node = sim_ln_nodes[-2] - if self.model.find_constant_input(pow_node, 2.0) != 1: - return - - root_input = pow_node.input[0] - - mul_node_1 = sim_ln_nodes[0] - if root_input != mul_node_1.input[0]: - return - - second_add_node = sim_ln_nodes[3] - i, add_weight = self.model.get_constant_input(second_add_node) - if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: - logger.warning(f"epsilon value is not expeced: {add_weight}") - return - - self.nodes_to_remove.extend(sim_ln_nodes[:-1]) - - normalize_node = helper.make_node( - "SimplifiedLayerNormalization", - inputs=[root_input, node.input[0]], - outputs=[node.output[0]], - name=self.model.create_node_name("SimplifiedLayerNormalization", name_prefix="LayerNorm"), - ) - normalize_node.attribute.extend([helper.make_attribute("epsilon", float(add_weight))]) - normalize_node.attribute.extend([helper.make_attribute("axis", int(-1))]) - normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)]) - 
self.nodes_to_add.append(normalize_node) - self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name - - -class FusionSkipSimplifiedLayerNormalization(FusionSkipLayerNormalization): - def __init__(self, model: OnnxModel): - super().__init__(model, "SkipSimplifiedLayerNormalization", "SimplifiedLayerNormalization") - - def fuse(self, node, input_name_to_nodes, output_name_to_node): - super().fuse(node, input_name_to_nodes, output_name_to_node) - - class T5OnnxModel(BertOnnxModel): def __init__(self, model, num_heads, hidden_size): super().__init__(model, num_heads, hidden_size) diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 5ded027b36f74..00b26c019d4b5 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -103,7 +103,7 @@ def optimize_by_onnxruntime( logger.error("There is no gpu for onnxruntime to do optimization.") return onnx_model_path - model = OnnxModel(load_model(onnx_model_path, format=None, load_external_data=False)) + model = OnnxModel(load_model(onnx_model_path, load_external_data=False)) if model.use_float16() and not use_gpu: logger.warning( "This model uses float16 in the graph, use_gpu=False might cause extra Cast nodes. " @@ -546,7 +546,7 @@ def main(): if args.input_int32: optimizer.change_graph_inputs_to_int32() - if args.model_type in ["bert", "gpt2"]: + if args.model_type in set(MODEL_TYPES.keys()): if optimizer.is_fully_optimized(): logger.info("The model has been fully optimized.") else: diff --git a/onnxruntime/python/tools/transformers/shape_infer_helper.py b/onnxruntime/python/tools/transformers/shape_infer_helper.py index f8a5464d8af78..f1fc0c952e8e4 100644 --- a/onnxruntime/python/tools/transformers/shape_infer_helper.py +++ b/onnxruntime/python/tools/transformers/shape_infer_helper.py @@ -28,12 +28,12 @@ def __init__(self, model, verbose=0, int_max=2**31 - 1, auto_merge=True, guess_o self.is_inferred_: bool = False self.dynamic_axis_mapping_: Dict[str, int] = {} - def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 128): + def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 200): """Run shape inference, and try replace dynamic axis from string to integer when mapping is provided. Args: dynamic_axis_mapping (_type_): a dictionary with name of dynamic axis as key, like {"batch_size" : 4} - max_runs (int, optional): limit maximum number of runs to avoid infinite loop. Defaults to 32. + max_runs (int, optional): limit maximum number of runs to avoid infinite loop. Defaults to 200. Returns: bool: whether all shapes has been inferred or not. diff --git a/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc b/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc new file mode 100644 index 0000000000000..e739b17d5885f --- /dev/null +++ b/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc @@ -0,0 +1,151 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
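+//
+// Tests for the MatMulBnb4 contrib op: float weights are block-quantized with
+// QuantizeBlockwiseBnb4, dequantized back to float, and a reference GEMM computed
+// against the dequantized weights is compared with the op's output (FP32 via the
+// default providers, FP16 on the CUDA execution provider).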
+ +#ifndef ORT_MINIMAL_BUILD + +#include "core/common/span_utils.h" +#include "core/framework/tensor.h" +#include "core/mlas/inc/mlas_q4.h" +#include "core/mlas/inc/mlas.h" +#include "core/session/inference_session.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/framework/test_utils.h" +#include "test/optimizer/graph_transform_test_builder.h" +#include "test/providers/provider_test_utils.h" +#include "test/util/include/default_providers.h" +#include "core/util/qmath.h" +#include "contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h" + +#include +#include + +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +namespace onnxruntime { +namespace test { + +void QuantizeDequantizeBnb4(std::vector& raw_vals, // N X K + std::vector& quant_vals, + std::vector& absmax, + int32_t quant_type, + int32_t N, + int32_t K, + int32_t block_size) { + OrtThreadPoolParams to; + auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, + concurrency::ThreadPoolType::INTRA_OP); + + contrib::QuantizeBlockwiseBnb4( + quant_vals.data(), + raw_vals.data(), + absmax.data(), + block_size, + quant_type, + N, + K, + tp.get()); + + contrib::DequantizeBlockwiseBnb4( + raw_vals.data(), + quant_vals.data(), + absmax.data(), + block_size, + quant_type, + N, + K, + tp.get()); +} + +void RunTest(int64_t quant_type, int64_t M, int64_t N, int64_t K, int64_t block_size, bool use_float16) { + RandomValueGenerator random{1234}; + std::vector input0_vals(random.Gaussian(std::vector({M, K}), 0.0f, 0.25f)); + // quantizer expects transposed weights, N X K + std::vector input1_f_vals(random.Gaussian(std::vector({N, K}), 0.0f, 0.25f)); + + int64_t numel = N * K; + int64_t quantized_numel = (numel + 1) / 2; + int64_t total_block_count = (numel + block_size - 1) / block_size; + std::vector input1_vals(quantized_numel); + std::vector absmax(total_block_count); + + QuantizeDequantizeBnb4(input1_f_vals, + input1_vals, + absmax, + static_cast(quant_type), + static_cast(N), + static_cast(K), + static_cast(block_size)); + + std::vector expected_vals(M * N); + for (int64_t m = 0; m < M; m++) { + for (int64_t n = 0; n < N; n++) { + float sum = 0.0f; + for (int64_t k = 0; k < K; k++) { + sum += input0_vals[m * K + k] * input1_f_vals[n * K + k]; + } + expected_vals[m * N + n] = sum; + } + } + + OpTester test("MatMulBnb4", 1, kMSDomain); + test.AddAttribute("K", K); + test.AddAttribute("N", N); + test.AddAttribute("block_size", block_size); + test.AddAttribute("quant_type", quant_type); + if (use_float16) { + test.AddInput("A", {M, K}, ToFloat16(input0_vals), false); + test.AddInput("B", {quantized_numel}, input1_vals, true); + test.AddInput("absmax", {total_block_count}, ToFloat16(absmax), true); + + test.AddOutput("Y", {M, N}, ToFloat16(expected_vals)); + test.SetOutputAbsErr("Y", 0.02f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } else { + test.AddInput("A", {M, K}, input0_vals, false); + test.AddInput("B", {quantized_numel}, input1_vals, true); + test.AddInput("absmax", {total_block_count}, absmax, true); + + test.AddOutput("Y", {M, N}, expected_vals); + + test.Run(); + } +} + +TEST(MatMulBnb4, Float32) { + for (auto qt : {0, 1}) { + for (auto M : {1, 2, 100}) { + for (auto N : {1, 2, 32, 288}) { + for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) { + for (auto block_size : {16, 32, 64, 128}) { + RunTest(qt, M, N, K, block_size, false); + } + } + } + 
} + } +} + +#if defined(USE_CUDA) +TEST(MatMulBnb4, Float16) { + for (auto qt : {0, 1}) { + for (auto M : {1, 2, 100}) { + for (auto N : {1, 2, 32, 288}) { + for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) { + for (auto block_size : {16, 32, 64, 128}) { + RunTest(qt, M, N, K, block_size, true); + } + } + } + } + } +} + +#endif +} // namespace test +} // namespace onnxruntime + +#endif // ORT_MINIMAL_BUILD diff --git a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc new file mode 100644 index 0000000000000..29d8219c162a5 --- /dev/null +++ b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc @@ -0,0 +1,632 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "gtest/gtest.h" +#include "core/session/onnxruntime_cxx_api.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/common/cuda_op_test_utils.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { +namespace test { + +static void RunTest( + const std::vector& input_data, + const std::vector& position_ids, + const std::vector& cos_cache, + const std::vector& sin_cache, + const std::vector& output_data, + int batch_size, + int sequence_length, + int head_size, + int num_heads, + int max_sequence_length, + int64_t interleaved, + bool use_float16, + bool disable_cpu, + bool disable_cuda) { + // input : (batch_size, sequence_length, hidden_size) + // position ids : (1) or (batch_size, sequence_length) + // cos cache : (max_sequence_length, head_size / 2) + // sin cache : (max_sequence_length, head_size / 2) + // interleaved : 0 = false, 1 = true + + int hidden_size = num_heads * head_size; + std::vector input_dims = {batch_size, sequence_length, hidden_size}; + std::vector pos_dims; + std::vector cache_dims = {max_sequence_length, head_size / 2}; + + assert(hidden_size != 0 && head_size != 0 && num_heads != 0 && max_sequence_length != 0); + assert(max_sequence_length >= sequence_length); + if (position_ids.size() == 1) { + pos_dims = {1}; + } else { + pos_dims = {batch_size, sequence_length}; + } + + std::string op_type = "RotaryEmbedding"; + std::vector> execution_providers; + + int min_cuda_architecture = use_float16 ? 530 : 0; + bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + if (enable_cuda && !disable_cuda) { + execution_providers.push_back(DefaultCudaExecutionProvider()); + } + if (!use_float16 && !disable_cpu) { + execution_providers.push_back(DefaultCpuExecutionProvider()); + } + if (execution_providers.size() == 0) { + // Return early if CI pipeline does not support EP (e.g. 
CUDA EP for CPU CI pipeline) + return; + } + + OpTester test(op_type.c_str(), 1, onnxruntime::kMSDomain); + test.AddAttribute("interleaved", interleaved); + + if (!use_float16) { + test.AddInput("input", input_dims, input_data); + test.AddInput("position_ids", pos_dims, position_ids); + test.AddInput("cos_cache", cache_dims, cos_cache); + test.AddInput("sin_cache", cache_dims, sin_cache); + test.AddOutput("output", input_dims, output_data); + } else { + test.AddInput("input", input_dims, ToFloat16(input_data)); + test.AddInput("position_ids", pos_dims, position_ids); + test.AddInput("cos_cache", cache_dims, ToFloat16(cos_cache)); + test.AddInput("sin_cache", cache_dims, ToFloat16(sin_cache)); + test.AddOutput("output", input_dims, ToFloat16(output_data)); + } + test.SetOutputAbsErr("output", 0.002f); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +static void RunTests(const std::vector& input_data, + const std::vector& position_ids, + const std::vector& cos_cache, + const std::vector& sin_cache, + const std::vector& output_data, + int batch_size, + int sequence_length, + int head_size = 0, + int num_heads = 0, + int max_sequence_length = 0, + int64_t interleaved = 0, + bool use_float16 = true) { + // FP32 test for CPU + RunTest(input_data, + position_ids, + cos_cache, + sin_cache, + output_data, + batch_size, + sequence_length, + head_size, + num_heads, + max_sequence_length, + interleaved, + false, /* use_fp16 */ + false, /* disable_cpu */ + true /* disable_cuda */); + + // FP32 test for CUDA + RunTest(input_data, + position_ids, + cos_cache, + sin_cache, + output_data, + batch_size, + sequence_length, + head_size, + num_heads, + max_sequence_length, + interleaved, + false, /* use_fp16 */ + false, /* disable_cpu */ + false /* disable_cuda */); + + // FP16 test for CUDA + if (use_float16) { + RunTest(input_data, + position_ids, + cos_cache, + sin_cache, + output_data, + batch_size, + sequence_length, + head_size, + num_heads, + max_sequence_length, + interleaved, + true, /* use_fp16 */ + true, /* disable_cpu */ + false /* disable_cuda*/); + } +} + +// Interleaved = true, pos ids shape = (1) +TEST(RotaryEmbeddingTest, RotaryEmbedding_Interleaved_SmallData_LlamaMSFT) { + int batch_size = 1; + int sequence_length = 3; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int64_t interleaved = 1; // true + + std::vector input_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -0.1320f, -0.2751f, -0.2350f, 0.0937f, + -1.2188f, 1.1676f, -1.0574f, -0.1188f, -0.7396f, -1.2425f, -0.1752f, 0.6990f, + -0.8110f, 0.6737f, -1.1233f, -0.0919f, -0.6861f, 0.7202f, 0.1963f, 0.6142f}; + + std::vector position_ids = {0}; + + std::vector cos_cache = { + 1.0000f, 1.0000f, 0.5403f, 0.9999f, -0.4161f, 0.9998f, -0.9900f, 0.9996f, + -0.6536f, 0.9992f, 0.2837f, 0.9988f, 0.9602f, 0.9982f, 0.7539f, 0.9976f}; + + std::vector sin_cache = { + 0.0000f, 0.0000f, 0.8415f, 0.0100f, 0.9093f, 0.0200f, 0.1411f, 0.0300f, + -0.7568f, 0.0400f, -0.9589f, 0.0500f, -0.2794f, 0.0600f, 0.6570f, 0.0699f}; + + std::vector output_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -0.1320f, -0.2751f, -0.2350f, 0.0937f, + -1.6411f, -0.3948f, -1.0561f, -0.1294f, 0.6460f, -1.2937f, -0.1822f, 0.6972f, + -0.2751f, -1.0178f, -1.1212f, -0.1143f, -0.3694f, -0.9235f, 0.1840f, 0.6180f}; + + RunTests(input_data, + position_ids, + cos_cache, + sin_cache, + output_data, + batch_size, + sequence_length, + head_size, + num_heads, + max_sequence_length, + interleaved); +} + +// 
Interleaved = true, pos ids shape = (1) +TEST(RotaryEmbeddingTest, RotaryEmbedding_Interleaved_LargeData_LlamaMSFT) { + int batch_size = 2; + int sequence_length = 8; + int num_heads = 4; + int head_size = 6; + int max_sequence_length = 16; + int64_t interleaved = 1; // true + + std::vector input_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, + 1.1676f, -1.0190f, 0.3157f, -1.6036f, 1.8493f, + 0.0447f, 1.5853f, 0.1036f, -0.3514f, 0.2421f, + 0.6463f, 0.8730f, -0.9276f, 1.0311f, -1.9557f, + -0.1482f, 1.7376f, 2.2039f, -0.6589f, -1.0574f, + -0.1188f, -0.9078f, 0.3452f, -0.5713f, -0.2351f, + -0.5912f, 1.1312f, 0.7562f, -1.2023f, -0.5833f, + -0.4407f, 0.1766f, 1.0224f, -0.4826f, -0.5421f, + -0.5342f, -0.6413f, 1.3314f, -0.4498f, 0.5493f, + 0.0539f, 0.2601f, 0.8570f, 1.0076f, -0.7529f, + -0.2250f, -0.4327f, -1.5071f, -0.4586f, -1.9791f, + 0.7787f, -0.7749f, -0.1398f, 1.1414f, -0.6354f, + 0.0352f, -0.4765f, -0.0409f, 1.1993f, 0.5374f, + -0.1930f, 2.5211f, -0.0452f, -0.3105f, -0.9407f, + -0.0034f, 1.5199f, -0.8480f, 0.5266f, 0.0299f, + -0.0498f, 1.0651f, 0.8860f, -1.4702f, -0.2134f, + -0.8707f, 1.6159f, -0.2356f, 0.9444f, 0.5937f, + 0.7203f, 0.5061f, 1.5192f, -0.4897f, 0.9231f, + 0.2654f, -0.1441f, 0.5407f, -1.5476f, 0.6455f, + -1.1382f, 0.4640f, -0.4986f, 0.1289f, 2.7631f, + 0.1405f, 1.1191f, 2.1134f, -0.9754f, 0.1757f, + -0.1319f, -0.2735f, 0.3355f, -0.6008f, -1.1164f, + 0.2577f, -0.7226f, -0.9244f, 1.8737f, 0.6052f, + 1.1904f, 1.2195f, -0.0470f, -1.0914f, 1.0223f, + 0.3152f, 1.7528f, -0.7650f, 1.8299f, -0.2784f, + -0.2719f, 0.1885f, 2.1432f, 0.8527f, 0.0965f, + -0.0625f, 0.8269f, 1.0122f, -1.4482f, -0.0644f, + 0.3215f, 0.5908f, -1.4197f, 0.2113f, 0.0306f, + 0.3604f, 0.3166f, -0.8975f, -0.6393f, -1.2944f, + -0.0243f, -0.2354f, -0.7087f, 1.1566f, 0.4296f, + 0.5599f, -0.7776f, 0.3339f, 0.1759f, 2.1108f, + 1.0702f, 0.8279f, -0.2969f, 0.7120f, -0.2068f, + -0.1548f, 0.1553f, 0.6207f, -0.1690f, -0.5816f, + 1.2632f, 0.0695f, 1.1862f, -1.1874f, -0.7468f, + -0.9320f, -0.8579f, -0.9647f, -0.0991f, 0.0195f, + 1.1213f, -1.4873f, -0.2043f, -1.0466f, -1.5772f, + -0.0489f, 0.3430f, 0.1264f, 0.1519f, -1.3639f, + -1.6593f, 1.8127f, -1.4459f, -0.2158f, -0.9792f, + -1.4392f, 0.6508f, 0.8964f, 0.5717f, -0.2390f, + 0.6983f, -1.3416f, 0.2715f, -0.2852f, 0.6051f, + 0.2167f, -0.2181f, -1.6306f, 1.4788f, 0.2754f, + -0.0261f, -0.4618f, -0.5646f, -1.0389f, 0.5819f, + 1.3697f, 0.0002f, 1.5333f, -1.0556f, -0.1254f, + 0.1527f, -0.5996f, -1.0962f, 1.6327f, 1.3951f, + 0.8784f, 0.3389f, 1.2907f, 0.3124f, 0.7299f, + 1.4220f, 0.3375f, 0.0438f, 1.8698f, -0.2635f, + -2.0799f, -0.6313f, 0.4090f, -1.1458f, 0.0784f, + -1.8848f, -1.6165f, 0.6179f, 0.9905f, -0.0729f, + 0.5054f, -0.6681f, -1.4382f, 1.7547f, -0.9605f, + -0.4558f, -1.6105f, 0.2979f, 1.1537f, -1.5604f, + 1.2779f, -1.2514f, 0.6056f, 0.5763f, -3.3558f, + 0.2836f, 0.6909f, -0.7631f, 2.4451f, -0.3500f, + 1.3289f, -0.6494f, 0.3478f, 1.0038f, -0.2937f, + 0.9238f, -1.2185f, 0.4138f, 0.5033f, 0.9174f, + 1.8131f, 1.4436f, -0.4207f, 0.0220f, -0.6807f, + -1.3306f, 1.5646f, 0.3338f, 0.7105f, 0.4683f, + -0.6179f, 0.0818f, -0.0488f, -0.9810f, -1.3632f, + 0.0929f, -1.7926f, -0.2921f, -0.4792f, 0.6756f, + -0.3413f, -0.2242f, -0.2111f, 0.6282f, 0.1667f, + -1.4055f, 1.5895f, 1.0838f, -0.9077f, -0.8060f, + 0.7967f, -2.9351f, 2.4179f, -0.4026f, 0.6451f, + 1.6845f, -0.0901f, 0.6106f, 2.3603f, 1.3908f, + -0.7917f, -0.6734f, -0.1213f, -1.1116f, -0.7401f, + -0.7879f, 0.0606f, -2.3337f, -1.2603f, -1.7245f, + -0.3533f, -0.9421f, -0.1776f, 0.3992f, -1.7142f, + -0.5319f, -0.8848f, 0.6513f, 
1.0002f, -1.4699f, + -1.4254f, 0.7013f, 0.2414f, 0.2551f, -0.7457f, + 0.3133f, -1.0941f, -0.3682f, -0.0163f, -0.0645f, + -0.8101f, 0.1415f, 0.0551f, 0.5873f, -0.5887f, + -1.4733f, -0.8565f, 0.7400f, -0.5033f, 0.0553f, + 0.9265f, -0.8652f, -0.0288f, -0.2209f, 0.0610f, + 0.6776f, 0.4361f, -0.8052f, 0.3955f, 0.8988f, + 0.8238f, 0.2262f, 1.2912f, 0.6488f, 1.2114f, + 1.3569f, 0.2983f, 0.4718f, -1.1936f, 0.7928f, + -0.8665f, 0.9468f, 1.1629f, 0.0616f, -1.3136f, + -0.2764f, 0.0277f, -0.1126f, 0.2342f, -0.5866f, + -1.8219f, 1.1079f, 0.5795f, -1.4249f}; + + std::vector position_ids = {0}; + + std::vector cos_cache = { + 1.0000f, 1.0000f, 1.0000f, 0.5403f, 0.9989f, 1.0000f, -0.4161f, 0.9957f, + 1.0000f, -0.9900f, 0.9903f, 1.0000f, -0.6536f, 0.9828f, 1.0000f, 0.2837f, + 0.9732f, 0.9999f, 0.9602f, 0.9615f, 0.9999f, 0.7539f, 0.9477f, 0.9999f, + -0.1455f, 0.9318f, 0.9999f, -0.9111f, 0.9140f, 0.9998f, -0.8391f, 0.8942f, + 0.9998f, 0.0044f, 0.8725f, 0.9997f, 0.8439f, 0.8488f, 0.9997f, 0.9074f, + 0.8234f, 0.9996f, 0.1367f, 0.7962f, 0.9995f, -0.7597f, 0.7673f, 0.9995f}; + + std::vector sin_cache = { + 0.0000f, 0.0000f, 0.0000f, 0.8415f, 0.0464f, 0.0022f, 0.9093f, 0.0927f, + 0.0043f, 0.1411f, 0.1388f, 0.0065f, -0.7568f, 0.1846f, 0.0086f, -0.9589f, + 0.2300f, 0.0108f, -0.2794f, 0.2749f, 0.0129f, 0.6570f, 0.3192f, 0.0151f, + 0.9894f, 0.3629f, 0.0172f, 0.4121f, 0.4057f, 0.0194f, -0.5440f, 0.4477f, + 0.0215f, -1.0000f, 0.4887f, 0.0237f, -0.5366f, 0.5286f, 0.0259f, 0.4202f, + 0.5675f, 0.0280f, 0.9906f, 0.6050f, 0.0302f, 0.6503f, 0.6413f, 0.0323f}; + + std::vector output_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, + 1.1676f, -1.0190f, 0.3157f, -1.6036f, 1.8493f, + 0.0447f, 1.5853f, 0.1036f, -0.3514f, 0.2421f, + 0.6463f, 0.8730f, -0.9276f, 1.0311f, -1.9557f, + -0.1482f, 1.7376f, 2.2039f, -0.6589f, -0.4713f, + -0.9540f, -0.9229f, 0.3027f, -0.5708f, -0.2363f, + -1.2713f, 0.1137f, 0.8112f, -1.1659f, -0.5824f, + -0.4419f, -0.7649f, 0.7011f, -0.4569f, -0.5639f, + -0.5328f, -0.6424f, 1.0979f, 0.8773f, 0.5462f, + 0.0793f, 0.2582f, 0.8576f, 0.2653f, 1.2295f, + -0.1839f, -0.4517f, -1.5052f, -0.4651f, 0.1155f, + -2.1237f, -0.7586f, -0.2110f, 1.1441f, -0.6304f, + 0.4186f, 0.2303f, -0.1519f, 1.1903f, 0.5382f, + -0.1906f, -1.0080f, 2.3112f, -0.2220f, -0.9655f, + -0.0099f, 1.5198f, 0.7652f, -0.6410f, 0.0365f, + -0.0452f, 1.0593f, 0.8929f, 1.4856f, 0.0038f, + -1.0865f, 1.4794f, -0.2417f, 0.9428f, -0.6894f, + -0.6293f, 0.2904f, 1.5747f, -0.4956f, 0.9199f, + -0.2424f, 0.1801f, 0.7503f, -1.4576f, 0.6529f, + -1.1340f, -0.6807f, -0.0252f, -0.3834f, 2.7394f, + 0.1308f, 1.1203f, -2.1196f, -0.9618f, 0.1970f, + -0.0972f, -0.2764f, 0.3332f, -0.4522f, 1.1844f, + 0.3867f, -0.6626f, -0.9405f, 1.8656f, 0.5053f, + -1.2361f, 1.2072f, 0.1789f, -1.1002f, 1.0129f, + 1.7702f, 0.1949f, -1.1653f, 1.6049f, -0.2755f, + -0.2749f, 2.1087f, 0.4272f, 0.8076f, 0.2900f, + -0.0714f, 0.8261f, -1.1016f, -1.3814f, -0.1366f, + 0.2981f, 0.6060f, -1.4132f, 0.0893f, -0.1939f, + 0.2779f, 0.3910f, -0.8906f, -0.6489f, -1.2496f, + 0.3383f, -0.0315f, -0.7461f, 1.1510f, 0.4445f, + 0.3203f, -0.9031f, 0.2727f, 0.2609f, 2.0968f, + 1.0974f, 0.7120f, -0.5164f, 0.7415f, -0.0031f, + -0.1568f, 0.1533f, 0.5487f, -0.3357f, -0.9064f, + 1.0546f, 0.0542f, 1.1870f, -0.4045f, -1.3431f, + -0.6094f, -1.1105f, -0.9631f, -0.1137f, -0.7219f, + 0.8582f, -1.3443f, -0.6684f, -1.0227f, -1.5929f, + -0.2622f, 0.2264f, 0.0713f, 0.1843f, -1.3387f, + -1.6797f, 2.3165f, 0.1009f, 0.1081f, -0.9969f, + -1.4488f, 0.6291f, 0.8964f, 0.5717f, -0.2390f, + 0.6983f, -1.3416f, 0.2715f, -0.2852f, 
0.6051f, + 0.2167f, -0.2181f, -1.6306f, 1.4788f, 0.2754f, + -0.0261f, -0.4618f, -0.5646f, -1.0389f, 0.5819f, + 1.3697f, 0.0002f, 1.5333f, -1.0556f, -0.1254f, + 0.1527f, 0.5985f, -1.0968f, 1.5662f, 1.4693f, + 0.8776f, 0.3408f, 0.4345f, 1.2549f, 0.6631f, + 1.4543f, 0.3374f, 0.0445f, 1.2320f, 1.4311f, + -2.0483f, -0.7272f, 0.4114f, -1.1449f, 1.6283f, + -0.9524f, -1.6435f, 0.5422f, 0.9907f, -0.0708f, + 0.3972f, 0.7376f, -1.5947f, 1.6138f, -0.9586f, + -0.4600f, 0.3993f, -1.5884f, 1.2934f, -1.4467f, + 1.2833f, -1.2459f, -0.7760f, 0.3108f, -3.3677f, + -0.0287f, 0.6942f, -0.7601f, -0.6993f, 2.3690f, + 1.3834f, -0.5234f, 0.3435f, 1.0053f, 0.1604f, + -0.9560f, -1.2641f, 0.2406f, 0.4973f, 0.9206f, + -1.9987f, -1.1733f, -0.4197f, -0.0366f, -0.6720f, + -1.3350f, -1.5960f, -0.1097f, 0.6386f, 0.5624f, + -0.6184f, 0.0778f, 0.1867f, 0.9643f, -1.3629f, + -0.0972f, -1.7907f, -0.3037f, 0.8245f, -0.0789f, + -0.2940f, -0.2833f, -0.2165f, 0.6264f, -1.1726f, + 0.7926f, 1.3621f, 1.3586f, -0.9007f, -0.8138f, + -2.7421f, 1.3155f, 2.4507f, 0.0507f, 0.6305f, + 1.6900f, 0.5210f, -0.3309f, 2.0630f, 1.8026f, + -0.7859f, -0.6802f, -1.1003f, -0.1990f, -0.5391f, + -0.9370f, 0.0857f, -2.3330f, -2.0112f, 0.7193f, + -0.1272f, -0.9981f, -0.1818f, 0.3973f, -0.9963f, + 1.4929f, -1.0109f, 0.4304f, 1.0160f, -1.4590f, + 0.2682f, 1.5658f, 0.1762f, 0.3038f, -0.7491f, + 0.3052f, -1.1534f, -0.0478f, 0.0021f, -0.0665f, + -0.8118f, 0.1310f, 0.2171f, 0.5485f, -0.1610f, + -1.5784f, -0.8660f, 0.7289f, -0.4678f, 0.1937f, + 1.1287f, -0.5772f, -0.0259f, -0.2212f, 0.2479f, + 0.6336f, 0.6407f, -0.6543f, 0.3838f, 0.9039f, + 0.4724f, 0.7117f, 1.0165f, 1.0270f, 1.1908f, + 1.3750f, -0.0850f, 0.5517f, -1.3842f, 0.3703f, + -0.8806f, 0.9336f, 0.8362f, 0.8105f, -1.1566f, + -0.6813f, 0.0294f, -0.1122f, 0.5620f, -0.2884f, + -2.0803f, 0.4684f, 0.6009f, -1.4160f}; + + RunTests(input_data, + position_ids, + cos_cache, + sin_cache, + output_data, + batch_size, + sequence_length, + head_size, + num_heads, + max_sequence_length, + interleaved); +} + +// Interleaved = false, pos ids shape = (1) +TEST(RotaryEmbeddingTest, RotaryEmbedding_NotInterleaved_LargeData_LlamaMSFT) { + int batch_size = 2; + int sequence_length = 8; + int num_heads = 4; + int head_size = 6; + int max_sequence_length = 16; + int64_t interleaved = 0; // false + + std::vector input_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, + 1.1676f, -1.0190f, 0.3157f, -1.6036f, 1.8493f, + 0.0447f, 1.5853f, 0.1036f, -0.3514f, 0.2421f, + 0.6463f, 0.8730f, -0.9276f, 1.0311f, -1.9557f, + -0.1482f, 1.7376f, 2.2039f, -0.6589f, -1.0574f, + -0.1188f, -0.9078f, 0.3452f, -0.5713f, -0.2351f, + -0.5912f, 1.1312f, 0.7562f, -1.2023f, -0.5833f, + -0.4407f, 0.1766f, 1.0224f, -0.4826f, -0.5421f, + -0.5342f, -0.6413f, 1.3314f, -0.4498f, 0.5493f, + 0.0539f, 0.2601f, 0.8570f, 1.0076f, -0.7529f, + -0.2250f, -0.4327f, -1.5071f, -0.4586f, -1.9791f, + 0.7787f, -0.7749f, -0.1398f, 1.1414f, -0.6354f, + 0.0352f, -0.4765f, -0.0409f, 1.1993f, 0.5374f, + -0.1930f, 2.5211f, -0.0452f, -0.3105f, -0.9407f, + -0.0034f, 1.5199f, -0.8480f, 0.5266f, 0.0299f, + -0.0498f, 1.0651f, 0.8860f, -1.4702f, -0.2134f, + -0.8707f, 1.6159f, -0.2356f, 0.9444f, 0.5937f, + 0.7203f, 0.5061f, 1.5192f, -0.4897f, 0.9231f, + 0.2654f, -0.1441f, 0.5407f, -1.5476f, 0.6455f, + -1.1382f, 0.4640f, -0.4986f, 0.1289f, 2.7631f, + 0.1405f, 1.1191f, 2.1134f, -0.9754f, 0.1757f, + -0.1319f, -0.2735f, 0.3355f, -0.6008f, -1.1164f, + 0.2577f, -0.7226f, -0.9244f, 1.8737f, 0.6052f, + 1.1904f, 1.2195f, -0.0470f, -1.0914f, 1.0223f, + 0.3152f, 1.7528f, -0.7650f, 
1.8299f, -0.2784f, + -0.2719f, 0.1885f, 2.1432f, 0.8527f, 0.0965f, + -0.0625f, 0.8269f, 1.0122f, -1.4482f, -0.0644f, + 0.3215f, 0.5908f, -1.4197f, 0.2113f, 0.0306f, + 0.3604f, 0.3166f, -0.8975f, -0.6393f, -1.2944f, + -0.0243f, -0.2354f, -0.7087f, 1.1566f, 0.4296f, + 0.5599f, -0.7776f, 0.3339f, 0.1759f, 2.1108f, + 1.0702f, 0.8279f, -0.2969f, 0.7120f, -0.2068f, + -0.1548f, 0.1553f, 0.6207f, -0.1690f, -0.5816f, + 1.2632f, 0.0695f, 1.1862f, -1.1874f, -0.7468f, + -0.9320f, -0.8579f, -0.9647f, -0.0991f, 0.0195f, + 1.1213f, -1.4873f, -0.2043f, -1.0466f, -1.5772f, + -0.0489f, 0.3430f, 0.1264f, 0.1519f, -1.3639f, + -1.6593f, 1.8127f, -1.4459f, -0.2158f, -0.9792f, + -1.4392f, 0.6508f, 0.8964f, 0.5717f, -0.2390f, + 0.6983f, -1.3416f, 0.2715f, -0.2852f, 0.6051f, + 0.2167f, -0.2181f, -1.6306f, 1.4788f, 0.2754f, + -0.0261f, -0.4618f, -0.5646f, -1.0389f, 0.5819f, + 1.3697f, 0.0002f, 1.5333f, -1.0556f, -0.1254f, + 0.1527f, -0.5996f, -1.0962f, 1.6327f, 1.3951f, + 0.8784f, 0.3389f, 1.2907f, 0.3124f, 0.7299f, + 1.4220f, 0.3375f, 0.0438f, 1.8698f, -0.2635f, + -2.0799f, -0.6313f, 0.4090f, -1.1458f, 0.0784f, + -1.8848f, -1.6165f, 0.6179f, 0.9905f, -0.0729f, + 0.5054f, -0.6681f, -1.4382f, 1.7547f, -0.9605f, + -0.4558f, -1.6105f, 0.2979f, 1.1537f, -1.5604f, + 1.2779f, -1.2514f, 0.6056f, 0.5763f, -3.3558f, + 0.2836f, 0.6909f, -0.7631f, 2.4451f, -0.3500f, + 1.3289f, -0.6494f, 0.3478f, 1.0038f, -0.2937f, + 0.9238f, -1.2185f, 0.4138f, 0.5033f, 0.9174f, + 1.8131f, 1.4436f, -0.4207f, 0.0220f, -0.6807f, + -1.3306f, 1.5646f, 0.3338f, 0.7105f, 0.4683f, + -0.6179f, 0.0818f, -0.0488f, -0.9810f, -1.3632f, + 0.0929f, -1.7926f, -0.2921f, -0.4792f, 0.6756f, + -0.3413f, -0.2242f, -0.2111f, 0.6282f, 0.1667f, + -1.4055f, 1.5895f, 1.0838f, -0.9077f, -0.8060f, + 0.7967f, -2.9351f, 2.4179f, -0.4026f, 0.6451f, + 1.6845f, -0.0901f, 0.6106f, 2.3603f, 1.3908f, + -0.7917f, -0.6734f, -0.1213f, -1.1116f, -0.7401f, + -0.7879f, 0.0606f, -2.3337f, -1.2603f, -1.7245f, + -0.3533f, -0.9421f, -0.1776f, 0.3992f, -1.7142f, + -0.5319f, -0.8848f, 0.6513f, 1.0002f, -1.4699f, + -1.4254f, 0.7013f, 0.2414f, 0.2551f, -0.7457f, + 0.3133f, -1.0941f, -0.3682f, -0.0163f, -0.0645f, + -0.8101f, 0.1415f, 0.0551f, 0.5873f, -0.5887f, + -1.4733f, -0.8565f, 0.7400f, -0.5033f, 0.0553f, + 0.9265f, -0.8652f, -0.0288f, -0.2209f, 0.0610f, + 0.6776f, 0.4361f, -0.8052f, 0.3955f, 0.8988f, + 0.8238f, 0.2262f, 1.2912f, 0.6488f, 1.2114f, + 1.3569f, 0.2983f, 0.4718f, -1.1936f, 0.7928f, + -0.8665f, 0.9468f, 1.1629f, 0.0616f, -1.3136f, + -0.2764f, 0.0277f, -0.1126f, 0.2342f, -0.5866f, + -1.8219f, 1.1079f, 0.5795f, -1.4249f}; + + std::vector position_ids = {0}; + + std::vector cos_cache = { + 1.0000f, 1.0000f, 1.0000f, 0.5403f, 0.9989f, 1.0000f, -0.4161f, 0.9957f, + 1.0000f, -0.9900f, 0.9903f, 1.0000f, -0.6536f, 0.9828f, 1.0000f, 0.2837f, + 0.9732f, 0.9999f, 0.9602f, 0.9615f, 0.9999f, 0.7539f, 0.9477f, 0.9999f, + -0.1455f, 0.9318f, 0.9999f, -0.9111f, 0.9140f, 0.9998f, -0.8391f, 0.8942f, + 0.9998f, 0.0044f, 0.8725f, 0.9997f, 0.8439f, 0.8488f, 0.9997f, 0.9074f, + 0.8234f, 0.9996f, 0.1367f, 0.7962f, 0.9995f, -0.7597f, 0.7673f, 0.9995f}; + + std::vector sin_cache = { + 0.0000f, 0.0000f, 0.0000f, 0.8415f, 0.0464f, 0.0022f, 0.9093f, 0.0927f, + 0.0043f, 0.1411f, 0.1388f, 0.0065f, -0.7568f, 0.1846f, 0.0086f, -0.9589f, + 0.2300f, 0.0108f, -0.2794f, 0.2749f, 0.0129f, 0.6570f, 0.3192f, 0.0151f, + 0.9894f, 0.3629f, 0.0172f, 0.4121f, 0.4057f, 0.0194f, -0.5440f, 0.4477f, + 0.0215f, -1.0000f, 0.4887f, 0.0237f, -0.5366f, 0.5286f, 0.0259f, 0.4202f, + 0.5675f, 0.0280f, 0.9906f, 0.6050f, 0.0302f, 
0.6503f, 0.6413f, 0.0323f}; + + std::vector output_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, + 1.1676f, -1.0190f, 0.3157f, -1.6036f, 1.8493f, + 0.0447f, 1.5853f, 0.1036f, -0.3514f, 0.2421f, + 0.6463f, 0.8730f, -0.9276f, 1.0311f, -1.9557f, + -0.1482f, 1.7376f, 2.2039f, -0.6589f, -0.8618f, + -0.0922f, -0.9073f, -0.7032f, -0.5762f, -0.2371f, + 0.6923f, 1.1571f, 0.7572f, -1.1471f, -0.5302f, + -0.4391f, 0.5516f, 1.0461f, -0.4812f, -0.1443f, + -0.4862f, -0.6423f, 0.6740f, -0.4614f, 0.5475f, + 1.1495f, 0.2389f, 0.8582f, -0.0259f, -0.6099f, + -0.2230f, 1.0963f, -1.5704f, -0.4595f, 0.9507f, + 0.6696f, -0.7721f, -1.7415f, 1.2087f, -0.6387f, + -1.1052f, -0.5243f, -0.0400f, -0.4671f, 0.4909f, + -0.1931f, -0.1937f, -0.0447f, -0.3171f, 2.6839f, + -0.0076f, 1.5185f, 0.8465f, 0.3737f, 0.0242f, + -0.0703f, 1.1279f, 0.8862f, 1.2275f, -0.1786f, + -0.8767f, -1.8072f, -0.2630f, 0.9387f, -0.8021f, + 0.7813f, 0.5001f, -1.4202f, -0.3850f, 0.9263f, + -0.0443f, -0.2323f, 0.5480f, 1.5696f, 0.6193f, + -1.1346f, 1.7878f, -0.5160f, 0.1192f, -2.1572f, + 0.0460f, 1.1202f, -1.4812f, -0.9082f, 0.1728f, + -1.5132f, -0.4489f, 0.3370f, -0.1541f, -0.9266f, + 0.2416f, 0.9270f, -1.1146f, 1.8758f, -0.4312f, + 1.3714f, 1.2106f, -0.4272f, -0.8529f, 1.0328f, + 1.8441f, 1.7698f, -0.7620f, 0.2168f, 0.1322f, + -0.2802f, 0.1460f, 2.1002f, 0.8437f, -0.1534f, + 0.4321f, 0.8360f, 0.5955f, -1.5452f, -0.0491f, + -0.8794f, 0.2418f, -1.4203f, 0.3635f, 0.2362f, + 0.3672f, -0.1128f, -0.8664f, -0.6354f, -1.4409f, + -0.3413f, -0.2409f, -0.3188f, 1.1054f, 0.4265f, + 0.5867f, -1.3279f, 0.3201f, 0.0125f, 1.8157f, + 1.0745f, 0.7372f, -0.2429f, 0.7100f, -0.4299f, + -0.2304f, 0.1645f, 0.9489f, -0.1816f, -0.5968f, + 1.0394f, 0.0204f, 1.1786f, -0.3315f, -0.3997f, + -0.9304f, -1.4268f, -1.1526f, -0.1132f, 0.1490f, + 1.3967f, -1.4634f, -0.1412f, -0.6339f, -1.5995f, + -0.1366f, 0.7604f, 0.1514f, 0.0824f, -1.1830f, + -1.6572f, 2.0099f, -0.9108f, -0.2256f, 0.4527f, + -1.8254f, 0.6475f, 0.8964f, 0.5717f, -0.2390f, + 0.6983f, -1.3416f, 0.2715f, -0.2852f, 0.6051f, + 0.2167f, -0.2181f, -1.6306f, 1.4788f, 0.2754f, + -0.0261f, -0.4618f, -0.5646f, -1.0389f, 0.5819f, + 1.3697f, 0.0002f, 1.5333f, -1.0556f, -0.1254f, + 0.1527f, -1.4979f, -1.1358f, 1.6320f, 0.2493f, + 0.8266f, 0.3424f, -0.4992f, 0.2964f, 0.7298f, + 1.8544f, 0.3516f, 0.0454f, 1.5415f, -0.2822f, + -2.0774f, 1.2323f, 0.3963f, -1.1503f, -0.4775f, + -1.9287f, -1.6164f, 0.3998f, 0.9020f, -0.0764f, + -1.8059f, -0.5762f, -1.4362f, -0.2706f, -1.0183f, + -0.4620f, 2.0891f, 0.1782f, 1.1591f, -0.8151f, + 1.3000f, -1.2464f, -0.5099f, 0.5098f, -3.3525f, + 0.4326f, 0.7414f, -0.7775f, -0.4271f, -0.3807f, + 1.3245f, 2.4936f, 0.3139f, 1.0095f, 0.2323f, + 0.8450f, -1.2244f, -0.4511f, 0.6266f, 0.9095f, + -1.7981f, 1.5241f, -0.4121f, 0.2341f, -0.4737f, + -1.3333f, -1.6150f, 0.4164f, 0.7100f, -0.2429f, + -0.5656f, 0.0863f, 0.0352f, -0.7227f, -1.3613f, + -0.0988f, -1.9114f, -0.3009f, 0.1435f, 0.7029f, + -0.3467f, 0.5092f, -0.0828f, 0.6253f, 0.7113f, + -1.2138f, 1.5964f, -0.8346f, -1.1515f, -0.7923f, + -0.8254f, -3.0038f, 2.4033f, -0.3398f, 0.0922f, + 1.7053f, 1.1114f, 0.7462f, 2.3660f, -0.8409f, + -0.6654f, -0.6530f, -0.7899f, -1.0957f, -0.7149f, + -0.1072f, -0.1967f, -2.3416f, -1.2609f, -1.6375f, + -0.3576f, 0.9413f, -0.5694f, 0.3954f, 0.1383f, + -0.7477f, -0.8689f, 1.8286f, 0.8510f, -1.4793f, + -0.1597f, 0.8541f, 0.2380f, 1.4392f, -0.5644f, + 0.3158f, -1.0686f, -0.1313f, -0.0181f, 0.2438f, + -0.8801f, 0.1413f, -0.3587f, 0.8002f, -0.5982f, + -1.4301f, -0.6620f, 0.7324f, -0.7250f, 0.0610f, + 0.9293f, 
-0.6902f, -0.0125f, -0.2089f, -0.1664f, + 0.5428f, 0.4245f, -0.7901f, 0.5665f, 0.9044f, + 0.1948f, -0.1723f, 1.2705f, 1.0303f, 1.2202f, + 1.3762f, -0.2959f, 0.7237f, -1.2077f, 0.7937f, + -0.6705f, 0.9287f, 1.0583f, 0.0496f, -1.3118f, + 0.5556f, 0.0459f, -0.1324f, -0.5513f, -0.7409f, + -1.8002f, 0.9892f, 0.3619f, -1.4522f}; + + RunTests(input_data, + position_ids, + cos_cache, + sin_cache, + output_data, + batch_size, + sequence_length, + head_size, + num_heads, + max_sequence_length, + interleaved); +} + +// Interleaved = false, pos ids shape = (batch_size, sequence_length) +TEST(RotaryEmbeddingTest, RotaryEmbedding_NotInterleaved_SmallData_LlamaMSFT) { + int batch_size = 1; + int sequence_length = 2; + int num_heads = 3; + int head_size = 6; + int max_sequence_length = 4; + int64_t interleaved = 0; // false + + std::vector input_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, 1.1676f, 1.0076f, -0.7529f, + -0.2250f, -0.4327f, -1.5071f, -0.4586f, -0.8663f, -0.2656f, 0.1665f, 0.7911f, + -0.9320f, -0.8579f, -1.0574f, -0.1188f, -0.9078f, 0.3452f, -0.5713f, -0.2351f, + -0.8480f, 0.5266f, -1.2944f, -0.0243f, -0.2354f, -0.7087f, -0.9647f, -0.0991f, + -0.2994f, -0.0650f, -1.5720f, -1.3211f}; + + std::vector position_ids = {0, 1}; + + std::vector cos_cache = { + 1.0000f, 1.0000f, 1.0000f, 0.5403f, 0.9989f, 1.0000f, -0.4161f, 0.9957f, + 1.0000f, -0.9900f, 0.9903f, 1.0000f}; + + std::vector sin_cache = { + 0.0000f, 0.0000f, 0.0000f, 0.8415f, 0.0464f, 0.0022f, 0.9093f, 0.0927f, 0.0043f, + 0.1411f, 0.1388f, 0.0065f}; + + std::vector output_data = { + -1.0408f, 0.9166f, -1.3042f, -1.1097f, -1.2188f, 1.1676f, 1.0076f, -0.7529f, + -0.2250f, -0.4327f, -1.5071f, -0.4586f, -0.8663f, -0.2656f, 0.1665f, 0.7911f, + -0.9320f, -0.8579f, -0.8618f, -0.0922f, -0.9073f, -0.7032f, -0.5762f, -0.2371f, + -0.4377f, 0.5370f, -1.2929f, -0.7267f, -0.2107f, -0.7115f, -0.4666f, -0.0261f, + -0.2965f, -0.8469f, -1.5749f, -1.3217f}; + + RunTests(input_data, + position_ids, + cos_cache, + sin_cache, + output_data, + batch_size, + sequence_length, + head_size, + num_heads, + max_sequence_length, + interleaved); +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/framework/function_test.cc b/onnxruntime/test/framework/function_test.cc index 6e745776ab6b0..41274ee0dedfa 100644 --- a/onnxruntime/test/framework/function_test.cc +++ b/onnxruntime/test/framework/function_test.cc @@ -6,36 +6,47 @@ #include "onnx/defs/parser.h" #include "core/common/span_utils.h" -#include "core/framework/float8.h" +#include "core/framework/customregistry.h" +#include "core/framework/op_kernel.h" #include "core/graph/model.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/inference_session.h" #include "test/test_environment.h" #include "test/framework/test_utils.h" +#include "inference_session_wrapper.h" #include "test/common/tensor_op_test_utils.h" #include "test/util/include/asserts.h" +#include "test/providers/internal_testing/internal_testing_execution_provider.h" + // Unit tests to check the implementation of functions, model-local functions, // function-inlining etc. 
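// Reference for the RotaryEmbedding cases above: the expected outputs follow the
// usual RoPE rotation. A minimal sketch (an illustration, not the contrib-op
// kernel), assuming the cos/sin caches are laid out as
// [max_sequence_length, head_size / 2] and that a position_ids tensor of shape (1)
// holds the start offset, so token t reads cache row t. Row 0 is (cos = 1, sin = 0),
// so the first token passes through unchanged, which is why the leading values of
// output_data match input_data in the large-data cases.
#include <vector>

void rope_one_head(const float* x, float* out, int head_size, int pos,
                   const std::vector<float>& cos_cache,
                   const std::vector<float>& sin_cache, bool interleaved) {
  const int half = head_size / 2;
  const float* c = cos_cache.data() + static_cast<size_t>(pos) * half;
  const float* s = sin_cache.data() + static_cast<size_t>(pos) * half;
  for (int i = 0; i < half; ++i) {
    // Interleaved rotates the pair (x[2i], x[2i+1]); non-interleaved rotates the
    // two halves (x[i], x[i + half]) against each other.
    const int i0 = interleaved ? 2 * i : i;
    const int i1 = interleaved ? 2 * i + 1 : i + half;
    out[i0] = x[i0] * c[i] - x[i1] * s[i];
    out[i1] = x[i0] * s[i] + x[i1] * c[i];
  }
}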
namespace onnxruntime { namespace test { -static void Check(const char* source, - const char* input_name, std::vector input_values, - const char* output_name, std::vector output_values) { - // Convert source-representation of model to ModelProto: +// Convert source-representation of model to ModelProto: +static void ParseOnnxSource(const char* source, std::string& result) { ONNX_NAMESPACE::OnnxParser parser(source); ONNX_NAMESPACE::ModelProto model; auto parse_status = parser.Parse(model); ASSERT_TRUE(parse_status.IsOK()) << parse_status.ErrorMessage(); ASSERT_TRUE(parser.EndOfInput()) << "Extra unparsed input unexpected."; - // Serialize and then load model: + // Serialize std::string serialized_model; const bool serialization_status = model.SerializeToString(&serialized_model); ASSERT_TRUE(serialization_status) << "Failed to serialize proto to string"; + result = std::move(serialized_model); +} + +static void Check(const char* source, + const char* input_name, std::vector input_values, + const char* output_name, std::vector output_values) { + // Serialize and then load model: + std::string serialized_model; + ParseOnnxSource(source, serialized_model); SessionOptions session_options; InferenceSession session_object{session_options, GetEnvironment()}; @@ -76,8 +87,8 @@ static void Check(const char* source, } } -TEST(FunctionTest, Basic) { - const char* code = R"( +namespace { +const char* basic_code = R"( < ir_version: 8, opset_import: [ "" : 16, "local" : 1 ] @@ -96,8 +107,10 @@ TEST(FunctionTest, Basic) { ly = Mul (lx, two) } )"; +} - Check(code, "x", {1.0, 2.0, 3.0}, "y", {2.0, 4.0, 6.0}); +TEST(FunctionTest, Basic) { + Check(basic_code, "x", {1.0, 2.0, 3.0}, "y", {2.0, 4.0, 6.0}); } // Check that variables are renamed to avoid conflicts when multiple @@ -521,5 +534,56 @@ TEST(FunctionTest, ConstantFoldingInSubGraph) { Check(code, "X", {1.0, 2.0, 3.0}, "Y", {3.0, 4.0, 5.0, 3.0, 4.0, 5.0, 3.0, 4.0, 5.0}); } +TEST(FunctionTest, TestInlinedLocalFunctionRemoved) { + std::string serialized_model; + ParseOnnxSource(basic_code, serialized_model); + + // Default is to do AOT Function inlining + SessionOptions session_options; + InferenceSessionWrapper session_object{session_options, GetEnvironment()}; + + std::stringstream sstr(serialized_model); + auto status = session_object.Load(sstr); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + auto model_proto = session_object.GetModel().ToProto(); + ASSERT_EQ(1, model_proto.functions_size()); + + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()) << status.ErrorMessage(); + + // All functions removed + model_proto = session_object.GetModel().ToProto(); + ASSERT_EQ(0, model_proto.functions_size()); +} + +TEST(FunctionTest, TestInlinedLocalFunctionNotRemoved) { + std::string serialized_model; + ParseOnnxSource(basic_code, serialized_model); + + // Default is to do AOT Function inlining + SessionOptions session_options; + InferenceSessionWrapper session_object{session_options, GetEnvironment()}; + + using InternalTestingEP = onnxruntime::internal_testing_ep::InternalTestingExecutionProvider; + const std::unordered_set empty_set; + auto internal_testing_ep = std::make_unique(empty_set, empty_set, DataLayout::NCHW); + internal_testing_ep->EnableStaticKernels().TakeAllNodes(); + + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(internal_testing_ep))); + + std::stringstream sstr(serialized_model); + ASSERT_STATUS_OK(session_object.Load(sstr)); + + auto model_proto = session_object.GetModel().ToProto(); + 
ASSERT_EQ(1, model_proto.functions_size()); + + ASSERT_STATUS_OK(session_object.Initialize()); + + // myfun is not removed because it was claimed by InternalTestingEP + model_proto = session_object.GetModel().ToProto(); + ASSERT_EQ(1, model_proto.functions_size()); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index bc88f69fa990f..47c3798721679 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -944,6 +944,20 @@ std::unique_ptr> GetBrokenTests(const std::string& provider {"simple_rnn_batchwise", "type error", {}}, {"mod_float_mixed_sign_example", "fmod attribute must be true for floating point types", {}}, {"col2im_pads", "result mismatch", {"opset18"}}, + {"gridsample_volumetric_nearest_align_corners_0", "result differs", {}}, + {"gridsample_volumetric_nearest_align_corners_1", "result differs", {}}, + {"reduce_l1_empty_set", "unknown version", {}}, + {"reduce_l1_empty_set_expanded", "unknown version", {}}, + {"reduce_l2_empty_set", "unknown version", {}}, + {"reduce_l2_empty_set_expanded", "unknown version", {}}, + {"reduce_log_sum_empty_set", "unknown version", {}}, + {"reduce_log_sum_empty_set_expanded", "unknown version", {}}, + {"reduce_log_sum_exp_empty_set", "unknown version", {}}, + {"reduce_log_sum_exp_empty_set_expanded", "unknown version", {}}, + {"reduce_prod_empty_set", "unknown version", {}}, + {"reduce_sum_empty_set", "unknown version", {}}, + {"reduce_sum_square_empty_set", "unknown version", {}}, + {"reduce_sum_square_empty_set_expanded", "unknown version", {}}, #ifdef ENABLE_TRAINING_CORE {"adagrad", "not a registered function/op", {}}, // Op not registered. {"adagrad_multiple", "not a registered function/op", {}}, // Op not registered. 
@@ -1339,6 +1353,7 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"gridsample_reflection_padding", "result differs"}); broken_tests->insert({"spacetodepth", "result differs"}); } + #ifdef DISABLE_CONTRIB_OPS broken_tests->insert({"coreml_SqueezeNet_ImageNet", "This model uses contrib ops."}); broken_tests->insert({"keras2coreml_Permute_ImageNet", "This model uses contrib ops."}); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 6acf631d53cd9..46b95a127b75c 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -31,6 +31,7 @@ #include "core/optimizer/conv_add_act_fusion.h" #include "core/optimizer/conv_add_fusion.h" #include "core/optimizer/conv_bn_fusion.h" +#include "core/optimizer/matmul_bn_fusion.h" #include "core/optimizer/conv_mul_fusion.h" #include "core/optimizer/div_mul_fusion.h" #include "core/optimizer/dropout_elimination.h" @@ -1079,6 +1080,268 @@ TEST_F(GraphTransformationTests, FuseConvBNNoBias) { } } +TEST_F(GraphTransformationTests, FuseMatmulBNWithInBetweenNodes) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-with-reshape.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::string expected_output_name; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "MatMul") { + expected_output_name = node.OutputDefs()[0]->Name(); + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["BatchNormalization"], 0); + ASSERT_EQ(op_to_count["MatMul"], 0); + ASSERT_EQ(op_to_count["Gemm"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Gemm") { + ASSERT_EQ(node.OutputDefs()[0]->Name(), expected_output_name) + << "fusion should produce the same output name as the MatMul node"; + } + } +} + +TEST_F(GraphTransformationTests, FuseMatmulBNWithEmptyOptionalOutputWithInBetweenNodes) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-with-reshape.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::string expected_output_name; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "MatMul") { + expected_output_name = node.OutputDefs()[0]->Name(); + } else if (node.OpType() == "BatchNormalization") { + node.MutableOutputDefs().push_back(&graph.GetOrCreateNodeArg("", nullptr)); + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + 
ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["BatchNormalization"], 0); + ASSERT_EQ(op_to_count["MatMul"], 0); + ASSERT_EQ(op_to_count["Gemm"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Gemm") { + ASSERT_EQ(node.OutputDefs()[0]->Name(), expected_output_name) + << "fusion should produce the same output name as the MatMul node"; + } + } +} + +// should not fuse +TEST_F(GraphTransformationTests, FuseMatmulBNWithOptionalOutputWithInBetweenNodes) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-with-reshape.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "BatchNormalization") { + // additional non-empty output to batchNormalization + ONNX_NAMESPACE::TypeProto optional_output_tensor_type; + optional_output_tensor_type.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TypeProto::kTensorType); + auto& arg = graph.GetOrCreateNodeArg("bn_optional_output", &optional_output_tensor_type); + node.MutableOutputDefs().push_back(&arg); + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["BatchNormalization"], 1); + ASSERT_EQ(op_to_count["MatMul"], 1); + ASSERT_EQ(op_to_count["Gemm"], 0); +} + +TEST_F(GraphTransformationTests, FuseMatmulBNDirectly) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-directly.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::string expected_output_name; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "BatchNormalization") { + expected_output_name = node.OutputDefs()[0]->Name(); + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["BatchNormalization"], 0); + ASSERT_EQ(op_to_count["MatMul"], 0); + ASSERT_EQ(op_to_count["Gemm"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Gemm") { + ASSERT_EQ(node.OutputDefs()[0]->Name(), expected_output_name) + << "fusion should produce the same output name as the last node"; + } + } +} + 
+TEST_F(GraphTransformationTests, FuseMatmulBNWithOnlyReshape) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-only-reshape.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::string expected_output_name; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "MatMul") { + expected_output_name = node.OutputDefs()[0]->Name(); + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["BatchNormalization"], 0); + ASSERT_EQ(op_to_count["MatMul"], 0); + ASSERT_EQ(op_to_count["Gemm"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Gemm") { + ASSERT_EQ(node.OutputDefs()[0]->Name(), expected_output_name) + << "fusion should produce the same output name as the MatMul node"; + } + } +} + +TEST_F(GraphTransformationTests, FuseMatmulBNWithOnlyTranspose) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-only-transpose.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::string expected_output_name; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "MatMul") { + expected_output_name = node.OutputDefs()[0]->Name(); + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["BatchNormalization"], 0); + ASSERT_EQ(op_to_count["MatMul"], 0); + ASSERT_EQ(op_to_count["Gemm"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Gemm") { + ASSERT_EQ(node.OutputDefs()[0]->Name(), expected_output_name) + << "fusion should produce the same output name as the MatMul node"; + } + } +} + +TEST_F(GraphTransformationTests, FuseMatmulBNWithoutBatchNormalization) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-only-transpose.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "BatchNormalization") { + graph_utils::RemoveNode(graph, node); + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + 
ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["MatMul"], 1); +} + +// should not fuse +TEST_F(GraphTransformationTests, FuseMatmulBNWithNonIgnorableNode) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-non-ignorable-node.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["BatchNormalization"], 1); + ASSERT_EQ(op_to_count["MatMul"], 1); + ASSERT_EQ(op_to_count["Gemm"], 0); +} + TEST_F(GraphTransformationTests, DontFuseConvWithBNWithOptionalOutputs) { constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-conv-bn-no-bias.onnx"; diff --git a/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc new file mode 100644 index 0000000000000..e37e784f28930 --- /dev/null +++ b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc @@ -0,0 +1,165 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
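// The AffineGrid cases below check sampling grids built from an affine theta. A
// minimal 2D reference (a sketch assuming the torch.nn.functional.affine_grid
// convention, which affine_grid_test_gen.py further below uses to compute the
// expected values): each grid point is theta * [x, y, 1]^T with x and y normalized
// to [-1, 1]; align_corners selects endpoints versus pixel centers. Assumes H, W > 1.
#include <vector>

std::vector<float> affine_grid_2d(const float theta[2][3], int H, int W,
                                  bool align_corners) {
  std::vector<float> grid;  // (H, W, 2) flattened: (gx, gy) per point
  grid.reserve(static_cast<size_t>(H) * W * 2);
  auto coord = [align_corners](int i, int n) {
    return align_corners ? -1.0f + 2.0f * i / (n - 1)      // endpoints map to +/-1
                         : -1.0f + (2.0f * i + 1.0f) / n;  // sample at pixel centers
  };
  for (int h = 0; h < H; ++h) {
    const float y = coord(h, H);
    for (int w = 0; w < W; ++w) {
      const float x = coord(w, W);
      grid.push_back(theta[0][0] * x + theta[0][1] * y + theta[0][2]);
      grid.push_back(theta[1][0] * x + theta[1][1] * y + theta[1][2]);
    }
  }
  return grid;
}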
+ +#include "core/util/math.h" +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { +namespace test { +TEST(AffineGridTest, 2d) { + OpTester test("AffineGrid", 20); + test.AddInput("theta", {1, 2, 3}, {1.0f, 0.0, 0.0f, 0.0f, 1.0, 0.0f}); + test.AddInput("size", {4}, {1, 1, 2, 3}); + test.AddOutput("grid", {1, 2, 3, 2}, + {-0.6667f, -0.5000f, 0.0000f, -0.5000f, 0.6667f, -0.5000f, -0.6667f, 0.5000f, 0.0000f, 0.5000f, 0.6667f, 0.5000f}); + test.Run(); +} + +// following tests code is generated with: +// python onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py +TEST(AffineGridTest, test_2d_0) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); + test.AddInput("size", {4}, {1, 1, 3, 2}); + test.AddOutput("grid", {1, 3, 2, 2}, {-0.3228f, -0.9151f, 1.1544f, -0.7414f, -0.4386f, -0.5868f, 1.0386f, -0.4132f, -0.5544f, -0.2586f, 0.9228f, -0.0849f}); + test.Run(); +} + +TEST(AffineGridTest, test_2d_1) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); + test.AddInput("size", {4}, {2, 10, 2, 3}); + test.AddOutput("grid", {2, 2, 3, 2}, {-0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f, -0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f}); + test.Run(); +} + +TEST(AffineGridTest, test_2d_2) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); + test.AddInput("size", {4}, {1, 1, 3, 2}); + test.AddOutput("grid", {1, 3, 2, 2}, {-0.6726f, -2.7663f, 0.8274f, -1.9003f, -1.2500f, -0.9330f, 0.2500f, -0.0670f, -1.8274f, 0.9003f, -0.3274f, 1.7663f}); + test.Run(); +} + +TEST(AffineGridTest, test_2d_3) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); + test.AddInput("size", {4}, {2, 10, 2, 3}); + test.AddOutput("grid", {2, 2, 3, 2}, {-1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f, -1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f}); + test.Run(); +} + +TEST(AffineGridTest, test_2d_4) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); + test.AddInput("size", {4}, {1, 1, 3, 2}); + test.AddOutput("grid", {1, 3, 2, 2}, {-1.0036f, -1.1661f, 1.9509f, -0.8188f, -1.1772f, -0.6736f, 1.7772f, -0.3264f, -1.3509f, -0.1812f, 1.6036f, 0.1661f}); + test.Run(); +} + +TEST(AffineGridTest, test_2d_5) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); + test.AddInput("size", 
{4}, {2, 10, 2, 3}); + test.AddOutput("grid", {2, 2, 3, 2}, {-1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f, -1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f}); + test.Run(); +} + +TEST(AffineGridTest, test_2d_6) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); + test.AddInput("size", {4}, {1, 1, 3, 2}); + test.AddOutput("grid", {1, 3, 2, 2}, {-1.1340f, -4.1160f, 1.8660f, -2.3840f, -2.0000f, -1.3660f, 1.0000f, 0.3660f, -2.8660f, 1.3840f, 0.1340f, 3.1160f}); + test.Run(); +} + +TEST(AffineGridTest, test_2d_7) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); + test.AddInput("size", {4}, {2, 10, 2, 3}); + test.AddOutput("grid", {2, 2, 3, 2}, {-1.1340f, -4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f, -1.1340f, -4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_0) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); + test.AddInput("size", {5}, {1, 1, 3, 2, 2}); + test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.7468f, -1.3266f, 1.5323f, 0.6627f, -1.2078f, 1.3639f, -0.7468f, 0.6430f, 1.6191f, 0.6627f, 0.7618f, 1.4507f, -0.4048f, -1.5442f, 1.8408f, 1.0048f, -1.4254f, 1.6724f, -0.4048f, 0.4254f, 1.9276f, 1.0048f, 0.5442f, 1.7592f, -0.0627f, -1.7618f, 2.1493f, 1.3468f, -1.6430f, 1.9809f, -0.0627f, 0.2078f, 2.2361f, 1.3468f, 0.3266f, 2.0677f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_1) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); + test.AddInput("size", {5}, {2, 10, 2, 2, 3}); + test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 1.9625f, -0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 1.9625f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_2) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 
1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); + test.AddInput("size", {5}, {1, 1, 3, 2, 2}); + test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.5299f, 0.8995f, -4.3568f, -0.2701f, -0.3995f, -2.9818f, -0.5299f, 2.3995f, 0.4064f, -0.2701f, 1.1005f, 1.7814f, -0.6299f, -0.6005f, -2.7691f, -0.3701f, -1.8995f, -1.3941f, -0.6299f, 0.8995f, 1.9941f, -0.3701f, -0.3995f, 3.3691f, -0.7299f, -2.1005f, -1.1814f, -0.4701f, -3.3995f, 0.1936f, -0.7299f, -0.6005f, 3.5818f, -0.4701f, -1.8995f, 4.9568f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_3) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)0); + test.AddInput("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); + test.AddInput("size", {5}, {2, 10, 2, 2, 3}); + test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 4.7890f, -0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 4.7890f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_4) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); + test.AddInput("size", {5}, {1, 1, 3, 2, 2}); + test.AddOutput("grid", {1, 3, 2, 2, 3}, {-1.6226f, -2.2620f, 1.4189f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, 1.1965f, 1.9147f, 1.2557f, -1.1095f, -2.5884f, 1.8816f, 1.7095f, -2.3508f, 1.5448f, -1.1095f, 1.3508f, 2.0552f, 1.7095f, 1.5884f, 1.7184f, -0.5965f, -2.9147f, 2.3443f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 2.2226f, 1.2620f, 2.1811f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_5) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); + test.AddInput("size", {5}, {2, 10, 2, 2, 3}); + test.AddOutput("grid", {2, 2, 2, 3, 3}, {-1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, -0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f, -1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, 
-0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_6) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); + test.AddInput("size", {5}, {1, 1, 3, 2, 2}); + test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.0902f, 1.9510f, 4.0566f, -0.7598f, -0.7010f, -5.8381f, -0.2402f, -3.2990f, -3.0881f, -0.7598f, 2.2990f, 3.6881f, -0.2402f, -0.2990f, 6.4381f, -0.9098f, -2.9510f, -3.4566f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.3902f, -2.5490f, 8.8197f}); + test.Run(); +} + +TEST(AffineGridTest, test_3d_7) { + OpTester test("AffineGrid", 20); + test.AddAttribute("align_corners", (int64_t)1); + test.AddInput("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); + test.AddInput("size", {5}, {2, 10, 2, 2, 3}); + test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f, -0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f}); + test.Run(); +} +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py b/onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py new file mode 100644 index 0000000000000..22bad6f1be534 --- /dev/null +++ b/onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py @@ -0,0 +1,111 @@ +import argparse + +import numpy as np +import torch +from torch.nn.functional import affine_grid + +opset_version = 20 +parser = argparse.ArgumentParser(description="Generate test cases for the AffineGrid operator.") +parser.add_argument("--dim", type=int, choices=[2, 3], help="Dimension of the test cases (2 or 3)") +args = parser.parse_args() + +if args.dim is None or args.dim == 2: + align_corners_options = [False, True] + angles = [10, 60] + translations = [np.array([0.3, -0.5]), np.array([-0.5, -0.5])] + scales = [np.array([1.5, 0.5]), np.array([3.0, 5.5])] + sizes = [[1, 1, 3, 2], [2, 10, 2, 3]] + test_count = 0 + + for align_corners in align_corners_options: + for angle, translation, scale in zip(angles, translations, scales): + for size in sizes: + theta = np.array([], dtype=np.float32) + for _ in range(size[0]): + angle_radian = (angle / 180.0) * np.pi + theta = np.append( + theta, + [ + np.cos(angle_radian) * scale[0], + -np.sin(angle_radian), + translation[0], + np.sin(angle_radian), + 
np.cos(angle_radian) * scale[1], + translation[1], + ], + ) + theta = theta.reshape(size[0], 2, 3) + theta = torch.Tensor(theta) + grid = affine_grid(theta, size, align_corners=align_corners) + + # Print the C++ code for the test case + print(f"TEST(AffineGridTest, test_2d_{test_count}) {{") + print(f' OpTester test("AffineGrid", {opset_version});') + print(f' test.AddAttribute("align_corners", (int64_t){1 if align_corners else 0});') + print( + f" test.AddInput(\"theta\", {{{theta.shape[0]}, {theta.shape[1]}, {theta.shape[2]}}}, {{{', '.join([f'{x:.6f}f' for x in theta.flatten()])}}});" + ) + print( + f' test.AddInput("size", {{{len(size)}}}, {{{size[0]}, {size[1]}, {size[2]}, {size[3]}}});' + ) + print( + f" test.AddOutput(\"grid\", {{{size[0]}, {size[2]}, {size[3]}, 2}}, {{{', '.join([f'{x:.4f}f' for x in grid.flatten()])}}});" + ) + print(" test.Run();") + print("}\n") + test_count += 1 + + +if args.dim is None or args.dim == 3: + align_corners_options = [False, True] + angles = [[10, 20], [60, -30]] + translations = [np.array([0.3, -0.5, 1.8]), np.array([-0.5, -0.5, 0.3])] + scales = [np.array([1.5, 2.0, 0.5]), np.array([0.3, 3.0, 5.5])] + sizes = [[1, 1, 3, 2, 2], [2, 10, 2, 2, 3]] + test_count = 0 + + for align_corners in align_corners_options: + for angle, translation, scale in zip(angles, translations, scales): + for size in sizes: + theta = np.array([], dtype=np.float32) + for _ in range(size[0]): + angle_radian_x = (angle[0] / 180.0) * np.pi + angle_radian_y = (angle[1] / 180.0) * np.pi + rot_matrix_x = np.array( + [ + [1, 0, 0], + [0, np.cos(angle_radian_x), -np.sin(angle_radian_x)], + [0, np.sin(angle_radian_x), np.cos(angle_radian_x)], + ] + ) + rot_matrix_y = np.array( + [ + [np.cos(angle_radian_y), 0, np.sin(angle_radian_y)], + [0, 1, 0], + [-np.sin(angle_radian_y), 0, np.cos(angle_radian_y)], + ] + ) + rot_matrix = np.matmul(rot_matrix_x, rot_matrix_y) + rot_matrix = rot_matrix * scale.reshape(3, 1) + rot_matrix = np.append(rot_matrix, np.reshape(translation, (3, 1)), axis=1) + theta = np.append(theta, rot_matrix.flatten()) + theta = theta.reshape(size[0], 3, 4) + theta = torch.Tensor(theta) + grid = affine_grid(theta, size, align_corners=align_corners) + + # Print the C++ code for the test case + print(f"TEST(AffineGridTest, test_3d_{test_count}) {{") + print(f' OpTester test("AffineGrid", {opset_version});') + print(f' test.AddAttribute("align_corners", (int64_t){1 if align_corners else 0});') + print( + f" test.AddInput(\"theta\", {{{theta.shape[0]}, {theta.shape[1]}, {theta.shape[2]}}}, {{{', '.join([f'{x:.6f}f' for x in theta.flatten()])}}});" + ) + print( + f' test.AddInput("size", {{{len(size)}}}, {{{size[0]}, {size[1]}, {size[2]}, {size[3]}, {size[4]}}});' + ) + print( + f" test.AddOutput(\"grid\", {{{size[0]}, {size[2]}, {size[3]}, {size[4]}, 3}}, {{{', '.join([f'{x:.4f}f' for x in grid.flatten()])}}});" + ) + print(" test.Run();") + print("}\n") + test_count += 1 diff --git a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc index ddb392eb82e13..2e583c5d2547b 100644 --- a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc @@ -17,85 +17,137 @@ constexpr double DOUBLE_INF = std::numeric_limits::infinity(); constexpr double DOUBLE_NINF = -std::numeric_limits::infinity(); constexpr double DOUBLE_NAN = std::numeric_limits::quiet_NaN(); -TEST(IsInfTest, test_isinf_float) { - // Defaults for detect_negative = 1 - // detect_positive = 1 - OpTester 
test("IsInf", 10); +template +void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list& input, const std::initializer_list& output) { + OpTester test("IsInf", opset); + test.AddAttribute("detect_positive", detect_positive); + test.AddAttribute("detect_negative", detect_negative); + test.AddInput("X", {onnxruntime::narrow(input.size())}, input); + test.AddOutput("Y", {onnxruntime::narrow(output.size())}, output); + test.Run(); +} - std::vector input_dim{6}; - std::vector input = {-1.2f, FLOAT_NAN, FLOAT_INF, 2.8f, FLOAT_NINF, FLOAT_INF}; - test.AddInput("X", input_dim, input); +TEST(IsInfTest, test_isinf_float10) { + std::initializer_list input = {-1.2f, FLOAT_NAN, FLOAT_INF, 2.8f, FLOAT_NINF, FLOAT_INF}; + std::initializer_list output = {false, false, true, false, true, true}; + run_is_inf_test(10, 1, 1, input, output); +} - std::vector output_dim(input_dim); - test.AddOutput("Y", output_dim, {false, false, true, false, true, true}); - test.Run(); +TEST(IsInfTest, test_isinf_float20) { + std::initializer_list input = {-1.2f, FLOAT_NAN, FLOAT_INF, 2.8f, FLOAT_NINF, FLOAT_INF}; + std::initializer_list output = {false, false, true, false, true, true}; + run_is_inf_test(20, 1, 1, input, output); } -TEST(IsInfTest, test_isinf_double) { - // Defaults for detect_negative = 1 - // detect_positive = 1 - OpTester test("IsInf", 10); +TEST(IsInfTest, test_isinf_double10) { + std::initializer_list input = {-1.2, DOUBLE_NAN, DOUBLE_INF, 2.8, DOUBLE_NINF, DOUBLE_INF}; + std::initializer_list output = {false, false, true, false, true, true}; + run_is_inf_test(10, 1, 1, input, output); +} - std::vector input_dim{6}; - std::vector input = {-1.2, DOUBLE_NAN, DOUBLE_INF, 2.8, DOUBLE_NINF, DOUBLE_INF}; - test.AddInput("X", input_dim, input); +TEST(IsInfTest, test_isinf_double20) { + std::initializer_list input = {-1.2, DOUBLE_NAN, DOUBLE_INF, 2.8, DOUBLE_NINF, DOUBLE_INF}; + std::initializer_list output = {false, false, true, false, true, true}; + run_is_inf_test(20, 1, 1, input, output); +} - std::vector output_dim(input_dim); - test.AddOutput("Y", output_dim, {false, false, true, false, true, true}); - test.Run(); +TEST(IsInfTest, test_isinf_positive_float10) { + std::initializer_list input = {-1.7f, FLOAT_NAN, FLOAT_INF, 3.6f, FLOAT_NINF, FLOAT_INF}; + std::initializer_list output = {false, false, true, false, false, true}; + run_is_inf_test(10, 1, 0, input, output); } -TEST(IsInfTest, test_isinf_positive_float) { - OpTester test("IsInf", 10); - test.AddAttribute("detect_negative", 0); +TEST(IsInfTest, test_isinf_positive_float20) { + std::initializer_list input = {-1.7f, FLOAT_NAN, FLOAT_INF, 3.6f, FLOAT_NINF, FLOAT_INF}; + std::initializer_list output = {false, false, true, false, false, true}; + run_is_inf_test(20, 1, 0, input, output); +} - std::vector input_dim{6}; - std::vector input = {-1.7f, FLOAT_NAN, FLOAT_INF, 3.6f, FLOAT_NINF, FLOAT_INF}; - test.AddInput("X", input_dim, input); +TEST(IsInfTest, test_isinf_positive_double10) { + std::initializer_list input = {-1.7, DOUBLE_NAN, DOUBLE_INF, 3.6, DOUBLE_NINF, DOUBLE_INF}; + std::initializer_list output = {false, false, true, false, false, true}; + run_is_inf_test(10, 1, 0, input, output); +} - std::vector output_dim(input_dim); - test.AddOutput("Y", output_dim, {false, false, true, false, false, true}); - test.Run(); +TEST(IsInfTest, test_isinf_positive_double20) { + std::initializer_list input = {-1.7, DOUBLE_NAN, DOUBLE_INF, 3.6, DOUBLE_NINF, DOUBLE_INF}; + std::initializer_list output = {false, 
false, true, false, false, true}; + run_is_inf_test(20, 1, 0, input, output); +} + +TEST(IsInfTest, test_isinf_negative_float10) { + std::initializer_list input = {-1.7f, FLOAT_NAN, FLOAT_INF, 3.6f, FLOAT_NINF, FLOAT_INF}; + std::initializer_list output = {false, false, false, false, true, false}; + run_is_inf_test(10, 0, 1, input, output); } -TEST(IsInfTest, test_isinf_positive_double) { - OpTester test("IsInf", 10); - test.AddAttribute("detect_negative", 0); +TEST(IsInfTest, test_isinf_negative_float20) { + std::initializer_list input = {-1.7f, FLOAT_NAN, FLOAT_INF, 3.6f, FLOAT_NINF, FLOAT_INF}; + std::initializer_list output = {false, false, false, false, true, false}; + run_is_inf_test(20, 0, 1, input, output); +} - std::vector input_dim{6}; - std::vector input = {-1.7, DOUBLE_NAN, DOUBLE_INF, 3.6, DOUBLE_NINF, DOUBLE_INF}; - test.AddInput("X", input_dim, input); +TEST(IsInfTest, test_isinf_negative_double10) { + std::initializer_list input = {-1.7, DOUBLE_NAN, DOUBLE_INF, 3.6, DOUBLE_NINF, DOUBLE_INF}; + std::initializer_list output = {false, false, false, false, true, false}; + run_is_inf_test(10, 0, 1, input, output); +} - std::vector output_dim(input_dim); - test.AddOutput("Y", output_dim, {false, false, true, false, false, true}); - test.Run(); +TEST(IsInfTest, test_isinf_negative_double20) { + std::initializer_list input = {-1.7, DOUBLE_NAN, DOUBLE_INF, 3.6, DOUBLE_NINF, DOUBLE_INF}; + std::initializer_list output = {false, false, false, false, true, false}; + run_is_inf_test(20, 0, 1, input, output); } -TEST(IsInfTest, test_isinf_negative_float) { - OpTester test("IsInf", 10); - test.AddAttribute("detect_positive", 0); +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(IsInfTest, test_Float8E4M3FN) { + std::initializer_list input = { + Float8E4M3FN(-1.0f), Float8E4M3FN(FLOAT_NAN, false), Float8E4M3FN(1.0f), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_INF, false)}; + std::initializer_list output = {false, false, false, false, false, false}; + run_is_inf_test(20, 1, 1, input, output); +} - std::vector input_dim{6}; - std::vector input = {-1.7f, FLOAT_NAN, FLOAT_INF, 3.6f, FLOAT_NINF, FLOAT_INF}; - test.AddInput("X", input_dim, input); +TEST(IsInfTest, test_Float8E4M3FNUZ) { + std::initializer_list input = { + Float8E4M3FNUZ(-1.0f), Float8E4M3FNUZ(FLOAT_NAN, false), Float8E4M3FNUZ(1.0f), Float8E4M3FNUZ(FLOAT_NINF, false), Float8E4M3FNUZ(FLOAT_NINF, false), Float8E4M3FNUZ(FLOAT_INF, false)}; + std::initializer_list output = {false, false, false, false, false, false}; + run_is_inf_test(20, 1, 1, input, output); +} - std::vector output_dim(input_dim); - test.AddOutput("Y", output_dim, {false, false, false, false, true, false}); - test.Run(); +TEST(IsInfTest, test_Float8E5M2_detect_both) { + std::initializer_list input = { + Float8E5M2(-1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(FLOAT_NAN, false), Float8E5M2(FLOAT_INF, false)}; + std::initializer_list output = {false, true, false, true, false, true}; + run_is_inf_test(20, 1, 1, input, output); } -TEST(IsInfTest, test_isinf_negative_double) { - OpTester test("IsInf", 10); - test.AddAttribute("detect_positive", 0); +TEST(IsInfTest, test_Float8E5M2_detect_positive) { + std::initializer_list input = { + Float8E5M2(-1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(FLOAT_NAN, false), Float8E5M2(FLOAT_INF, false)}; + std::initializer_list output = {false, false, false, false, false, true}; + 
run_is_inf_test(20, 1, 0, input, output); +} - std::vector input_dim{6}; - std::vector input = {-1.7, DOUBLE_NAN, DOUBLE_INF, 3.6, DOUBLE_NINF, DOUBLE_INF}; - test.AddInput("X", input_dim, input); +TEST(IsInfTest, test_Float8E5M2_detect_negative) { + std::initializer_list input = { + Float8E5M2(-1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(FLOAT_NAN, false), Float8E5M2(FLOAT_INF, false)}; + std::initializer_list output = {false, true, false, true, false, false}; + run_is_inf_test(20, 0, 1, input, output); +} - std::vector output_dim(input_dim); - test.AddOutput("Y", output_dim, {false, false, false, false, true, false}); - test.Run(); +TEST(IsInfTest, test_Float8E5M2_none) { + std::initializer_list input = { + Float8E5M2(-1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(1.0f), Float8E5M2(FLOAT_NINF, false), Float8E5M2(FLOAT_NAN, false), Float8E5M2(FLOAT_INF, false)}; + std::initializer_list output = {false, false, false, false, false, false}; + run_is_inf_test(20, 0, 0, input, output); } +TEST(IsInfTest, test_Float8E5M2FNUZ) { + std::initializer_list input = { + Float8E5M2FNUZ(-1.0f), Float8E5M2FNUZ(FLOAT_NINF, false), Float8E5M2FNUZ(1.0f), Float8E5M2FNUZ(FLOAT_NINF, false), Float8E5M2FNUZ(FLOAT_NAN, false), Float8E5M2FNUZ(FLOAT_INF, false)}; + std::initializer_list output = {false, false, false, false, false, false}; + run_is_inf_test(20, 1, 1, input, output); +} +#endif } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/isnan_test.cc b/onnxruntime/test/providers/cpu/tensor/isnan_test.cc index 0dffc452b519d..0f1e5c07cdd9b 100644 --- a/onnxruntime/test/providers/cpu/tensor/isnan_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/isnan_test.cc @@ -9,29 +9,84 @@ namespace onnxruntime { namespace test { -TEST(IsNaNOpTest, IsNaNFloat) { - OpTester test("IsNaN", 9, kOnnxDomain); - std::vector dims{2, 2}; - test.AddInput("X", dims, {1.0f, NAN, 2.0f, NAN}); - test.AddOutput("Y", dims, {false, true, false, true}); +template +void run_is_nan_test(int opset, const std::vector& dims, const std::initializer_list& input, const std::initializer_list& output) { + OpTester test("IsNaN", opset, kOnnxDomain); + test.AddInput("X", dims, input); + test.AddOutput("Y", dims, output); test.Run(); } -TEST(IsNaNOpTest, IsNaNFloat16) { - OpTester test("IsNaN", 9, kOnnxDomain); +TEST(IsNaNOpTest, IsNaNFloat9) { std::vector dims{2, 2}; - test.AddInput("X", dims, std::initializer_list({MLFloat16(1.0f), MLFloat16::NaN, MLFloat16(2.0f), MLFloat16::NaN})); - test.AddOutput("Y", dims, {false, true, false, true}); - test.Run(); + std::initializer_list input = {1.0f, NAN, 2.0f, NAN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(9, dims, input, output); } -TEST(IsNaNOpTest, IsNaNDouble) { - OpTester test("IsNaN", 9, kOnnxDomain); +TEST(IsNaNOpTest, IsNaNFloat20) { std::vector dims{2, 2}; - test.AddInput("X", dims, {1.0, NAN, 2.0, NAN}); - test.AddOutput("Y", dims, {false, true, false, true}); - test.Run(); + std::initializer_list input = {1.0f, NAN, 2.0f, NAN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaNFloat16_9) { + std::vector dims{2, 2}; + std::initializer_list input = {MLFloat16(1.0f), MLFloat16::NaN, MLFloat16(2.0f), MLFloat16::NaN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(9, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaNFloat16_20) { + std::vector 
dims{2, 2}; + std::initializer_list input = {MLFloat16(1.0f), MLFloat16::NaN, MLFloat16(2.0f), MLFloat16::NaN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaNDouble9) { + std::vector dims{2, 2}; + std::initializer_list input = {1.0, NAN, 2.0, NAN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(9, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaNDouble20) { + std::vector dims{2, 2}; + std::initializer_list input = {1.0, NAN, 2.0, NAN}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); } +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(IsNaNOpTest, IsNaNFloat8E4M3FN) { + std::vector dims{2, 2}; + std::initializer_list input = {Float8E4M3FN(1.0f), Float8E4M3FN(-NAN), Float8E4M3FN(2.0f), Float8E4M3FN(NAN)}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaN_Float8E4M3FNUZ) { + std::vector dims{2, 2}; + std::initializer_list input = {Float8E4M3FNUZ(1.0f), Float8E4M3FNUZ(-NAN), Float8E4M3FNUZ(2.0f), Float8E4M3FNUZ(-NAN)}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaNFloat8E5M2) { + std::vector dims{2, 2}; + std::initializer_list input = {Float8E5M2(1.0f), Float8E5M2(-NAN), Float8E5M2(2.0f), Float8E5M2(NAN)}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); +} + +TEST(IsNaNOpTest, IsNaN_Float8E5M2FNUZ) { + std::vector dims{2, 2}; + std::initializer_list input = {Float8E5M2FNUZ(1.0f), Float8E5M2FNUZ(-NAN), Float8E5M2FNUZ(2.0f), Float8E5M2FNUZ(NAN)}; + std::initializer_list output = {false, true, false, true}; + run_is_nan_test(20, dims, input, output); +} +#endif } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index e721ccbcb45a9..3073dde9d8e4c 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -112,12 +112,13 @@ TEST_F(QnnCPUBackendTests, MatMulOp) { } // Test MatMul broadcasting -// Note slight inaccuracy in CPU backend: +// Failed randomly on Linux +// Value of: expected_tensor.DataAsSpan() // Expected: contains 896 values, where each value and its corresponding value in 16-byte object -// <80-03 00-00 00-00 00-00 40-00 34-DD F7-01 00-00> are an almost-equal pair -// Actual: 16-byte object <80-03 00-00 00-00 00-00 40-00 23-DD F7-01 00-00>, -// where the value pair (73.68116, 73.680809) at index #80 don't match, which is -0.000350952 from 73.6812 -TEST_F(QnnCPUBackendTests, MatMulOp_Broadcast) { +// <80-03 00-00 00-00 00-00 40-B8 53-08 CC-7F 00-00> are an almost-equal pair +// Actual: 16-byte object <80-03 00-00 00-00 00-00 C0-B7 43-08 CC-7F 00-00>, where the value pair +// (-5.19657087, 0) at index #29 don't match, which is 5.19657 from -5.19657 +TEST_F(QnnCPUBackendTests, DISABLED_MatMulOp_Broadcast) { // Create two matrices with element values in the range [-10.0, 10.0]. 
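Likewise, the IsNaN tests above (including the float8 ones, since every float8 variant reserves a NaN encoding) reduce to elementwise NaN detection; a one-line NumPy reference sketch, assuming float32 inputs:

import numpy as np

def is_nan_reference(x: np.ndarray) -> np.ndarray:
    # Opset 9 and opset 20 share the same semantics; the duplicated opset-20
    # tests above exercise the newly supported input types.
    return np.isnan(x)

print(is_nan_reference(np.array([[1.0, np.nan], [2.0, np.nan]], dtype=np.float32)))
# [[False  True]
#  [False  True]]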
std::vector input_a = GetFloatDataInRange(-10.0f, 10.0f, 28 * 64); std::vector input_b = GetFloatDataInRange(-10.0f, 10.0f, 64 * 32); diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 1d954fe4370ad..d8628c4288206 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -298,6 +298,20 @@ def test_set_providers_with_options(self): self.assertEqual(option["trt_engine_cache_path"], str(engine_cache_path)) self.assertEqual(option["trt_force_sequential_engine_build"], "1") + from onnxruntime.capi import _pybind_state as C + + session_options = C.get_default_session_options() + + # TRT plugins registered as custom op domain should only be added once in session option regardless of number of session creation + sess1 = onnxrt.InferenceSession( + get_name("mul_1.onnx"), session_options, providers=["TensorrtExecutionProvider"] + ) + sess2 = onnxrt.InferenceSession( + get_name("mul_1.onnx"), session_options, providers=["TensorrtExecutionProvider"] + ) + self.assertIn("TensorrtExecutionProvider", sess1.get_providers()) + self.assertIn("TensorrtExecutionProvider", sess2.get_providers()) + # We currently disable following test code since that not all test machines/GPUs have nvidia int8 capability """ diff --git a/onnxruntime/test/python/quantization/test_op_matmul_bnb4.py b/onnxruntime/test/python/quantization/test_op_matmul_bnb4.py new file mode 100644 index 0000000000000..88432d75c653e --- /dev/null +++ b/onnxruntime/test/python/quantization/test_op_matmul_bnb4.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- + +import tempfile +import unittest +from importlib.util import find_spec +from pathlib import Path +from typing import Dict, Tuple, Union + +import numpy as np +import onnx +from onnx import TensorProto, helper +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count + +from onnxruntime.quantization import quant_utils + +quant_maps = { + 0: [ + 0.00000000, + 5.208333333e-03, + 0.66666667, + 1.00000000, + 0.33333333, + 0.50000000, + 0.16666667, + 0.25000000, + -0.00000000, + -5.208333333e-03, + -0.66666667, + -1.00000000, + -0.33333333, + -0.50000000, + -0.16666667, + -0.25000000, + ], + 1: [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, + ], +} + + +class TestOpMatMulBnb4(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="test_matmulbnb4.") + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def fill_bnb4_data(self, shape: Tuple[int, int], quant_type: int) -> np.ndarray: + rows, cols = shape + line = np.zeros(shape) + line = line.reshape(-1) + quant_map = np.array(quant_maps[quant_type], dtype=np.float32) + + v = 0 + for i in range(line.shape[0]): + line[i] = quant_map[v] + v += 1 + if v >= 16: + v = 0 + + # bnb quantization quantizes weight.T after flattening + line = line.reshape(cols, rows).transpose() + return line.reshape(shape) + + def input_feeds(self, n: int, name2shape: Dict[str, Union[int, Tuple[int, ...]]]) -> TestDataFeeds: + input_data_list = [] + for _i in range(n): + inputs = {} + for name, shape in name2shape.items(): + inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) + input_data_list.extend([inputs]) + dr = TestDataFeeds(input_data_list) + return dr + + def construct_model_matmul(self, output_model_path: str, quant_type: int) -> None: + # (input) + # | + # MatMul + # | + # (output) + input_name = "input" + output_name = "output" + initializers = [] + + def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str): + weight_data = self.fill_bnb4_data(weight_shape, quant_type).astype(np.float32) + initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) + return onnx.helper.make_node( + "MatMul", + [input_name, weight_name], + [output_name], + ) + + # for this to work (in_features * out_features) % block_size == 0 + in_features = 52 + out_features = 288 + # make MatMul node + matmul_node = make_matmul( + input_name, + [in_features, out_features], + "linear1.weight", + output_name, + ) + + # make graph + input_tensor = helper.make_tensor_value_info(input_name, TensorProto.FLOAT, [-1, in_features]) + output_tensor = helper.make_tensor_value_info(output_name, TensorProto.FLOAT, [-1, out_features]) + graph_name = "matmul_bnb4_test" + graph = helper.make_graph( + [matmul_node], + graph_name, + [input_tensor], + [output_tensor], + initializer=initializers, + ) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + model.ir_version = 7 # use stable onnx ir version + + onnx.save(model, output_model_path) + + def quant_test(self, quant_type: int, block_size: int): + model_fp32_path = 
str(Path(self._tmp_model_dir.name).joinpath(f"matmul_fp32_{quant_type}.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, quant_type) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + + model_bnb4_path = str( + Path(self._tmp_model_dir.name).joinpath(f"MatMulBnb4_{quant_type}_{block_size}.onnx").absolute() + ) + + # Quantize fp32 model to bnb4 model + from onnxruntime.quantization import matmul_bnb4_quantizer + + model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) + quant = matmul_bnb4_quantizer.MatMulBnb4Quantizer(model, quant_type, block_size) + quant.process() + quant.model.save_model_to_file(model_bnb4_path, False) + + quant_nodes = {"MatMulBnb4": 1} + check_op_type_count(self, model_bnb4_path, **quant_nodes) + + data_reader.rewind() + + try: + check_model_correctness(self, model_fp32_path, model_bnb4_path, data_reader.get_next()) + except Exception as exception: + raise exception + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't have quantize_matmul_bnb4" + ) + def test_quantize_matmul_bnb4_fp4(self): + np.random.seed(13) + self.quant_test(0, 64) + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't have quantize_matmul_bnb4" + ) + def test_quantize_matmul_bnb4_nf4(self): + np.random.seed(13) + self.quant_test(1, 64) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py new file mode 100644 index 0000000000000..9e9d05fae027d --- /dev/null +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_bnb4.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information.
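For intuition about what MatMulBnb4Quantizer produces: each weight becomes a 4-bit index into the FP4 or NF4 table in quant_maps, packed two indices per byte with the first element in the high nibble, plus one absmax scale per block. A rough NumPy sketch of the corresponding dequantization under those assumptions (the helper name is illustrative, not an ORT API):

import numpy as np

def dequantize_blockwise_bnb4(packed: np.ndarray, absmax: np.ndarray,
                              quant_map: np.ndarray, block_size: int, numel: int) -> np.ndarray:
    # Unpack two 4-bit indices per byte: high nibble first, matching the
    # packing convention used by the reference quantizer in these tests.
    indices = np.empty(packed.size * 2, dtype=np.uint8)
    indices[0::2] = packed >> 4
    indices[1::2] = packed & 0x0F
    # Look up the normalized values and drop any padding element.
    values = quant_map[indices][:numel].astype(np.float32)
    # Rescale each block by its absmax.
    for block_idx in range(absmax.size):
        start = block_idx * block_size
        values[start:start + block_size] *= absmax[block_idx]
    return values

Running this over the packed/absmax pair produced by the reference quantizer below recovers the original weights up to the 4-bit quantization error.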
+# -------------------------------------------------------------------------- + +import unittest +from importlib.util import find_spec + +import numpy as np +import numpy.typing as npt + +quant_enums = {"FP4": 0, "NF4": 1} + + +def quantize_block_fp4(block: npt.ArrayLike): + # quantize a block of float32 values to uint8 by simulating a binary search using pivots + # could have used (block[:,None] - quant_map).argmin(axis=1) but there are some mismatches due to + # floating point precision + # block: 1-D array of normalized [-1,1] float32 values, len(block) % 2 == 0 + + # pivots to find the quantization index + # only half of the pivots are needed since the other half is symmetric + pivots = np.array( + [0.00260417, 0.0859375, 0.20833333, 0.29166667, 0.4166667, 0.583333, 0.8333333, 1], dtype=np.float32 + ) + # indices are not 0,1,2,3,4,5,6,7 because it is a floating point data type + pivot_indices = np.array([0, 1, 6, 7, 4, 5, 2, 3], dtype=np.uint8) + + # signs of the block + signs = (block < 0).astype(np.uint8) * 8 + + # find the uint8 quantization index + # argmax finds the first occurrence of True + quant_indices = pivot_indices[(np.abs(block)[:, None] <= pivots).argmax(axis=1)] + signs + + return np.bitwise_or(np.left_shift(quant_indices[::2], 4), quant_indices[1::2]) + + +def quantize_block_nf4(block: npt.ArrayLike): + pivots = np.array( + [ + -0.8480964004993439, + -0.6106329262256622, + -0.4599952697753906, + -0.33967943489551544, + -0.23460740596055984, + -0.13791173323988914, + -0.045525018125772476, + 0.03979014977812767, + 0.1202552504837513, + 0.2035212516784668, + 0.2920137718319893, + 0.3893125355243683, + 0.5016634166240692, + 0.6427869200706482, + 0.8614784181118011, + 1.0, + ], + dtype=np.float32, + ) + + quant_indices = (block[:, None] <= pivots).argmax(axis=1).astype(np.uint8) + + return np.bitwise_or(np.left_shift(quant_indices[::2], 4), quant_indices[1::2]) + + +def quantize_blockwise_bnb4_ref(matrix_float: npt.ArrayLike, block_size: int, quant_type: str, target=None): + if len(matrix_float.shape) != 2: + raise ValueError("Current bnb4 block quantization only supports 2D tensors!") + + numel = matrix_float.size + num_blocks = (numel + block_size - 1) // block_size + quantized_numel = (numel + 1) // 2 + + packed = np.zeros(quantized_numel, dtype=np.uint8) + absmax = np.zeros(num_blocks, dtype=matrix_float.dtype) + + flattened_matrix_float = matrix_float.flatten() + for block_idx in range(num_blocks): + block_len = min(block_size, numel - block_idx * block_size) + block = np.float32(flattened_matrix_float[block_idx * block_size : block_idx * block_size + block_len]) + + block_absmax = np.max(np.abs(block)) + reciprocal_absmax = 1.0 / block_absmax if block_absmax != 0 else 0.0 + absmax[block_idx] = block_absmax + + if block_len % 2 != 0: + block = np.append(block, 0.0) + block_len += 1 + + block *= reciprocal_absmax + start = block_idx * block_size // 2 + end = start + block_len // 2 + if quant_type == "FP4": + packed[start:end] = quantize_block_fp4(block) + else: + packed[start:end] = quantize_block_nf4(block) + + return (packed, absmax) + + +def quantize_blockwise_bnb4_target(matrix_float: npt.ArrayLike, block_size: int, quant_type: str): + if len(matrix_float.shape) != 2: + raise ValueError("Current int4 block quantization only supports 2D tensors!") + quant_type_enum = quant_enums[quant_type] + + n, k = matrix_float.shape # already transposed + numel = n * k + num_blocks = (numel + block_size - 1) // block_size + quantized_numel = (numel + 1) // 2 + + packed = 
np.zeros(quantized_numel, dtype="uint8") + absmax = np.zeros(num_blocks, dtype=matrix_float.dtype) + from onnxruntime.capi._pybind_state import quantize_matmul_bnb4 + + quantize_matmul_bnb4(packed, matrix_float, absmax, block_size, quant_type_enum, n, k) + return (packed, absmax) + + +class TestQuantizeBlockwiseBnb4(unittest.TestCase): + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_bnb4" + ) + def test_quantize_blockwise_bnb4(self): + for quant_type in ["FP4", "NF4"]: + for k, n in [(128, 128), (32, 128), (128, 32), (52, 128), (128, 52), (73, 123)]: + for block_size in [16, 32, 64, 128]: + for type in [np.float32, np.float16]: + matrix_float = np.random.uniform(-1, 1, (k, n)).astype(type) + quant_value_ref, absmax_ref = quantize_blockwise_bnb4_ref(matrix_float, block_size, quant_type) + quant_value, absmax = quantize_blockwise_bnb4_target(matrix_float, block_size, quant_type) + assert np.allclose(quant_value_ref, quant_value) + assert np.allclose(absmax_ref, absmax) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_parity_rotary_embedding.py b/onnxruntime/test/python/transformers/test_parity_rotary_embedding.py new file mode 100644 index 0000000000000..b17ae5f69aff5 --- /dev/null +++ b/onnxruntime/test/python/transformers/test_parity_rotary_embedding.py @@ -0,0 +1,450 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + + +# Notes +# 1) The test cases in this file are for the following LLaMA-2 scenarios: +# - Microsoft rotary embeddings with interleaved = True +# - Prompt generation +# - Token generation +# - Hugging Face rotary embeddings (equal to Microsoft rotary embeddings with interleaved = False) +# - Prompt generation +# - Token generation +# +# 2) Shapes of position ids in ORT and `interleaved` for LLaMA-2 scenarios: +# - Microsoft model: When shape of position ids == (1), interleaved = True +# - Hugging Face model: When shape of position ids == (batch_size, sequence_length), interleaved = False + + +import unittest +from copy import deepcopy + +import numpy as np +import torch +import torch.nn as nn +from onnx import TensorProto, helper + +import onnxruntime as ort + + +class SampleInputConfig: + def __init__( + self, + batch_size=2, + sequence_length=8, + num_heads=4, + head_size=6, + max_sequence_length=16, + ): + self.batch_size = batch_size + self.sequence_length = sequence_length + self.num_heads = num_heads + self.head_size = head_size + self.hidden_size = self.num_heads * self.head_size + self.max_sequence_length = max_sequence_length + + +# LLaMA Hugging Face model +class LlamaHFRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device="cpu"): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
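The inverse-frequency buffer above is the standard RoPE construction: theta_i = base^(-2i/d), and the cos/sin caches are the cosine and sine of the outer product of positions with those frequencies. A small NumPy sketch of the half-width cache (the shape LlamaMSRotaryEmbedding uses; the Hugging Face class concatenates it up to the full head dimension):

import numpy as np

def build_rotary_cache(max_seq_len: int, head_dim: int, base: float = 10000.0):
    # inv_freq[i] = 1 / base^(2i / head_dim), one entry per rotated pair of lanes.
    inv_freq = 1.0 / (base ** (np.arange(0, head_dim, 2, dtype=np.float32) / head_dim))
    positions = np.arange(max_seq_len, dtype=np.float32)
    freqs = np.outer(positions, inv_freq)          # (max_seq_len, head_dim // 2)
    return np.cos(freqs), np.sin(freqs)

cos, sin = build_rotary_cache(max_seq_len=16, head_dim=6)
print(cos.shape)  # (16, 3)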
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + def get_cos_sin_cache(self, seq_len=None, device=torch.device("cpu"), dtype=torch.float32): # noqa: B008 + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=device, dtype=dtype) + + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=dtype), + ) + + def rotate_half(self, x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rope_bnsh(self, x, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + x_embed = (x * cos) + (self.rotate_half(x) * sin) + return x_embed + + def apply_rope_bsnh(self, x, cos, sin, position_ids): + # Two dimensions of cos and sin are always 1, so we can `squeeze` them. 
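The two rotation conventions described in the notes at the top of this file differ only in which lanes are paired: the Hugging Face (non-interleaved) path rotates the two contiguous halves of the head dimension, while the Microsoft (interleaved) path rotates even/odd lane pairs. A minimal NumPy sketch of both, assuming cos and sin are already half-width (head_dim // 2) and sliced to the current positions:

import numpy as np

def rotate_noninterleaved(x, cos, sin):
    # HF style: x * cos + rotate_half(x) * sin, with rotate_half(x) = [-x2, x1].
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1)

def rotate_interleaved(x, cos, sin):
    # MSFT style: pair lane 2i with lane 2i+1 instead of the two halves.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    out = np.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

x = np.arange(6, dtype=np.float32).reshape(1, 1, 6)            # one token, one head
cos, sin = np.cos([[0.5, 0.25, 0.125]]), np.sin([[0.5, 0.25, 0.125]])
print(rotate_noninterleaved(x, cos, sin).shape)                # (1, 1, 6)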
+ cos = cos.squeeze() # [seq_len, dim] + sin = sin.squeeze() # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + x_embed = (x * cos) + (self.rotate_half(x) * sin) + return x_embed + + def forward(self, x, cos, sin, pos_ids, x_format="bnsh"): + if x_format == "bnsh": + return self.apply_rope_bnsh(x, cos, sin, pos_ids) + return self.apply_rope_bsnh(x, cos, sin, pos_ids) + + +# LLaMA Microsoft model +class LlamaMSRotaryEmbedding(nn.Module): + def __init__(self, hidden_size, num_heads, max_sequence_length): + super().__init__() + + self.hidden_size = hidden_size + self.num_heads = num_heads + self.max_sequence_length = max_sequence_length + + def get_cos_sin_cache(self, theta=10000.0, head_scale=1.0, device="cpu", dtype=torch.float32): + hidden_size = self.hidden_size + n_heads = self.num_heads + max_seq_len = self.max_sequence_length + + # Precalculate rotary matrices for the sequence + # According to "Attention Is All You Need", theta_i = 10000 ^ (2 * (i - 1)/dim), i in [1, 2, ..., dim//2] + head_dim = head_scale * hidden_size / n_heads + + pos = torch.arange(0, 2 * (head_dim // 2), step=2, device=device, dtype=dtype) + freqs = 1.0 / (theta ** (pos / head_dim)) + + idx = torch.arange(max_seq_len, device=freqs.device) + freqs = torch.outer(idx, freqs) + + cos = torch.reshape(torch.cos(freqs), [1, max_seq_len, 1, -1]) + sin = torch.reshape(torch.sin(freqs), [1, max_seq_len, 1, -1]) + dtype = torch.get_default_dtype() + + return cos.to(dtype), sin.to(dtype) + + def rotate_tensor( + self, + x: torch.Tensor, # BxSxNxH + cos: torch.Tensor, # 1xSx1x(H/2) + sin: torch.Tensor, # 1xSx1x(H/2) + pos: int, + interleaved: bool, + ): + # Dimension of x is [batch_size, seq_len, n_heads, head_dim] + rot_dim = 2 * cos.shape[3] + + # Dolly requires partial rotation + x_rot = x[:, :, :, :rot_dim] + + if interleaved: + x1 = x_rot[:, :, :, 0::2] + x2 = x_rot[:, :, :, 1::2] + else: + half = x_rot.shape[-1] // 2 + x1 = x[:, :, :, 0:half] + x2 = x[:, :, :, half : 2 * half] + + seq_len = x.shape[1] + cos_x = cos[:, pos : pos + seq_len, :, :] + sin_x = sin[:, pos : pos + seq_len, :, :] + + # cos_x: (1, S, 1, H/2) + # sin_x: (1, S, 1, H/2) + # x1: (B, S, N, H/2) + # x2: (B, S, N, H/2) + real = cos_x * x1 - sin_x * x2 + imag = sin_x * x1 + cos_x * x2 + + if interleaved: + x_rot[:, :, :, 0::2] = real + x_rot[:, :, :, 1::2] = imag + else: + x_rot = torch.cat((real, imag), dim=-1) + + return torch.cat((x_rot, x[:, :, :, rot_dim:]), dim=-1) + + def forward(self, x, cos, sin, pos, interleaved): + return self.rotate_tensor(x, cos, sin, pos, interleaved) + + +class TestLlamaRotaryEmbedding(unittest.TestCase): + def setUp(self): + self.config = SampleInputConfig() + self.llama_hf = LlamaHFRotaryEmbedding(self.config.head_size, self.config.max_sequence_length) + self.llama_ms = LlamaMSRotaryEmbedding( + self.config.hidden_size, self.config.num_heads, self.config.max_sequence_length + ) + + seed = 2 + np.random.seed(seed) + torch.manual_seed(seed) + torch.set_printoptions(sci_mode=False) + + def create_onnx_graph(self, x_shape, pos_shape, cos, sin, interleaved): + inputs = [ + helper.make_tensor_value_info( + name="input", + elem_type=TensorProto.FLOAT, + shape=list(x_shape), + ), + helper.make_tensor_value_info( + name="position_ids", + elem_type=TensorProto.INT64, + shape=list(pos_shape), + ), + ] + outputs = [ + helper.make_tensor_value_info( + name="output", + elem_type=TensorProto.FLOAT, + shape=list(x_shape), + ), + ] + + 
initializers = [ + helper.make_tensor( + name="cos_cache", + data_type=TensorProto.FLOAT, + dims=list(torch.squeeze(cos).shape), + vals=cos.flatten().tolist(), + ), + helper.make_tensor( + name="sin_cache", + data_type=TensorProto.FLOAT, + dims=list(torch.squeeze(sin).shape), + vals=sin.flatten().tolist(), + ), + ] + nodes = [ + helper.make_node( + op_type="RotaryEmbedding", + inputs=["input", "position_ids", "cos_cache", "sin_cache"], + outputs=["output"], + interleaved=interleaved, + name="RotaryEmbedding_0", + domain="com.microsoft", + ), + ] + + graph = helper.make_graph( + nodes=nodes, + name="RotaryEmbedding_Graph", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + opset_import = helper.make_opsetid(domain="com.microsoft", version=1) + model = helper.make_model(graph, opset_imports=[opset_import]) + return model.SerializeToString() + + def get_eps(self): + eps = ["CPUExecutionProvider", "CUDAExecutionProvider"] + return list(filter(lambda ep: ep in ort.get_available_providers(), eps)) + + def run_ort_ep_tests(self, onnx_graph, inputs_ort, expected_output_bsnh): + eps = self.get_eps() + for ep in eps: + sess = ort.InferenceSession(onnx_graph, providers=[ep]) + output_ort = sess.run(None, inputs_ort)[0] + output_ort = output_ort.reshape( + (self.config.batch_size, inputs_ort["input"].shape[1], self.config.num_heads, self.config.head_size) + ) + + # Compare outputs as BxSxNxH + self.assertTrue(np.allclose(expected_output_bsnh, output_ort)) + + # apply_rope(x_bnsh) == apply_rope(x_bsnh).transpose(1,2) + def test_hf_bnsh_and_hf_bsnh(self): + x_bnsh = torch.randn( + self.config.batch_size, self.config.num_heads, self.config.sequence_length, self.config.head_size + ) + cos_hf, sin_hf = self.llama_hf.get_cos_sin_cache(self.config.sequence_length) + pos_hf = torch.stack([torch.arange(0, self.config.sequence_length) for _ in range(self.config.batch_size)]) + + x_bnsh_after_rope = self.llama_hf(x_bnsh, cos_hf, sin_hf, pos_hf) # output is BxNxSxH + x_bsnh_after_rope = self.llama_hf( + x_bnsh.transpose(1, 2), cos_hf.transpose(1, 2), sin_hf.transpose(1, 2), pos_hf, "bsnh" + ) # output is BxSxNxH + + self.assertTrue(torch.allclose(x_bnsh_after_rope, x_bsnh_after_rope.transpose(1, 2))) + + # HF rotary == MSFT rotary non-interleaved + def test_hf_rotary_and_msft_rotary_noninterleaved(self): + x_bnsh = torch.randn( + self.config.batch_size, self.config.num_heads, self.config.sequence_length, self.config.head_size + ) + cos_hf, sin_hf = self.llama_hf.get_cos_sin_cache(self.config.sequence_length) + pos_hf = torch.stack([torch.arange(0, self.config.sequence_length) for _ in range(self.config.batch_size)]) + output_hf = self.llama_hf(x_bnsh, cos_hf, sin_hf, pos_hf) # output is BxNxSxH + + x_bsnh = x_bnsh.transpose(1, 2) + x_bsd = deepcopy(x_bsnh) # deepcopy to avoid changes made by self.llama_ms forward pass + cos_ms, sin_ms = self.llama_ms.get_cos_sin_cache() + pos_ms = 0 + output_ms = ( + self.llama_ms(x_bsd, cos_ms, sin_ms, pos_ms, interleaved=False).detach().cpu().numpy() # output is BxSxNxH + ) + + # Compare caches as Mx(H/2) + self.assertTrue( + torch.allclose(self.llama_hf.cos_cached.squeeze()[:, : (self.config.head_size // 2)], cos_ms.squeeze()) + ) + self.assertTrue( + torch.allclose(self.llama_hf.sin_cached.squeeze()[:, : (self.config.head_size // 2)], sin_ms.squeeze()) + ) + + # Compare outputs as BxSxNxH + self.assertTrue(np.allclose(output_hf.transpose(1, 2).detach().cpu().numpy(), output_ms)) + + # Prompt step, interleaved = true, pos ids shape = (1) + def 
test_msft_prompt_rotary_interleaved(self): + # Calculated this way to match the data in rotary_embedding_op_test.cc + x_bnsh = torch.randn( + self.config.batch_size, self.config.num_heads, self.config.sequence_length, self.config.head_size + ) + x_bsnh = x_bnsh.transpose(1, 2) + x_bsd = deepcopy(x_bsnh) # deepcopy to avoid changes made by self.llama_ms forward pass + cos_ms, sin_ms = self.llama_ms.get_cos_sin_cache() + pos_ms = 0 + output_ms = self.llama_ms(deepcopy(x_bsnh), cos_ms, sin_ms, pos_ms, interleaved=True).detach().cpu().numpy() + + x_bsd = x_bsd.reshape(self.config.batch_size, self.config.sequence_length, self.config.hidden_size) + pos_ms = torch.tensor([pos_ms]) + onnx_graph = self.create_onnx_graph(x_bsd.shape, pos_ms.shape, cos_ms, sin_ms, interleaved=True) + inputs_ort = { + "input": x_bsd.detach().cpu().numpy(), + "position_ids": pos_ms.detach().cpu().numpy(), + } + + # Compare inputs/outputs as BxSxNxH + self.assertTrue(np.allclose(x_bsnh.flatten(), x_bsd.flatten())) + self.run_ort_ep_tests(onnx_graph, inputs_ort, output_ms) + + # Token generation step, interleaved = true, pos ids shape = (1) + def test_msft_token_rotary_interleaved(self): + # Calculated this way to match the data in rotary_embedding_op_test.cc + x_bnsh = torch.randn( + self.config.batch_size, self.config.num_heads, self.config.sequence_length, self.config.head_size + ) + x_bsnh = x_bnsh.transpose(1, 2) + x_bsd = deepcopy(x_bsnh) # deepcopy to avoid changes made by self.llama_ms forward pass + cos_ms, sin_ms = self.llama_ms.get_cos_sin_cache() + pos_ms = 2 + output_ms = self.llama_ms(deepcopy(x_bsnh), cos_ms, sin_ms, pos_ms, interleaved=True).detach().cpu().numpy() + + x_bsd = x_bsd.reshape(self.config.batch_size, self.config.sequence_length, self.config.hidden_size) + pos_ms = torch.tensor([pos_ms]) + onnx_graph = self.create_onnx_graph(x_bsd.shape, pos_ms.shape, cos_ms, sin_ms, interleaved=True) + inputs_ort = { + "input": x_bsd.detach().cpu().numpy(), + "position_ids": pos_ms.detach().cpu().numpy(), + } + + # Compare inputs/outputs as BxSxNxH + self.assertTrue(np.allclose(x_bsnh.flatten(), x_bsd.flatten())) + self.run_ort_ep_tests(onnx_graph, inputs_ort, output_ms) + + # Prompt step, interleaved = false, pos ids shape = (batch_size, sequence_length) + def test_hf_prompt_rotary_batched_pos_ids(self): + x_bnsh = torch.randn( + self.config.batch_size, self.config.num_heads, self.config.sequence_length, self.config.head_size + ) + cos_hf, sin_hf = self.llama_hf.get_cos_sin_cache(self.config.sequence_length) + pos_ids = torch.stack([torch.arange(0, self.config.sequence_length) for _ in range(self.config.batch_size)]) + output_hf = self.llama_hf(x_bnsh, cos_hf, sin_hf, pos_ids) # output is BxNxSxH + + x_bsnh = x_bnsh.transpose(1, 2) + x_bsd = x_bsnh.reshape(self.config.batch_size, self.config.sequence_length, self.config.hidden_size) + cos_ms, sin_ms = self.llama_ms.get_cos_sin_cache() + onnx_graph = self.create_onnx_graph(x_bsd.shape, pos_ids.shape, cos_ms, sin_ms, interleaved=False) + inputs_ort = { + "input": x_bsd.detach().cpu().numpy(), + "position_ids": pos_ids.detach().cpu().numpy(), + } + + self.run_ort_ep_tests(onnx_graph, inputs_ort, output_hf.transpose(1, 2).detach().cpu().numpy()) + + # Token generation step, interleaved = false, pos ids shape = (batch_size, sequence_length) + def test_hf_token_rotary_batched_pos_ids(self): + x_bnsh = torch.randn(self.config.batch_size, self.config.num_heads, 1, self.config.head_size) + cos_hf, sin_hf = self.llama_hf.get_cos_sin_cache(self.config.sequence_length) 
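These cases juggle three layouts: BxNxSxH (the Hugging Face attention layout), BxSxNxH (what the Microsoft rotation code consumes), and BxSxD with D = N*H (the 3-D input fed to the ONNX graph). A quick PyTorch sanity check of the reshapes the tests rely on, using the same dimensions as SampleInputConfig:

import torch

B, S, N, H = 2, 8, 4, 6
x_bnsh = torch.randn(B, N, S, H)
x_bsnh = x_bnsh.transpose(1, 2)              # BxSxNxH
x_bsd = x_bsnh.reshape(B, S, N * H)          # BxSxD, passed as "input" to the graph
# Flattening the heads and restoring them is lossless, which is why the tests
# can compare the op output (returned as BxSxD) against BxSxNxH references.
assert torch.equal(x_bsd.reshape(B, S, N, H), x_bsnh)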
+ pos_ids = torch.stack([torch.tensor([2]) for _ in range(self.config.batch_size)]) + output_hf = self.llama_hf(x_bnsh, cos_hf, sin_hf, pos_ids) # output is BxNxSxH + + x_bsnh = x_bnsh.transpose(1, 2) + x_bsd = x_bsnh.reshape(self.config.batch_size, 1, self.config.hidden_size) + cos_ms, sin_ms = self.llama_ms.get_cos_sin_cache() + onnx_graph = self.create_onnx_graph(x_bsd.shape, pos_ids.shape, cos_ms, sin_ms, interleaved=False) + inputs_ort = { + "input": x_bsd.detach().cpu().numpy(), + "position_ids": pos_ids.detach().cpu().numpy(), + } + + # Compare outputs as BxSxNxH + self.run_ort_ep_tests(onnx_graph, inputs_ort, output_hf.transpose(1, 2).detach().cpu().numpy()) + + # Bonus test: Prompt step, interleaved = false, pos ids shape = (1) + def test_hf_prompt_rotary_one_pos_id(self): + x_bnsh = torch.randn( + self.config.batch_size, self.config.num_heads, self.config.sequence_length, self.config.head_size + ) + cos_hf, sin_hf = self.llama_hf.get_cos_sin_cache(self.config.sequence_length) + pos_hf = torch.stack([torch.arange(0, self.config.sequence_length) for _ in range(self.config.batch_size)]) + output_hf = self.llama_hf(x_bnsh, cos_hf, sin_hf, pos_hf) # output is BxNxSxH + + x_bsnh = x_bnsh.transpose(1, 2) + x_bsd = x_bsnh.reshape(self.config.batch_size, self.config.sequence_length, self.config.hidden_size) + cos_ms, sin_ms = self.llama_ms.get_cos_sin_cache() + pos_ms = torch.tensor([0]) + onnx_graph = self.create_onnx_graph(x_bsd.shape, pos_ms.shape, cos_ms, sin_ms, interleaved=False) + inputs_ort = { + "input": x_bsd.detach().cpu().numpy(), + "position_ids": pos_ms.detach().cpu().numpy(), + } + + # Compare outputs as BxSxNxH + self.run_ort_ep_tests(onnx_graph, inputs_ort, output_hf.transpose(1, 2).detach().cpu().numpy()) + + # Bonus test: Token generation step, interleaved = false, pos ids shape = (1) + def test_hf_token_rotary_one_pos_id(self): + x_bnsh = torch.randn(self.config.batch_size, self.config.num_heads, 1, self.config.head_size) + cos_hf, sin_hf = self.llama_hf.get_cos_sin_cache(self.config.sequence_length) + pos_ids = torch.stack([torch.tensor([2]) for _ in range(self.config.batch_size)]) + output_hf = self.llama_hf(x_bnsh, cos_hf, sin_hf, pos_ids) # output is BxNxSxH + + x_bsnh = x_bnsh.transpose(1, 2) + x_bsd = x_bsnh.reshape(self.config.batch_size, 1, self.config.hidden_size) + cos_ms, sin_ms = self.llama_ms.get_cos_sin_cache() + pos_ms = torch.tensor([2]) + onnx_graph = self.create_onnx_graph(x_bsd.shape, pos_ms.shape, cos_ms, sin_ms, interleaved=False) + inputs_ort = { + "input": x_bsd.detach().cpu().numpy(), + "position_ids": pos_ms.detach().cpu().numpy(), + } + + # Compare outputs as BxSxNxH + self.run_ort_ep_tests(onnx_graph, inputs_ort, output_hf.transpose(1, 2).detach().cpu().numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_rotary_embedding_fusion.py b/onnxruntime/test/python/transformers/test_rotary_embedding_fusion.py new file mode 100644 index 0000000000000..7bca48c29019e --- /dev/null +++ b/onnxruntime/test/python/transformers/test_rotary_embedding_fusion.py @@ -0,0 +1,447 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
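One detail worth noting from the parity tests above: the Hugging Face cache stores cos/sin at the full head dimension by duplicating the half-width table, while the Microsoft cache (and the cos_cache/sin_cache initializers fed to the contrib op) keep only the half-width table. That is why test_hf_rotary_and_msft_rotary_noninterleaved compares only the first head_size // 2 columns; a tiny NumPy illustration:

import numpy as np

half = np.random.rand(16, 3).astype(np.float32)   # MSFT-style cache: (max_seq_len, head_dim // 2)
full = np.concatenate([half, half], axis=-1)      # HF-style cache: (max_seq_len, head_dim)
# cos(concat(freqs, freqs)) == concat(cos(freqs), cos(freqs)), so the first
# half of the HF cache is exactly the MSFT cache.
assert np.allclose(full[:, :3], half)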
+# -------------------------------------------------------------------------- + +import os +import sys +import unittest +from typing import List + +import numpy as np +import onnx +from onnx import TensorProto, helper +from parity_utilities import find_transformers_source + +if find_transformers_source(): + from fusion_options import FusionOptions + from onnx_model import OnnxModel + from optimizer import optimize_model +else: + from onnxruntime.transformers.fusion_options import FusionOptions + from onnxruntime.transformers.onnx_model import OnnxModel + from onnxruntime.transformers.optimizer import optimize_model + + +def float_tensor(name: str, shape: List[int], random=False): + low = 0.0 + high = 1.0 + total_elements = 1 + for x in shape: + total_elements *= x + weights = [np.random.uniform(low, high) for _ in range(total_elements)] if random else [1.0] * total_elements + return helper.make_tensor(name, TensorProto.FLOAT, shape, weights) + + +class TestRotaryEmbeddingFusion(unittest.TestCase): + def setUp(self): + self.batch_size = 2 + self.sequence_length = 8 + self.num_heads = 4 + self.head_size = 6 + self.hidden_size = self.num_heads * self.head_size + + self.past_sequence_length = 2 + self.max_sequence_length = 12 + + def verify_fusion(self, expected_model_path, original_model_path): + expected_model = OnnxModel(onnx.load(expected_model_path)) + expected_model.topological_sort(is_deterministic=True) + + options = FusionOptions("gpt2") + optimized_model = optimize_model(original_model_path, optimization_options=options, opt_level=0) + optimized_model.topological_sort(is_deterministic=True) + + self.assertTrue(str(expected_model.model.graph), str(optimized_model.model.graph)) + + def create_initializers(self): + initializers = [ + float_tensor("cos_cache", [self.max_sequence_length, self.head_size]), + float_tensor("sin_cache", [self.max_sequence_length, self.head_size]), + helper.make_tensor( + "pos_ids_new_shape", + TensorProto.FLOAT, + [2], + np.array([self.batch_size, self.sequence_length], dtype=np.int64), + ), + helper.make_tensor("zero", TensorProto.FLOAT, [1], np.array([0], dtype=np.int64)), + helper.make_tensor("one", TensorProto.FLOAT, [1], np.array([1], dtype=np.int64)), + helper.make_tensor("two", TensorProto.FLOAT, [1], np.array([2], dtype=np.int64)), + helper.make_tensor("three", TensorProto.FLOAT, [1], np.array([3], dtype=np.int64)), + helper.make_tensor("int_max", TensorProto.FLOAT, [1], np.array([sys.maxsize], dtype=np.int64)), + ] + return initializers + + def create_inputs_and_outputs(self, model_type: str = ""): + inputs = [ + helper.make_tensor_value_info( + "input_0", + TensorProto.FLOAT, + [self.batch_size, self.sequence_length, self.num_heads, self.head_size], + ), + helper.make_tensor_value_info("position_ids", TensorProto.INT64, [self.batch_size, self.sequence_length]), + ] + if model_type in {"past", "merged"}: + # Input will be removed in fused model since it's not used in RotaryEmbedding. + # We create this input so that we can check the `past_seq_len` path during + # RotaryEmbedding fusion. 
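verify_fusion above drives the fusion through the public transformers optimizer. For context, a sketch of that entry point outside the test harness, with a hypothetical model path; opt_level=0 keeps ORT's own graph optimizations out of the way so only the Python fusion passes run:

from onnxruntime.transformers.fusion_options import FusionOptions
from onnxruntime.transformers.optimizer import optimize_model

options = FusionOptions("gpt2")
optimized = optimize_model(
    "decoder_model.onnx",            # hypothetical path to an exported decoder
    model_type="gpt2",
    optimization_options=options,
    opt_level=0,
)
# After fusion, the rotary subgraph should appear as a single
# com.microsoft RotaryEmbedding node.
optimized.save_model_to_file("decoder_model_opt.onnx")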
+ inputs.append( + helper.make_tensor_value_info( + "past_key", + TensorProto.FLOAT, + [self.batch_size, self.num_heads, self.past_sequence_length, self.head_size], + ) + ) + # Dummy input to test nodes for `curr_seq_len` path + if model_type != "": + inputs.append( + helper.make_tensor_value_info( + "curr_key", + TensorProto.FLOAT, + [self.batch_size, self.sequence_length, self.num_heads, self.head_size], + ) + ) + outputs = [ + helper.make_tensor_value_info( + "output_0", + TensorProto.FLOAT, + [self.batch_size, self.num_heads, self.sequence_length, self.head_size], + ) + ] + if model_type in {"merged"}: + # Dummy output to test that nodes for `past_seq_len` path are not removed for merged model + outputs.append(helper.make_tensor_value_info("past_seq_len_plus_zero", TensorProto.FLOAT, [1])) + return inputs, outputs + + def create_fused_model(self, interleaved: bool, initializers: List[TensorProto]): + inputs, outputs = self.create_inputs_and_outputs() + + rope_node = helper.make_node( + "RotaryEmbedding", + inputs=[inputs[0].name, inputs[1].name, initializers[0].name, initializers[1].name], + outputs=[outputs[0].name], + name="RotaryEmbedding_0", + interleaved=int(interleaved), + ) + + graph = helper.make_graph( + nodes=[rope_node], + name="RotaryEmbedding_Graph", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + opset_import = helper.make_opsetid(domain="com.microsoft", version=1) + model = helper.make_model(graph, opset_imports=[opset_import]) + return model + + def create_cache_path(self, model_type: str, use_redundant_squeeze_ops: bool): + # Create position ids path + reshape_node = helper.make_node( + "Reshape", + inputs=["position_ids", "pos_ids_new_shape"], + outputs=["pos_ids_reshaped"], + name="Reshape_0", + ) + pos_ids_nodes = [reshape_node] + + # Create cos path + cos_init_unsqueeze_node = helper.make_node( + "Unsqueeze", + inputs=["new_seq_len", "zero"], + outputs=["cos_unsqueeze"], + name="Unsqueeze_2", + ) + cos_slice_node = helper.make_node( + "Slice", + inputs=["cos_cache", "zero", "cos_unsqueeze", "two", "one"], + outputs=["cos_sliced"], + name="Slice_2", + ) + cos_nodes = [cos_init_unsqueeze_node, cos_slice_node] + + if use_redundant_squeeze_ops: + # These two nodes are eliminated by this transformers PR: https://github.com/huggingface/transformers/pull/26162 + cos_squeeze_1_node = helper.make_node( + "Squeeze", + inputs=["cos_sliced", "zero"], + outputs=["cos_squeeze_1"], + name="Squeeze_0", + ) + cos_squeeze_2_node = helper.make_node( + "Squeeze", + inputs=["cos_squeeze_1", "zero"], + outputs=["cos_squeeze_2"], + name="Squeeze_1", + ) + cos_nodes.extend([cos_squeeze_1_node, cos_squeeze_2_node]) + + cos_gather_node = helper.make_node( + "Gather", + inputs=["cos_squeeze_2" if use_redundant_squeeze_ops else "cos_sliced", "pos_ids_reshaped"], + outputs=["cos_indexed"], + name="Gather_1", + ) + cos_end_unsqueeze_node = helper.make_node( + "Unsqueeze", + inputs=["cos_indexed", "one"], + outputs=["cos"], + name="Unsqueeze_3", + ) + cos_nodes.extend([cos_gather_node, cos_end_unsqueeze_node]) + + # Create sin path + sin_init_unsqueeze_node = helper.make_node( + "Unsqueeze", + inputs=["new_seq_len", "zero"], + outputs=["sin_unsqueeze"], + name="Unsqueeze_4", + ) + sin_slice_node = helper.make_node( + "Slice", + inputs=["sin_cache", "zero", "sin_unsqueeze", "two", "one"], + outputs=["sin_sliced"], + name="Slice_3", + ) + sin_nodes = [sin_init_unsqueeze_node, sin_slice_node] + + if use_redundant_squeeze_ops: + sin_squeeze_1_node = helper.make_node( + 
"Squeeze", + inputs=["sin_sliced", "zero"], + outputs=["sin_squeeze_1"], + name="Squeeze_2", + ) + sin_squeeze_2_node = helper.make_node( + "Squeeze", + inputs=["sin_squeeze_1", "zero"], + outputs=["sin_squeeze_2"], + name="Squeeze_3", + ) + sin_nodes.extend([sin_squeeze_1_node, sin_squeeze_2_node]) + + sin_gather_node = helper.make_node( + "Gather", + inputs=["sin_squeeze_2" if use_redundant_squeeze_ops else "sin_sliced", "pos_ids_reshaped"], + outputs=["sin_indexed"], + name="Gather_2", + ) + sin_end_unsqueeze_node = helper.make_node( + "Unsqueeze", + inputs=["sin_indexed", "one"], + outputs=["sin"], + name="Unsqueeze_5", + ) + sin_nodes.extend([sin_gather_node, sin_end_unsqueeze_node]) + + # Create beginning nodes before cos and sin paths + + # Create curr seq len path + curr_transpose_node = helper.make_node( + "Transpose", + inputs=["curr_key"], + outputs=["curr_key_transposed"], + name="Transpose_curr", + perm=[0, 2, 1, 3], + ) + curr_shape_node = helper.make_node( + "Shape", + inputs=["curr_key_transposed"], + outputs=["curr_shape"], + name="Shape_curr", + ) + curr_gather_node = helper.make_node( + "Gather", + inputs=["curr_shape", "two"], + outputs=["curr_seq_len" if model_type in {"past", "merged"} else "new_seq_len"], + name="Gather_curr", + ) + beginning_nodes = [curr_transpose_node, curr_shape_node, curr_gather_node] + + if model_type in {"past", "merged"}: + # Create past seq len path + past_shape_node = helper.make_node( + "Shape", + inputs=["past_key"], + outputs=["past_shape"], + name="Shape_past", + ) + past_gather_node = helper.make_node( + "Gather", + inputs=["past_shape", "two"], + outputs=["past_seq_len"], + name="Gather_past", + ) + add_node = helper.make_node( + "Add", + inputs=["curr_seq_len", "past_seq_len"], + outputs=["new_seq_len"], + name="Add_1", + ) + beginning_nodes.extend([past_shape_node, past_gather_node, add_node]) + + if model_type == "merged": + dummy_node = helper.make_node( + "Add", + inputs=["past_seq_len", "zero"], + outputs=["past_seq_len_plus_zero"], + name="Add_dummy_node", + ) + beginning_nodes.append(dummy_node) + + return pos_ids_nodes + cos_nodes + sin_nodes + beginning_nodes + + def create_apply_rope_path(self): + start_node = helper.make_node( + "Transpose", + inputs=["input_0"], + outputs=["x"], + name="Transpose_0", + perm=[0, 2, 1, 3], + ) + + # Calculate x_half_shape + shape_node = helper.make_node( + "Shape", + inputs=["x"], + outputs=["x_shape"], + name="Shape_0", + ) + gather_node = helper.make_node( + "Gather", + inputs=["x_shape", "three"], + outputs=["x_last_idx_shape"], + name="Gather_0", + axis=0, + ) + div_node = helper.make_node( + "Div", + inputs=["x_last_idx_shape", "two"], + outputs=["x_half_shape"], + name="Div_0", + ) + unsqueeze_0_node = helper.make_node( + "Unsqueeze", + inputs=["x_half_shape", "zero"], + outputs=["x_half_shape_0"], + name="Unsqueeze_0", + ) + unsqueeze_1_node = helper.make_node( + "Unsqueeze", + inputs=["x_half_shape", "zero"], + outputs=["x_half_shape_1"], + name="Unsqueeze_1", + ) + x_half_shape_nodes = [shape_node, gather_node, div_node, unsqueeze_0_node, unsqueeze_1_node] + + # Calculate rotate_half + x1_node = helper.make_node( + "Slice", + inputs=["x", "zero", "x_half_shape_0", "three", "one"], + outputs=["x1"], + name="Slice_0", + ) + x2_node = helper.make_node( + "Slice", + inputs=["x", "x_half_shape_1", "int_max", "three", "one"], + outputs=["x2"], + name="Slice_1", + ) + neg_node = helper.make_node( + "Neg", + inputs=["x2"], + outputs=["x2_neg"], + name="Neg_0", + ) + x_rotate_half_node = 
helper.make_node( + "Concat", + inputs=["x2_neg", "x1"], + outputs=["x_rotate_half"], + name="Concat_0", + axis=-1, + ) + rotate_half_nodes = [x1_node, x2_node, neg_node, x_rotate_half_node] + + # Calculate x_embed + x_cos_node = helper.make_node( + "Mul", + inputs=["x", "cos"], + outputs=["x_cos"], + name="Mul_0", + ) + x_sin_node = helper.make_node( + "Mul", + inputs=["x_rotate_half", "sin"], + outputs=["x_rotate_half_sin"], + name="Mul_1", + ) + end_node = helper.make_node( + "Add", + inputs=["x_cos", "x_rotate_half_sin"], + outputs=["output_0"], + name="Add_0", + ) + x_embed_nodes = [start_node, x_cos_node, x_sin_node, end_node] + + return x_half_shape_nodes + rotate_half_nodes + x_embed_nodes + + def create_test_model(self, model_type: str, use_redundant_squeeze_ops: bool, initializers: List[TensorProto]): + apply_rope_nodes = self.create_apply_rope_path() + cache_nodes = self.create_cache_path(model_type, use_redundant_squeeze_ops) + inputs, outputs = self.create_inputs_and_outputs(model_type) + + graph = helper.make_graph( + nodes=apply_rope_nodes + cache_nodes, + name="RotaryEmbedding_Graph", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + opset_import = helper.make_opsetid(domain="ai.onnx", version=13) + model = helper.make_model(graph, opset_imports=[opset_import]) + return model + + def check_models(self, interleaved: bool, model_type: str): + initializers = self.create_initializers() + + expected_model_filename = "expected_model.onnx" + expected_model = self.create_fused_model(interleaved, initializers) + onnx.save(expected_model, expected_model_filename) + + original_model_filename = "original_model.onnx" + use_redundant_squeeze_ops = True + original_model = self.create_test_model(model_type, use_redundant_squeeze_ops, initializers) + onnx.save(original_model, original_model_filename) + + self.verify_fusion(expected_model_filename, original_model_filename) + os.remove(original_model_filename) + + use_redundant_squeeze_ops = False + original_model = self.create_test_model(model_type, use_redundant_squeeze_ops, initializers) + onnx.save(original_model, original_model_filename) + + self.verify_fusion(expected_model_filename, original_model_filename) + os.remove(expected_model_filename) + os.remove(original_model_filename) + + # Hugging Face's `decoder_model.onnx` + def test_hf_decoder_model(self): + interleaved = False # HF model does not use interleaving + model_type = "no_past" + self.check_models(interleaved, model_type) + + # Hugging Face's `decoder_with_past_model.onnx` + def test_hf_decoder_with_past_model(self): + interleaved = False # HF model does not use interleaving + model_type = "past" + self.check_models(interleaved, model_type) + + # Hugging Face's `decoder_merged.onnx` + def test_hf_decoder_merged_model(self): + interleaved = False # HF model does not use interleaving + model_type = "merged" + self.check_models(interleaved, model_type) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_rotary_mha_fusion.py b/onnxruntime/test/python/transformers/test_rotary_mha_fusion.py new file mode 100644 index 0000000000000..fedba2a25dfc2 --- /dev/null +++ b/onnxruntime/test/python/transformers/test_rotary_mha_fusion.py @@ -0,0 +1,795 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
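A note on running the fusion and parity tests added in this diff: they live under onnxruntime/test/python/transformers and depend on torch, onnx, and the in-tree helpers (parity_utilities), so a typical local invocation from the repo root looks like the sketch below; the path and pattern are taken from the diff headers and may need adjusting to your checkout:

import unittest

# Discover just the rotary-related tests added above.
suite = unittest.defaultTestLoader.discover(
    start_dir="onnxruntime/test/python/transformers",
    pattern="test_*rotary*.py",
)
unittest.TextTestRunner(verbosity=2).run(suite)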
+# -------------------------------------------------------------------------- + +import os +import sys +import unittest +from typing import List + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper +from parity_utilities import find_transformers_source + +if find_transformers_source(): + from fusion_options import FusionOptions + from onnx_model import OnnxModel + from optimizer import optimize_model +else: + from onnxruntime.transformers.fusion_options import FusionOptions + from onnxruntime.transformers.onnx_model import OnnxModel + from onnxruntime.transformers.optimizer import optimize_model + + +def float_tensor(name: str, shape: List[int], random=False): + low = 0.0 + high = 1.0 + total_elements = 1 + for x in shape: + total_elements *= x + weights = [np.random.uniform(low, high) for _ in range(total_elements)] if random else [1.0] * total_elements + return helper.make_tensor(name, TensorProto.FLOAT, shape, weights) + + +class TestRotaryAttentionFusion(unittest.TestCase): + def setUp(self): + self.batch_size = 2 + self.sequence_length = 8 + self.num_heads = 4 + self.head_size = 6 + self.hidden_size = self.num_heads * self.head_size + + self.past_sequence_length = 2 + self.max_sequence_length = 12 + + def verify_fusion(self, expected_model_path, original_model_path): + expected_model = OnnxModel(onnx.load(expected_model_path)) + expected_model.topological_sort(is_deterministic=True) + + model_type = "gpt2" + options = FusionOptions(model_type) + optimized_model = optimize_model( + original_model_path, + model_type, + self.num_heads, + self.hidden_size, + optimization_options=options, + opt_level=0, + ) + optimized_model.topological_sort(is_deterministic=True) + + self.assertTrue(str(expected_model.model.graph), str(optimized_model.model.graph)) + + def create_initializers(self, fused_model: bool = False): + initializers = [ + float_tensor("cos_cache", [self.max_sequence_length, self.head_size // 2]), + float_tensor("sin_cache", [self.max_sequence_length, self.head_size // 2]), + float_tensor("q_weight", [self.hidden_size, self.hidden_size]), + float_tensor("k_weight", [self.hidden_size, self.hidden_size]), + float_tensor("v_weight", [self.hidden_size, self.hidden_size]), + float_tensor("o_weight", [self.hidden_size, self.hidden_size]), + helper.make_tensor( + "sqrt_head_size", TensorProto.FLOAT, [1], np.array([np.sqrt(self.head_size)], dtype=np.float32) + ), + helper.make_tensor("neg_int_max", TensorProto.FLOAT, [1], np.array([-sys.maxsize - 1], dtype=np.int64)), + helper.make_tensor("num_heads", TensorProto.FLOAT, [1], np.array([self.num_heads], dtype=np.float32)), + helper.make_tensor("head_size", TensorProto.FLOAT, [1], np.array([self.head_size], dtype=np.float32)), + helper.make_tensor("hidden_size", TensorProto.FLOAT, [1], np.array([self.hidden_size], dtype=np.float32)), + helper.make_tensor("zero", TensorProto.FLOAT, [1], np.array([0], dtype=np.int64)), + helper.make_tensor("one", TensorProto.FLOAT, [1], np.array([1], dtype=np.int64)), + helper.make_tensor("two", TensorProto.FLOAT, [1], np.array([2], dtype=np.int64)), + helper.make_tensor("three", TensorProto.FLOAT, [1], np.array([3], dtype=np.int64)), + ] + return initializers + + def create_inputs_and_outputs(self, model_type: str): + attn_mask_size = [self.batch_size, self.sequence_length] + if model_type == "llama2_msft": + attn_mask_size.append(self.sequence_length) + + inputs = [ + helper.make_tensor_value_info( + "input_0", TensorProto.FLOAT, [self.batch_size, self.sequence_length, 
self.hidden_size] + ), + helper.make_tensor_value_info("position_ids", TensorProto.INT64, [self.batch_size, self.sequence_length]), + helper.make_tensor_value_info("attn_mask", TensorProto.INT64, attn_mask_size), + ] + if model_type in {"past", "merged", "llama2_msft"}: + inputs.extend( + [ + helper.make_tensor_value_info( + "past_key", + TensorProto.FLOAT, + [self.batch_size, self.num_heads, self.past_sequence_length, self.head_size], + ), + helper.make_tensor_value_info( + "past_value", + TensorProto.FLOAT, + [self.batch_size, self.num_heads, self.past_sequence_length, self.head_size], + ), + ] + ) + outputs = [ + helper.make_tensor_value_info( + "output_0", TensorProto.FLOAT, [self.batch_size, self.sequence_length, self.hidden_size] + ), + helper.make_tensor_value_info( + "present_key", + TensorProto.FLOAT, + [self.batch_size, self.num_heads, self.past_sequence_length + 1, self.head_size], + ), + helper.make_tensor_value_info( + "present_value", + TensorProto.FLOAT, + [self.batch_size, self.num_heads, self.past_sequence_length + 1, self.head_size], + ), + ] + return inputs, outputs + + def create_matmul_nodes(self, is_fused: bool, model_type: str): + q_matmul_node = helper.make_node( + "MatMul", + inputs=["input_0", "q_weight"], + outputs=["q_out" if is_fused or model_type == "llama2_msft" else "q_matmul_out"], + name="Q_MatMul", + ) + + k_matmul_node = helper.make_node( + "MatMul", + inputs=["input_0", "k_weight"], + outputs=["k_out" if is_fused or model_type == "llama2_msft" else "k_matmul_out"], + name="K_MatMul", + ) + + v_matmul_node = helper.make_node( + "MatMul", + inputs=["input_0", "v_weight"], + outputs=["v_out"], + name="V_MatMul", + ) + + return [q_matmul_node, k_matmul_node, v_matmul_node] + + def create_rotary_embeddings( + self, + is_fused: bool, + model_type: str, + interleaved: bool, + inputs: List[TensorProto], + initializers: List[TensorProto], + ): + def get_first_rope_input(node_type: str): + if is_fused or model_type == "llama2_msft": + # q_out/k_out + return f"{node_type}_out" + if model_type in {"no_past", "past", "merged"}: + if node_type == "k": + return "k_before_rope" + return "q_before_rope" + return "" + + def get_first_rope_output(node_type: str): + if is_fused or model_type in {"llama2_msft", "past", "merged"}: + if node_type == "q": + return "q_rope" + return "k_rope" + if model_type in {"no_past"}: + if node_type == "k": + return "present_key" + return "q_rope" + return "" + + q_rope_node = helper.make_node( + "RotaryEmbedding", + inputs=[get_first_rope_input("q"), inputs[1].name, initializers[0].name, initializers[1].name], + outputs=[get_first_rope_output("q")], + name="Q_RotaryEmbedding", + interleaved=int(interleaved), + ) + + k_rope_node = helper.make_node( + "RotaryEmbedding", + inputs=[get_first_rope_input("k"), inputs[1].name, initializers[0].name, initializers[1].name], + outputs=[get_first_rope_output("k")], + name="K_RotaryEmbedding", + interleaved=int(interleaved), + ) + + return [q_rope_node, k_rope_node] + + def create_q_path(self, model_type: str): + if model_type == "llama2_msft": + transpose_q_node = helper.make_node( + "Transpose", + inputs=["q_rope"], + outputs=["q_transposed"], + name="Transpose_q", + perm=[0, 2, 1, 3], + ) + reshape_q_node = helper.make_node( + "Reshape", + inputs=["q_transposed", "concat_q_extra_out"], + outputs=["q"], + name="Reshape_q", + ) + return [transpose_q_node, reshape_q_node] + + reshape_q_node = helper.make_node( + "Reshape", + inputs=["q_matmul_out", "concat_q_extra_out"], + outputs=["q_reshaped"], + 
name="Reshape_q", + ) + transpose_q_node = helper.make_node( + "Transpose", + inputs=["q_reshaped"], + outputs=["q_before_rope"], + name="Transpose_q", + ) + return [reshape_q_node, transpose_q_node] + + def create_k_path_llama2_msft(self): + # Create k cache slicing path + k_cache_unsqueeze_node = helper.make_node( + "Unsqueeze", + inputs=["position_ids", "zero"], + outputs=["k_pos_id"], + ) + k_cache_slice_node = helper.make_node( + "Slice", + inputs=["past_key", "zero", "k_pos_id", "two", "one"], + outputs=["k_cache_sliced"], + ) + # Create k path + transpose_k_1_node = helper.make_node( + "Transpose", + inputs=["k_rope"], + outputs=["k_rope_transposed"], + name="Transpose_k_1", + perm=[0, 2, 1, 3], + ) + concat_k_node = helper.make_node( + "Concat", + inputs=["k_cache_sliced", "k_rope_transposed"], + outputs=["present_key"], + name="Concat_k", + axis=2, + ) + transpose_k_2_node = helper.make_node( + "Transpose", + inputs=["present_key"], + outputs=["present_key_transposed"], + name="Transpose_k_2", + perm=[0, 2, 3, 1], + ) + reshape_k_node = helper.make_node( + "Reshape", + inputs=["present_key_transposed", "concat_k_extra_out"], + outputs=["k"], + name="Reshape_k", + ) + return [ + k_cache_unsqueeze_node, + k_cache_slice_node, + transpose_k_1_node, + concat_k_node, + transpose_k_2_node, + reshape_k_node, + ] + + def create_k_path_hf(self, model_type: str): + reshape_k_node = helper.make_node( + "Reshape", + inputs=["k_matmul_out", "concat_k_extra_out"], + outputs=["k_reshaped"], + name="Reshape_k", + ) + transpose_k_1_node = helper.make_node( + "Transpose", + inputs=["k_reshaped"], + outputs=["k_before_rope"], + name="Transpose_k_1", + perm=[0, 2, 1, 3], + ) + k_nodes = [reshape_k_node, transpose_k_1_node] + + if model_type in {"past", "merged"}: + concat_k_node = helper.make_node( + "Concat", + inputs=["past_key", "k_rope"], + outputs=["present_key"], + axis=2, + ) + k_nodes.append(concat_k_node) + + transpose_k_2_node = helper.make_node( + "Transpose", + inputs=["present_key"], + outputs=["k"], + name="Transpose_k_2", + perm=[0, 1, 3, 2], + ) + return k_nodes + [transpose_k_2_node] # noqa: RUF005 + + def create_k_path(self, model_type: str): + if model_type == "llama2_msft": + return self.create_k_path_llama2_msft() + return self.create_k_path_hf(model_type) + + def create_attn_mask_path_llama2_msft(self): + x_shape_node = helper.make_node( + "Shape", + inputs=["input_0"], + outputs=["input_0_shape"], + name="Shape_input", + ) + x_get_seq_len_node = helper.make_node( + "Gather", + inputs=["input_0_shape", "one"], + outputs=["input_0_seq_len"], + name="Gather_input", + axis=0, + ) + x_new_seq_len_node = helper.make_node( + "Add", + inputs=["position_ids", "input_0_seq_len"], + outputs=["new_seq_len"], + name="Add_mask", + ) + unsqueeze_0_node = helper.make_node( + "Unsqueeze", + inputs=["position_ids", "zero"], + outputs=["unsqueeze_mask_0_out"], + name="Unsqueeze_mask_0", + ) + unsqueeze_1_node = helper.make_node( + "Unsqueeze", + inputs=["new_seq_len", "zero"], + outputs=["unsqueeze_mask_1_out"], + name="Unsqueeze_mask_1", + ) + unsqueeze_2_node = helper.make_node( + "Unsqueeze", + inputs=["new_seq_len", "zero"], + outputs=["unsqueeze_mask_2_out"], + name="Unsqueeze_mask_2", + ) + slice_mask_1_node = helper.make_node( + "Slice", + inputs=["attn_mask", "unsqueeze_mask_0_out", "unsqueeze_mask_1_out", "one", "one"], + outputs=["slice_mask_1_out"], + name="Slice_mask_1", + ) + slice_mask_2_node = helper.make_node( + "Slice", + inputs=["slice_mask_1_out", "zero", "unsqueeze_mask_2_out", 
"two", "one"], + outputs=["slice_mask_2_out"], + name="Slice_mask_2", + ) + concat_mask_node = helper.make_node( + "Concat", + inputs=["slice_mask_2_out" for _ in range(self.num_heads)], + outputs=["attn_mask_out"], + name="Concat_mask", + axis=0, + ) + return [ + x_shape_node, + x_get_seq_len_node, + x_new_seq_len_node, + unsqueeze_0_node, + unsqueeze_1_node, + unsqueeze_2_node, + slice_mask_1_node, + slice_mask_2_node, + concat_mask_node, + ] + + def create_attn_mask_path_hf(self, model_type: str): + unsqueeze_1_node = helper.make_node( + "Unsqueeze", + inputs=["attn_mask", "one"], + outputs=["unsqueeze_1_mask_out"], + name="Unsqueeze_1_mask", + ) + unsqueeze_2_node = helper.make_node( + "Unsqueeze", + inputs=["unsqueeze_1_mask_out", "two"], + outputs=["unsqueeze_2_mask_out"], + name="Unsqueeze_2_mask", + ) + expand_node = helper.make_node( + "Expand", + inputs=["unsqueeze_2_mask_out", "zero"], + outputs=["expand_out"], + name="Expand_mask", + ) + cast_node = helper.make_node( + "Cast", + inputs=["expand_out"], + outputs=["cast_out"], + name="Cast_mask", + to=TensorProto.FLOAT, + ) + sub_node = helper.make_node( + "Sub", + inputs=["one", "cast_out"], + outputs=["sub_out"], + name="Sub_mask", + ) + where_node = helper.make_node( + "Where", + inputs=["zero", "neg_int_max", "sub_out"], + outputs=["where_out" if model_type != "past" else "attn_mask_out"], + name="Where_mask", + ) + attn_mask_nodes = [unsqueeze_1_node, unsqueeze_2_node, expand_node, cast_node, sub_node, where_node] + + if model_type == "past": + return attn_mask_nodes + + add_node = helper.make_node( + "Add", + inputs=["where_out", "zero"], + outputs=["attn_mask_out"], + name="Add_mask", + ) + return attn_mask_nodes + [add_node] # noqa: RUF005 + + def create_attn_mask_path(self, is_fused: bool, model_type: str): + if model_type == "llama2_msft": + attn_mask_nodes = self.create_attn_mask_path_llama2_msft() + if is_fused: + attn_mask_nodes.pop() + attn_mask_nodes[-1].output[0] = "attn_mask_out" + return attn_mask_nodes + + attn_mask_nodes = self.create_attn_mask_path_hf(model_type) + if is_fused: + new_output_name = "attn_mask_out_mask" + attn_mask_nodes[-1].output[0] = new_output_name + concat_mask_node = helper.make_node( + "Concat", + inputs=[new_output_name for _ in range(self.num_heads)], + outputs=["attn_mask_out"], + name="Concat_mask", + axis=0, + ) + attn_mask_nodes.append(concat_mask_node) + return attn_mask_nodes + + def create_qk_path(self, model_type: str): + matmul_qk_node = helper.make_node( + "MatMul", + inputs=["q" if model_type == "llama2_msft" else "q_rope", "k"], + outputs=["qk"], + name="MatMul_q_k", + ) + div_node = helper.make_node( + "Div", + inputs=["qk", "sqrt_head_size"], + outputs=["qk_div"], + name="Div_0", + ) + add_node = helper.make_node( + "Add", + inputs=["qk_div", "attn_mask_out"], + outputs=["qk_plus_mask"], + name="Add_0", + ) + softmax_node = helper.make_node( + "Softmax", + inputs=["qk_plus_mask"], + outputs=["softmax_out"], + name="Softmax_0", + ) + return [matmul_qk_node, div_node, add_node, softmax_node] + + def create_v_path(self, model_type: str): + reshape_v_1_node = helper.make_node( + "Reshape", + inputs=["v_out", "concat_v_1_extra_out"], + outputs=["reshape_v_1_out"], + name="Reshape_v_1", + ) + transpose_v_1_node = helper.make_node( + "Transpose", + inputs=["reshape_v_1_out"], + outputs=["transpose_v_1_out" if model_type != "no_past" else "present_value"], + name="Transpose_v_1", + ) + v_nodes = [reshape_v_1_node, transpose_v_1_node] + + if model_type == "no_past": + return 
v_nodes + + if model_type in {"past", "merged"}: + concat_v_node = helper.make_node( + "Concat", + inputs=["past_value", "transpose_v_1_out"], + outputs=["present_value"], + name="Concat_v", + axis=2, + ) + return v_nodes + [concat_v_node] # noqa: RUF005 + + # Create extra nodes for `position_ids` + unsqueeze_v_node = helper.make_node( + "Unsqueeze", + inputs=["position_ids", "zero"], + outputs=["unsqueeze_v_out"], + name="Unsqueeze_v", + ) + slice_v_node = helper.make_node( + "Slice", + inputs=["past_value", "zero", "unsqueeze_v_out", "two", "one"], + outputs=["v_cache_sliced_out"], + name="Slice_v", + ) + concat_v_node = helper.make_node( + "Concat", + inputs=["v_cache_sliced_out", "transpose_v_1_out"], + outputs=["present_value"], + name="Concat_v", + axis=2, + ) + v_nodes.extend([unsqueeze_v_node, slice_v_node, concat_v_node]) + + # Create remaining nodes for v path + transpose_v_2_node = helper.make_node( + "Transpose", + inputs=["present_value"], + outputs=["transpose_v_2_out"], + name="Transpose_v_2", + ) + reshape_v_2_node = helper.make_node( + "Reshape", + inputs=["transpose_v_2_out", "concat_v_2_extra_out"], + outputs=["v"], + name="Reshape_v_2", + ) + return v_nodes + [transpose_v_2_node, reshape_v_2_node] # noqa: RUF005 + + def create_qkv_path(self, model_type: str): + matmul_qkv_node = helper.make_node( + "MatMul", + inputs=["softmax_out", "v" if model_type == "llama2_msft" else "present_value"], + outputs=["softmax_v_out"], + name="MatMul_softmax_v", + ) + qkv_nodes = [matmul_qkv_node] + + if model_type == "llama2_msft": + reshape_qkv_1_node = helper.make_node( + "Reshape", + inputs=["softmax_v_out", "concat_qkv_1_extra_out"], + outputs=["reshape_qkv_1_out"], + name="Reshape_qkv_1", + ) + qkv_nodes.append(reshape_qkv_1_node) + + transpose_qkv_node = helper.make_node( + "Transpose", + inputs=["reshape_qkv_1_out" if model_type == "llama2_msft" else "softmax_v_out"], + outputs=["transpose_qkv_out"], + name="Transpose_qkv", + ) + reshape_qkv_2_node = helper.make_node( + "Reshape", + inputs=["transpose_qkv_out", "concat_qkv_2_extra_out"], + outputs=["attn_output"], + name="Reshape_qkv_2", + ) + + return qkv_nodes + [transpose_qkv_node, reshape_qkv_2_node] # noqa: RUF005 + + def create_concat_unsqueeze_paths(self, model_type: str, reshape_nodes: List[NodeProto]): + # Create initial shape paths + shape_0_node = helper.make_node( + "Shape", + inputs=["input_0"], + outputs=["input_0_shape_0"], + name="Shape_0", + ) + gather_0_node = helper.make_node( + "Gather", + inputs=["input_0_shape_0", "zero"], + outputs=["input_0_shape_0_indexed"], + name="Gather_0", + axis=0, + ) + shape_1_node = helper.make_node( + "Shape", + inputs=["input_0"], + outputs=["input_0_shape_1"], + name="Shape_1", + ) + gather_1_node = helper.make_node( + "Gather", + inputs=["input_0_shape_1", "one"], + outputs=["input_0_shape_1_indexed"], + name="Gather_1", + axis=0, + ) + extra_nodes = [shape_0_node, gather_0_node, shape_1_node, gather_1_node] + + if model_type == "llama2_msft": + mul_node = helper.make_node( + "Mul", + inputs=[gather_0_node.output[0], "num_heads"], + outputs=["mul_extra_out"], + name="Mul_extra_0", + ) + add_node = helper.make_node( + "Add", + inputs=[gather_1_node.output[0], "position_ids"], + outputs=["add_extra_out"], + name="Add_extra_0", + ) + extra_nodes.extend([mul_node, add_node]) + + for i, reshape_node in enumerate(reshape_nodes): + use_mul_and_add_nodes_0 = model_type == "llama2_msft" and reshape_node.output[0] in {"q", "k", "v"} + use_mul_and_add_nodes_1 = model_type == 
"llama2_msft" and reshape_node.output[0] in {"k", "v"} + + unsqueeze_0_node = helper.make_node( + "Unsqueeze", + inputs=[gather_0_node.output[0] if not use_mul_and_add_nodes_0 else "mul_extra_out", "zero"], + outputs=[f"unsqueeze_extra_{2*i}"], + name=f"Unsqueeze_extra_{2*i}", + ) + unsqueeze_1_node = helper.make_node( + "Unsqueeze", + inputs=[gather_1_node.output[0] if not use_mul_and_add_nodes_1 else "add_extra_out", "zero"], + outputs=[f"unsqueeze_extra_{2*i + 1}"], + name=f"Unsqueeze_extra_{2*i + 1}", + ) + + reshape_name = reshape_node.name + if reshape_name == "Reshape_qkv_2": + concat_node_inputs = [unsqueeze_0_node.output[0], unsqueeze_1_node.output[0], "hidden_size"] + elif reshape_name == "Reshape_qkv_1": + concat_node_inputs = [unsqueeze_0_node.output[0], "num_heads", unsqueeze_1_node.output[0], "head_size"] + elif reshape_name == "Reshape_v_2": + concat_node_inputs = [unsqueeze_0_node.output[0], unsqueeze_1_node.output[0], "head_size"] + elif reshape_name == "Reshape_v_1": + concat_node_inputs = [unsqueeze_0_node.output[0], unsqueeze_1_node.output[0], "num_heads", "head_size"] + elif reshape_name == "Reshape_k": + concat_node_inputs = [unsqueeze_0_node.output[0], "head_size", unsqueeze_1_node.output[0]] + elif reshape_name == "Reshape_q": + concat_node_inputs = [unsqueeze_0_node.output[0], unsqueeze_1_node.output[0], "head_size"] + + concat_node = helper.make_node( + "Concat", + inputs=concat_node_inputs, + outputs=[reshape_nodes[i].input[1]], + name=f"Concat_extra_{i}", + axis=0, + ) + extra_nodes.extend([unsqueeze_0_node, unsqueeze_1_node, concat_node]) + + return extra_nodes + + def create_end_nodes(self): + matmul_o_node = helper.make_node( + "MatMul", + inputs=["attn_output", "o_weight"], + outputs=["output_proj"], + name="MatMul_o_proj", + ) + end_node = helper.make_node( + "Add", + inputs=["zero", "output_proj"], + outputs=["output_0"], + name="Add_normalize_node", + ) + return [matmul_o_node, end_node] + + def create_fused_model(self, model_type: str, interleaved: bool, initializers: List[TensorProto]): + inputs, outputs = self.create_inputs_and_outputs(model_type) + matmul_nodes = self.create_matmul_nodes(True, model_type=model_type) + rope_nodes = self.create_rotary_embeddings(True, model_type, interleaved, inputs, initializers) + attn_mask_nodes = self.create_attn_mask_path(True, model_type) + + mha_inputs = [ + rope_nodes[0].output[0], # q + rope_nodes[1].output[0], # k + matmul_nodes[-1].output[0], # v + "", # bias + "attn_mask_out" if model_type == "llama2_msft" else "", # attn_mask + "attn_mask_out" if model_type != "llama2_msft" else "", # add_qk + "past_key" if model_type != "no_past" else "", # past_key + "past_value" if model_type != "no_past" else "", # past_value + ] + mha_node = helper.make_node( + "MultiHeadAttention", + inputs=mha_inputs, + outputs=["attn_output", "present_key", "present_value"], + name="MultiHeadAttention_0", + num_heads=self.num_heads, + ) + + end_nodes = self.create_end_nodes() + + graph = helper.make_graph( + nodes=matmul_nodes + rope_nodes + attn_mask_nodes + [mha_node] + end_nodes, + name="RotaryAttention_Graph", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + opset_import = helper.make_opsetid(domain="com.microsoft", version=1) + model = helper.make_model(graph, opset_imports=[opset_import]) + return model + + def create_test_model(self, model_type: str, interleaved: bool, initializers: List[TensorProto]): + inputs, outputs = self.create_inputs_and_outputs(model_type) + matmul_nodes = 
self.create_matmul_nodes(False, model_type) + rope_nodes = self.create_rotary_embeddings(False, model_type, interleaved, inputs, initializers) + + # Create main paths + q_nodes = self.create_q_path(model_type) + k_nodes = self.create_k_path(model_type) + attn_mask_nodes = self.create_attn_mask_path(False, model_type) + qk_nodes = self.create_qk_path(model_type) + v_nodes = self.create_v_path(model_type) + qkv_nodes = self.create_qkv_path(model_type) + + reshape_nodes = list(filter(lambda node: node.op_type == "Reshape", q_nodes + k_nodes + v_nodes + qkv_nodes)) + extra_nodes = self.create_concat_unsqueeze_paths(model_type, reshape_nodes) + + end_nodes = self.create_end_nodes() + + first_set_of_nodes = matmul_nodes + rope_nodes + q_nodes + k_nodes + attn_mask_nodes + second_set_of_nodes = qk_nodes + v_nodes + qkv_nodes + extra_nodes + end_nodes + graph = helper.make_graph( + nodes=first_set_of_nodes + second_set_of_nodes, + name="RotaryAttention_Graph", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + opset_import = helper.make_opsetid(domain="ai.onnx", version=17) + model = helper.make_model(graph, opset_imports=[opset_import]) + return model + + def check_models(self, model_type: str, interleaved: bool): + initializers = self.create_initializers() + + expected_model_filename = "expected_model.onnx" + expected_model = self.create_fused_model(model_type, interleaved, initializers) + onnx.save(expected_model, expected_model_filename) + + original_model_filename = "original_model.onnx" + original_model = self.create_test_model(model_type, interleaved, initializers) + onnx.save(original_model, original_model_filename) + + self.verify_fusion(expected_model_filename, original_model_filename) + os.remove(expected_model_filename) + os.remove(original_model_filename) + + def test_llama2_msft_model(self): + model_type = "llama2_msft" + interleaved = True + self.check_models(model_type, interleaved) + + def test_hf_decoder_model(self): + model_type = "no_past" + interleaved = False + self.check_models(model_type, interleaved) + + def test_hf_decoder_with_past_model(self): + model_type = "past" + interleaved = False + self.check_models(model_type, interleaved) + + def test_hf_decoder_merged_model(self): + model_type = "merged" + interleaved = False + self.check_models(model_type, interleaved) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_simplified_layernorm_fusion.py b/onnxruntime/test/python/transformers/test_simplified_layernorm_fusion.py new file mode 100644 index 0000000000000..e86bdda7baffb --- /dev/null +++ b/onnxruntime/test/python/transformers/test_simplified_layernorm_fusion.py @@ -0,0 +1,243 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- + +import os +import unittest +from typing import List + +import numpy as np +import onnx +from onnx import TensorProto, helper +from parity_utilities import find_transformers_source + +if find_transformers_source(): + from fusion_options import FusionOptions + from onnx_model import OnnxModel + from optimizer import optimize_model +else: + from onnxruntime.transformers.fusion_options import FusionOptions + from onnxruntime.transformers.onnx_model import OnnxModel + from onnxruntime.transformers.optimizer import optimize_model + + +def float_tensor(name: str, shape: List[int], random=False): + low = 0.0 + high = 1.0 + total_elements = 1 + for x in shape: + total_elements *= x + weights = [np.random.uniform(low, high) for _ in range(total_elements)] if random else [1.0] * total_elements + return helper.make_tensor(name, TensorProto.FLOAT, shape, weights) + + +class TestSimplifiedLayerNormFusion(unittest.TestCase): + def setUp(self): + self.vocab_size = 5 + self.batch_size = 2 + self.sequence_length = 8 + self.hidden_size = 16 + self.epsilon = 0.000009999999747378752 + + def verify_fusion(self, expected_model_path, original_model_path): + expected_model = OnnxModel(onnx.load(expected_model_path)) + expected_model.topological_sort(is_deterministic=True) + + options = FusionOptions("gpt2") + optimized_model = optimize_model(original_model_path, optimization_options=options) + optimized_model.topological_sort(is_deterministic=True) + + self.assertTrue(str(expected_model.model.graph), str(optimized_model.model.graph)) + + def create_initializers(self, use_embed_weight: bool = False): + initializers = [ + helper.make_tensor("Two", TensorProto.FLOAT, [1], np.array([2], dtype=np.float32)), + helper.make_tensor("epsilon", TensorProto.FLOAT, [1], np.array([self.epsilon], dtype=np.float32)), + helper.make_tensor("One", TensorProto.FLOAT, [1], np.array([1], dtype=np.float32)), + float_tensor("scale", [self.hidden_size]), + ] + if use_embed_weight: + initializers = [ # noqa: RUF005 + float_tensor("embed_weight", [self.vocab_size, self.hidden_size]) + ] + initializers + return initializers + + def create_inputs_and_outputs(self, start_node_type: str): + inputs, start_node = None, None + if start_node_type == "Add": + start_node = helper.make_node( + "Add", + inputs=["input_0", "input_1"], + outputs=["D"], + name="Add_0", + ) + input_0 = helper.make_tensor_value_info( + "input_0", + TensorProto.FLOAT, + [self.batch_size, self.sequence_length, self.hidden_size], + ) + input_1 = helper.make_tensor_value_info( + "input_1", + TensorProto.FLOAT, + [self.batch_size, self.sequence_length, self.hidden_size], + ) + inputs = [input_0, input_1] + elif start_node_type == "Gather": + start_node = helper.make_node( + "Gather", + inputs=["embed_weight", "input_0"], + outputs=["D"], + name="Gather_0", + ) + input_0 = helper.make_tensor_value_info( + "input_0", + TensorProto.INT64, + [self.batch_size, self.sequence_length], + ) + inputs = [input_0] + else: + # start_node_type is a graph input + assert start_node_type == "GraphInput" + input_0 = helper.make_tensor_value_info( + "D", + TensorProto.FLOAT, + [self.batch_size, self.sequence_length, self.hidden_size], + ) + inputs = [input_0] + + outputs = [ + helper.make_tensor_value_info( + "output_0", + TensorProto.FLOAT, + [self.batch_size, self.sequence_length, self.hidden_size], + ) + ] + return inputs, outputs, start_node + + def create_fused_model(self, start_node_type: str, initializers: 
List[TensorProto]): + inputs, outputs, start_node = self.create_inputs_and_outputs(start_node_type) + + sln_node = helper.make_node( + "SimplifiedLayerNormalization", + inputs=[start_node.output[0] if start_node is not None else "D", initializers[0].name], + outputs=[outputs[0].name], + axis=-1, + epsilon=initializers[2].float_data[0], + stash_type=1, + ) + + graph = helper.make_graph( + nodes=[sln_node] + ([] if start_node is None else [start_node]), + name="SimplifiedLayerNorm_Graph", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + opset_import = helper.make_opsetid(domain="com.microsoft", version=1) + model = helper.make_model(graph, opset_imports=[opset_import]) + return model + + # Notation follows https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary + def create_test_model(self, start_node_type: str, first_parent_idx: int, initializers: List[TensorProto]): + end_node = helper.make_node( + "Mul", + inputs=["scale", "Normalized"] if first_parent_idx == 1 else ["Normalized", "scale"], + outputs=["output_0"], + name="Mul_1", + ) + mul_node = helper.make_node( + "Mul", + inputs=["D", "InvStdDev"], + outputs=["Normalized"], + name="Mul_0", + ) + div_node = helper.make_node( + "Div", + inputs=["One", "StdDev"], + outputs=["InvStdDev"], + name="Div_0", + ) + sqrt_node = helper.make_node( + "Sqrt", + inputs=["VarEps"], + outputs=["StdDev"], + name="Sqrt_0", + ) + add_node = helper.make_node( + "Add", + inputs=["Var", "epsilon"], + outputs=["VarEps"], + name="Add_1", + ) + reducemean_node = helper.make_node( + "ReduceMean", + inputs=["DD"], + outputs=["Var"], + name="ReduceMean_0", + ) + pow_node = helper.make_node( + "Pow", + inputs=["D", "Two"], + outputs=["DD"], + name="Pow_0", + ) + + inputs, outputs, start_node = self.create_inputs_and_outputs(start_node_type) + + main_nodes = [pow_node, reducemean_node, add_node, sqrt_node, div_node, mul_node, end_node] + graph = helper.make_graph( + nodes=main_nodes + ([] if start_node is None else [start_node]), + name="SimplifiedLayerNorm_Graph", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + opset_import = helper.make_opsetid(domain="com.microsoft", version=1) + model = helper.make_model(graph, opset_imports=[opset_import]) + return model + + def check_models(self, start_node_type: str, first_parent_idx: int, initializers: List[TensorProto]): + expected_model_filename = "expected_model.onnx" + expected_model = self.create_fused_model(start_node_type, initializers) + onnx.save(expected_model, expected_model_filename) + + original_model_filename = "original_model.onnx" + original_model = self.create_test_model(start_node_type, first_parent_idx, initializers) + onnx.save(original_model, original_model_filename) + + self.verify_fusion(expected_model_filename, original_model_filename) + os.remove(expected_model_filename) + os.remove(original_model_filename) + + # sim_ln_nodes_1 + def test_simplified_layernorm_add_idx1(self): + start_node_type = "Add" + first_parent_idx = 1 + initializers = self.create_initializers() + self.check_models(start_node_type, first_parent_idx, initializers) + + # sim_ln_nodes_2 + def test_simplified_layernorm_gather_idx1(self): + start_node_type = "Gather" + first_parent_idx = 1 + initializers = self.create_initializers(use_embed_weight=True) + self.check_models(start_node_type, first_parent_idx, initializers) + + # sim_ln_nodes_3 + def test_simplified_layernorm_add_idx0(self): + start_node_type = "Add" + first_parent_idx = 0 + initializers = self.create_initializers() 
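+ # Note (descriptive only): first_parent_idx selects which input slot of the final Mul carries the
+ # "scale" initializer in the unfused graph (create_test_model builds Mul(scale, Normalized) for
+ # index 1 and Mul(Normalized, scale) for index 0), so the fusion is exercised for both operand orders.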
+ self.check_models(start_node_type, first_parent_idx, initializers) + + # sim_ln_nodes_4 + def test_simplified_layernorm_gather_graph_input(self): + start_node_type = "GraphInput" + first_parent_idx = 0 + initializers = self.create_initializers() + self.check_models(start_node_type, first_parent_idx, initializers) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_whisper.py b/onnxruntime/test/python/transformers/test_whisper.py index ebda0bccaadcf..ceda5a88c3925 100644 --- a/onnxruntime/test/python/transformers/test_whisper.py +++ b/onnxruntime/test/python/transformers/test_whisper.py @@ -50,7 +50,7 @@ def verify_fusion(self, optimized_model, expected_model_filename): ) ) - # Attention type #1 in onnx_model_bart.py + # Attention type #1 in fusion_bart_attention.py def test_encoder_attention_fusion_with_skiplayernorm(self): num_heads = 4 hidden_size = 64 @@ -67,7 +67,7 @@ def test_encoder_attention_fusion_with_skiplayernorm(self): os.remove(model_path) self.verify_fusion(optimized_model, "encoder_attention_with_sln_fused.onnx") - # Attention type #2 in onnx_model_bart.py + # Attention type #2 in fusion_bart_attention.py def test_decoder_attention_fusion_with_skiplayernorm(self): num_heads = 4 hidden_size = 64 @@ -84,7 +84,7 @@ def test_decoder_attention_fusion_with_skiplayernorm(self): os.remove(model_path) self.verify_fusion(optimized_model, "decoder_attention_with_sln_fused.onnx") - # Attention type #4 in onnx_model_bart.py + # Attention type #4 in fusion_bart_attention.py def test_decoder_multihead_attention_fusion(self): num_heads = 4 hidden_size = 64 @@ -100,7 +100,7 @@ def test_decoder_multihead_attention_fusion(self): os.remove(model_path) self.verify_fusion(optimized_model, "decoder_mha_fused.onnx") - # Attention type #3 in onnx_model_bart.py + # Attention type #3 in fusion_bart_attention.py def test_decoder_with_past_multihead_self_attention_fusion_with_skiplayernorm(self): num_heads = 4 hidden_size = 64 @@ -118,7 +118,7 @@ def test_decoder_with_past_multihead_self_attention_fusion_with_skiplayernorm(se os.remove(model_path) self.verify_fusion(optimized_model, "decoder_with_past_self_mha_fused.onnx") - # Attention type #5 in onnx_model_bart.py + # Attention type #5 in fusion_bart_attention.py def test_decoder_with_past_multihead_cross_attention_fusion(self): num_heads = 4 hidden_size = 64 @@ -134,7 +134,7 @@ def test_decoder_with_past_multihead_cross_attention_fusion(self): os.remove(model_path) self.verify_fusion(optimized_model, "decoder_with_past_cross_mha_fused.onnx") - # Attention type #4 in onnx_model_bart.py + # Attention type #4 in fusion_bart_attention.py def test_decoder_multihead_attention_split_bias_fusion(self): num_heads = 4 hidden_size = 64 @@ -151,7 +151,7 @@ def test_decoder_multihead_attention_split_bias_fusion(self): os.remove(model_path) self.verify_fusion(optimized_model, "decoder_mha_split_bias_fused.onnx") - # Attention type #3 in onnx_model_bart.py + # Attention type #3 in fusion_bart_attention.py def test_decoder_with_past_multihead_self_attention_split_bias_fusion_with_skiplayernorm(self): num_heads = 4 hidden_size = 64 @@ -171,7 +171,7 @@ def test_decoder_with_past_multihead_self_attention_split_bias_fusion_with_skipl os.remove(model_path) self.verify_fusion(optimized_model, "decoder_with_past_self_mha_split_bias_fused.onnx") - # Attention type #5 in onnx_model_bart.py + # Attention type #5 in fusion_bart_attention.py def test_decoder_with_past_multihead_cross_attention_split_bias_fusion(self): 
num_heads = 4 hidden_size = 64 diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 565bf068e7abd..ba282193c5ca6 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -2832,6 +2832,132 @@ TEST(CApiTest, ConfigureCudaArenaAndDemonstrateMemoryArenaShrinkage) { #endif #ifdef USE_TENSORRT +TEST(CApiTest, TestExternalCUDAStreamWithIOBinding) { + const auto& api = Ort::GetApi(); + Ort::SessionOptions session_options; + + OrtTensorRTProviderOptionsV2* trt_options; + ASSERT_TRUE(api.CreateTensorRTProviderOptions(&trt_options) == nullptr); + std::unique_ptr + rel_trt_options(trt_options, api.ReleaseTensorRTProviderOptions); + + // updating provider option with user provided compute stream + cudaStream_t compute_stream = nullptr; + void* user_compute_stream = nullptr; + cudaStreamCreate(&compute_stream); + ASSERT_TRUE(api.UpdateTensorRTProviderOptionsWithValue(rel_trt_options.get(), "user_compute_stream", compute_stream) == nullptr); + ASSERT_TRUE(api.GetTensorRTProviderOptionsByName(rel_trt_options.get(), "user_compute_stream", &user_compute_stream) == nullptr); + ASSERT_TRUE(user_compute_stream == (void*)compute_stream); + + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2( + static_cast(session_options), + rel_trt_options.get()) == nullptr); + + Ort::Session session(*ort_env, MODEL_URI, session_options); + Ort::MemoryInfo info_cuda("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); + + const std::array x_shape = {3, 2}; + std::array x_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + /* + * Use cudaMallocHost() (pinned memory allocation) to create input/output tensors + */ + float* input_data; + cudaMallocHost(&input_data, 3 * 2 * sizeof(float)); + ASSERT_NE(input_data, nullptr); + cudaMemcpy(input_data, x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice); + + std::cout << "pinned memory allocation" << std::endl; + std::cout << "input tesnor:" << std::endl; + for (int i = 0; i < 6; i++) { + std::cout << input_data[i] << std::endl; + } + + // Create an OrtValue tensor backed by data on CUDA memory + Ort::Value bound_x = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(input_data), x_values.size(), + x_shape.data(), x_shape.size()); + + const std::array expected_y_shape = {3, 2}; + std::array expected_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + float* output_data; + cudaMallocHost(&output_data, 3 * 2 * sizeof(float)); + ASSERT_NE(output_data, nullptr); + + // Create an OrtValue tensor backed by data on CUDA memory + Ort::Value bound_y = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(output_data), + expected_y.size(), expected_y_shape.data(), expected_y_shape.size()); + + // Create IoBinding for inputs and outputs. 
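+ // Note (descriptive only): the two IoBinding objects below differ only in how their buffers were
+ // allocated (cudaMallocHost for the pinned pair above vs. cudaMalloc for the second pair further down);
+ // both are wrapped as OrtValues on the "Cuda" memory info and run through the same session that was
+ // configured with the user-provided compute stream.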
+ Ort::IoBinding binding(session); + binding.BindInput("X", bound_x); + binding.BindOutput("Y", bound_y); + + /* + * Use cudaMalloc() (pageable memory allocation first and then implicit pinned memory allocation) to create input/output tensors + */ + float* input_data_2; + cudaMalloc(&input_data_2, 3 * 2 * sizeof(float)); + ASSERT_NE(input_data_2, nullptr); + cudaMemcpy(input_data_2, x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice); + + // Create an OrtValue tensor backed by data on CUDA memory + Ort::Value bound_x_2 = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(input_data_2), x_values.size(), + x_shape.data(), x_shape.size()); + + float* output_data_2; + cudaMalloc(&output_data_2, 3 * 2 * sizeof(float)); + ASSERT_NE(output_data_2, nullptr); + + // Create an OrtValue tensor backed by data on CUDA memory + Ort::Value bound_y_2 = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(output_data_2), + expected_y.size(), expected_y_shape.data(), expected_y_shape.size()); + + // Create IoBinding for inputs and outputs. + Ort::IoBinding binding_2(session); + binding_2.BindInput("X", bound_x_2); + binding_2.BindOutput("Y", bound_y_2); + + // Run with first iobindings + session.Run(Ort::RunOptions(), binding); + + // Check the values against the bound raw memory (needs copying from device to host first) + std::array y_values; + cudaMemcpy(y_values.data(), output_data, sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); + + std::cout << "pinned memory allocation" << std::endl; + std::cout << "output: " << std::endl; + for (auto y : y_values) { + std::cout << y << std::endl; + } + ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y)); + + // Run with second iobindings + session.Run(Ort::RunOptions(), binding_2); + + // Check the values against the bound raw memory (needs copying from device to host first) + cudaMemcpy(y_values.data(), output_data_2, sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost); + + std::cout << "pageable memory allocation" << std::endl; + std::cout << "output: " << std::endl; + for (auto y : y_values) { + std::cout << y << std::endl; + } + ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y)); + + // Clean up + binding.ClearBoundInputs(); + binding.ClearBoundOutputs(); + binding_2.ClearBoundInputs(); + binding_2.ClearBoundOutputs(); + + cudaFreeHost(input_data); + cudaFreeHost(output_data); + cudaFree(input_data_2); + cudaFree(output_data_2); + cudaStreamDestroy(compute_stream); +} + class CApiTensorRTTest : public testing::Test, public ::testing::WithParamInterface {}; // This test uses CreateTensorRTProviderOptions/UpdateTensorRTProviderOptions APIs to configure and create a TensorRT Execution Provider @@ -2849,15 +2975,6 @@ TEST_P(CApiTensorRTTest, TestConfigureTensorRTProviderOptions) { ASSERT_TRUE(api.CreateTensorRTProviderOptions(&trt_options) == nullptr); std::unique_ptr rel_trt_options(trt_options, api.ReleaseTensorRTProviderOptions); - // Only test updating provider option with user provided compute stream - cudaStream_t compute_stream = nullptr; - void* user_compute_stream = nullptr; - cudaStreamCreateWithFlags(&compute_stream, cudaStreamNonBlocking); - ASSERT_TRUE(api.UpdateTensorRTProviderOptionsWithValue(rel_trt_options.get(), "user_compute_stream", compute_stream) == nullptr); - ASSERT_TRUE(api.GetTensorRTProviderOptionsByName(rel_trt_options.get(), "user_compute_stream", &user_compute_stream) == nullptr); - ASSERT_TRUE(user_compute_stream == (void*)compute_stream); - cudaStreamDestroy(compute_stream); - const 
char* engine_cache_path = "./trt_engine_folder"; std::vector keys{"device_id", "has_user_compute_stream", "trt_fp16_enable", "trt_int8_enable", "trt_engine_cache_enable", diff --git a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc index aba35b33b75c6..3d561d378cb8c 100644 --- a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc +++ b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#ifdef USE_CUDA +#if defined(USE_CUDA) && !defined(ENABLE_TRAINING) #define ORT_API_MANUAL_INIT #include "onnxruntime_cxx_api.h" @@ -32,6 +32,9 @@ void KernelOne(const Ort::Custom::CudaContext& cuda_ctx, CUSTOM_ENFORCE(cuda_ctx.cuda_stream, "failed to fetch cuda stream"); CUSTOM_ENFORCE(cuda_ctx.cudnn_handle, "failed to fetch cudnn handle"); CUSTOM_ENFORCE(cuda_ctx.cublas_handle, "failed to fetch cublas handle"); + void* deferred_cpu_mem = cuda_ctx.AllocDeferredCpuMem(sizeof(int32_t)); + CUSTOM_ENFORCE(deferred_cpu_mem, "failed to allocate deferred cpu allocator"); + cuda_ctx.FreeDeferredCpuMem(deferred_cpu_mem); auto z_raw = Z.Allocate(input_shape); cuda_add(Z.NumberOfElement(), z_raw, X.Data(), Y.Data(), cuda_ctx.cuda_stream); } @@ -43,8 +46,4 @@ void RegisterOps(Ort::CustomOpDomain& domain) { } // namespace Cuda -#else - -void Cuda::RegisterOps(Ort::CustomOpDomain& domain) {} - #endif \ No newline at end of file diff --git a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.h b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.h index c0287c4932c98..35cd36fcd4cb7 100644 --- a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.h +++ b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.h @@ -5,6 +5,14 @@ namespace Cuda { +#if defined(USE_CUDA) && !defined(ENABLE_TRAINING) + void RegisterOps(Ort::CustomOpDomain& domain); -} \ No newline at end of file +#else + +void RegisterOps(Ort::CustomOpDomain&) {} + +#endif + +} // namespace Cuda \ No newline at end of file diff --git a/onnxruntime/test/testdata/custom_op_library/custom_op_library.cc b/onnxruntime/test/testdata/custom_op_library/custom_op_library.cc index 40fb127eb0b8f..2d5ffc3c81b0f 100644 --- a/onnxruntime/test/testdata/custom_op_library/custom_op_library.cc +++ b/onnxruntime/test/testdata/custom_op_library/custom_op_library.cc @@ -13,6 +13,8 @@ #include "core/framework/ortdevice.h" #include "core/framework/ortmemoryinfo.h" #include "cpu/cpu_ops.h" +#include "cuda/cuda_ops.h" +#include "rocm/rocm_ops.h" #include "onnxruntime_lite_custom_op.h" static const char* c_OpDomain = "test.customop"; @@ -31,10 +33,15 @@ OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtA ORT_TRY { Ort::CustomOpDomain domain{c_OpDomain}; Cpu::RegisterOps(domain); - Ort::CustomOpDomain domain_v2{"v2"}; Cpu::RegisterOps(domain_v2); + Cuda::RegisterOps(domain); + Cuda::RegisterOps(domain_v2); + + Rocm::RegisterOps(domain); + Rocm::RegisterOps(domain_v2); + Ort::UnownedSessionOptions session_options(options); session_options.Add(domain); session_options.Add(domain_v2); diff --git a/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.cc b/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.cc index 113bfb85454a2..069246b4201e7 100644 --- a/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.cc +++ b/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.cc @@ -19,7 +19,7 @@ using namespace 
Ort::Custom; throw std::runtime_error(msg); \ } -namespace Cuda { +namespace Rocm { void KernelOne(const Ort::Custom::RocmContext& rocm_ctx, const Ort::Custom::Tensor& X, @@ -38,10 +38,6 @@ void RegisterOps(Ort::CustomOpDomain& domain) { domain.Add(c_CustomOpOne.get()); } -} // namespace Cuda - -#else - -void Cuda::RegisterOps(Ort::CustomOpDomain& domain) {} +} // namespace Rocm #endif \ No newline at end of file diff --git a/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.h b/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.h index 4e8958cd9dae0..d3e9e4040a5c3 100644 --- a/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.h +++ b/onnxruntime/test/testdata/custom_op_library/rocm/rocm_ops.h @@ -5,6 +5,14 @@ namespace Rocm { +#ifdef USE_ROCM + void RegisterOps(Ort::CustomOpDomain& domain); -} \ No newline at end of file +#else + +inline void RegisterOps(Ort::CustomOpDomain&) {} + +#endif + +} // namespace Rocm diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index c142106ed506c..44db7c0078cfc 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -283,14 +283,22 @@ "^test_dft_axis", "^test_dft", "^test_dft_inverse", - "^test_isinf", - "^test_isinf_float16", - "^test_isinf_negative", - "^test_isinf_positive", - "^test_isnan", - "^test_isnan_float16", "^test_reduce_max_bool_inputs", - "^test_reduce_min_bool_inputs" + "^test_reduce_min_bool_inputs", + "^test_reduce_min_empty_set", + "^test_reduce_l1_empty_set", + "^test_reduce_l1_empty_set_expanded", + "^test_reduce_l2_empty_set", + "^test_reduce_l2_empty_set_expanded", + "^test_reduce_log_sum_empty_set", + "^test_reduce_log_sum_empty_set_expanded", + "^test_reduce_log_sum_exp_empty_set", + "^test_reduce_log_sum_exp_empty_set_expanded", + "^test_reduce_prod_empty_set", + "^test_reduce_sum_empty_set", + "^test_reduce_sum_empty_set_non_reduced_axis_zero", + "^test_reduce_sum_square_empty_set", + "^test_reduce_sum_square_empty_set_expanded" ], "current_failing_tests_x86": [ "^test_vgg19", diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-directly.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-directly.onnx new file mode 100644 index 0000000000000..fa11adaac8d95 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-directly.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-non-ignorable-node.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-non-ignorable-node.onnx new file mode 100644 index 0000000000000..1050a7285b4a6 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-non-ignorable-node.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-only-reshape.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-only-reshape.onnx new file mode 100644 index 0000000000000..c361a42700a30 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-only-reshape.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-only-transpose.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-only-transpose.onnx new file mode 100644 index 0000000000000..f70ae2e6229e7 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-only-transpose.onnx differ diff --git 
a/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-with-reshape.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-with-reshape.onnx new file mode 100644 index 0000000000000..8e4bc49514548 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-matmul-bn-with-reshape.onnx differ diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 28af61e15b2b5..e224507bc740e 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -268,7 +268,7 @@ std::unique_ptr DefaultCannExecutionProvider() { std::unique_ptr DefaultDmlExecutionProvider() { #ifdef USE_DML - if (auto factory = DMLProviderFactoryCreator::Create(0)) + if (auto factory = DMLProviderFactoryCreator::Create(0, false, false, false)) return factory->CreateProvider(); #endif return nullptr; diff --git a/onnxruntime/test/util/include/inference_session_wrapper.h b/onnxruntime/test/util/include/inference_session_wrapper.h index eab83c26b681f..757caf7987d35 100644 --- a/onnxruntime/test/util/include/inference_session_wrapper.h +++ b/onnxruntime/test/util/include/inference_session_wrapper.h @@ -12,9 +12,8 @@ namespace test { // InferenceSession wrapper class for use in tests where we need access to the Graph and SessionState class InferenceSessionWrapper : public InferenceSession { public: - explicit InferenceSessionWrapper(const SessionOptions& session_options, - const Environment& env) : InferenceSession(session_options, env) { - } + // Expose the constructors from InferenceSession + using InferenceSession::InferenceSession; const Graph& GetGraph() const { return model_->MainGraph(); diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index 133cab71f2b1c..6547f53a3c2ae 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -2147,5 +2147,13 @@ IMPLEMENT_GRADIENT_BUILDER(GetScaledSumGradient) { ORT_THROW("ScaledSum gradient builder does not support ", input_count, " inputs"); } +IMPLEMENT_GRADIENT_BUILDER(GetResizeGradient) { + return std::vector{ + NodeDef(OpDef{"ResizeGrad", kMSDomain, 1}, + {GO(0), I(0), I(1), I(2)}, + {GI(0)}, + SrcNodeAttributes())}; +} + } // namespace training } // namespace onnxruntime diff --git a/orttraining/orttraining/core/graph/gradient_builder.h b/orttraining/orttraining/core/graph/gradient_builder.h index a517e8af13fcc..28a316261e2f6 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.h +++ b/orttraining/orttraining/core/graph/gradient_builder.h @@ -90,6 +90,7 @@ DECLARE_GRADIENT_BUILDER(GetGRUGradient) DECLARE_GRADIENT_BUILDER(GetReciprocalGradient) DECLARE_GRADIENT_BUILDER(GetLeakyReluGradient) DECLARE_GRADIENT_BUILDER(GetConvTransposeGradient) +DECLARE_GRADIENT_BUILDER(GetResizeGradient) DECLARE_GRADIENT_BUILDER(GetExternalGradient) diff --git a/orttraining/orttraining/core/graph/gradient_builder_registry.cc b/orttraining/orttraining/core/graph/gradient_builder_registry.cc index 4062b5d097394..4b8c68aef078a 100755 --- a/orttraining/orttraining/core/graph/gradient_builder_registry.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_registry.cc @@ -122,6 +122,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() { REGISTER_GRADIENT_BUILDER("Reciprocal", GetReciprocalGradient); REGISTER_GRADIENT_BUILDER("LeakyRelu", GetLeakyReluGradient); REGISTER_GRADIENT_BUILDER("ConvTranspose", 
GetConvTransposeGradient); + REGISTER_GRADIENT_BUILDER("Resize", GetResizeGradient); REGISTER_GRADIENT_BUILDER("ExternalGradient", GetExternalGradient); }; diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index cfc79455c43ed..c90acfdb7bb78 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -5001,6 +5001,26 @@ Return true if all elements are true and false otherwise. "T", {"tensor(float16)", "tensor(float)", "tensor(double)"}, "Constrain input and output types to float tensors."); + + ONNX_CONTRIB_OPERATOR_SCHEMA(ResizeGrad) + .SetDomain(kMSDomain) + .SinceVersion(1) + .Input(0, "dY", "Gradient of output Y.", "T") + .Input(1, "X", "Input tensor to the Resize operator.", "T") + .Input(2, "roi", "The roi input to the Resize operator.", "T", OpSchema::Optional) + .Input(3, "scales", "The scales input to the Resize operator.", "tensor(float)", OpSchema::Optional) + .Output(0, "dX", "Gradient of the input X.", "T") + .AllowUncheckedAttributes() + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 1, 0); + if (hasInputShape(ctx, 1)) { + propagateShapeFromInputToOutput(ctx, 1, 0); + } + }); } } // namespace training diff --git a/orttraining/orttraining/python/training/optim/_ds_code_store.py b/orttraining/orttraining/python/training/optim/_ds_code_store.py new file mode 100644 index 0000000000000..dc1e20bc3dcff --- /dev/null +++ b/orttraining/orttraining/python/training/optim/_ds_code_store.py @@ -0,0 +1,81 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# +# Copyright 2020 The Microsoft DeepSpeed Team +# +# !!!IMPORTANT: This file is a copy of the original one in DeepSpeed repo at given version, +# It is used to compare with the source code of current installed DeepSpeed during runtime. +# Please don't modify it or do any code formatting for it. +# 'orttraining/orttraining/python/training/optim/_ds_code_store.py' is removed from lintrunner config by intention. +# -------------------------------------------------------------------------- + +# Wrap code in this to make sure the indentation is correct compared with raw DeepSpeed. + +class Stage1And2_DeepSpeedZeroOptimizer_0_9_2: + + def has_overflow_serial(self, params, is_grad_list=False): + for p in params: + if p.grad is not None and self._has_inf_or_nan(p.grad.data): + return True + + return False + + + def get_grad_norm_direct(self, gradients, params, norm_type=2): + """Clips gradient norm of an iterable of parameters. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the parameters (viewed as a single vector). 
+ """ + norm_type = float(norm_type) + if norm_type == inf: + total_norm = max(g.data.abs().max() for g in gradients) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group) + + # Take max across all GPUs. + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) + total_norm = total_norm_cuda[0].item() + else: + total_norm = 0.0 + # if dist.get_rank() == 0: + # logger.info(f"Total Norm beginning {total_norm}") + for g, p in zip(gradients, params): + # Pipeline parallelism may replicate parameters. Avoid multi-counting. + if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: + continue + if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): + param_norm = g.data.double().norm(2) + total_norm += param_norm.item()**2 + # Sum across all model parallel GPUs. + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) + + self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) + + total_norm = total_norm_cuda[0].item()**(1. / norm_type) + + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: + total_norm = -1 + + return total_norm + + + def has_overflow_partitioned_grads_serial(self): + for i in range(len(self.bit16_groups)): + for j, grad in enumerate(self.averaged_gradients[i]): + if grad is not None and self._has_inf_or_nan(grad.data, j): + return True + return False diff --git a/orttraining/orttraining/python/training/optim/_ds_modifier.py b/orttraining/orttraining/python/training/optim/_ds_modifier.py index 6b1c98cc02a52..20f4f814e5476 100644 --- a/orttraining/orttraining/python/training/optim/_ds_modifier.py +++ b/orttraining/orttraining/python/training/optim/_ds_modifier.py @@ -10,6 +10,9 @@ # - has_overflow_partitioned_grads_serial : https://github.com/microsoft/DeepSpeed/blob/d8e9ef6f99e27bb95e10bd146d145b3372b4cfda/deepspeed/runtime/zero/stage2.py#L1799 # -------------------------------------------------------------------------- +from __future__ import annotations + +import inspect import types import warnings @@ -17,12 +20,69 @@ from numpy import inf from packaging.version import Version +from ._ds_code_store import Stage1And2_DeepSpeedZeroOptimizer_0_9_2 from ._modifier import FP16OptimizerModifier, check_overflow, check_overflow_for_grads from ._multi_tensor_apply import MultiTensorApply multi_tensor_applier = MultiTensorApply(2048 * 32) +def _get_normalized_str(function) -> str: + return inspect.getsource(function) + + +def _dynamic_checks(cur_ds_version: Version, optimizer) -> bool: + _functions_to_override = ["has_overflow_serial", "get_grad_norm_direct", "has_overflow_partitioned_grads_serial"] + + _version_to_source_code_map = {"0.9.2": Stage1And2_DeepSpeedZeroOptimizer_0_9_2} + + # Try to find the biggest version that is smaller than or equal to cur_ds_version. + # then compare the source code (in case the found version is the latest version supported); + # If current code does not match the found version, return False, and raise a warning to + # add the new version to the list. 
+ versions = [Version(v) for v in _version_to_source_code_map] + sorted_versions = sorted(versions, reverse=True) + version_to_compare = None + for sv in sorted_versions: + if cur_ds_version >= sv: + version_to_compare = sv + break + + if version_to_compare is None: + warnings.warn( + "Unable to find a DeepSpeed version that is smaller than or equal to the current version " + f"{cur_ds_version}. Skip modifying optimizer.", + UserWarning, + ) + return False + + v_optimizer_cls = _version_to_source_code_map[str(version_to_compare)] + all_match = True + for func_name in _functions_to_override: + if not getattr(optimizer, func_name): + warnings.warn( + f"DeepSpeed function {func_name} is not found in optimizer. Skip modifying optimizer.", UserWarning + ) + all_match = False + cur_code_str = _get_normalized_str(getattr(optimizer, func_name)) + v_code_str = _get_normalized_str(getattr(v_optimizer_cls, func_name)) + if cur_code_str != v_code_str: + warnings.warn( + f"DeepSpeed function {func_name} has changed after version {version_to_compare}. " + f"Please append new version {cur_ds_version} in _version_to_source_code_map and _ds_code_store.py.\n" + f"---[{func_name}] Old Source Code Start----\n" + f"{v_code_str}\n" + f"---{func_name} Old Source Code End----\n" + f"---[{func_name}] New Source Code Start----\n" + f"{cur_code_str}\n" + f"---{func_name} New Source Code End----", + UserWarning, + ) + all_match = False + + return all_match + + class DeepSpeedZeROModifier(FP16OptimizerModifier): def __init__(self, optimizer, **kwargs) -> None: super().__init__(optimizer) @@ -30,19 +90,32 @@ def __init__(self, optimizer, **kwargs) -> None: def can_be_modified(self): import deepspeed + # Note 1: # This modifier relies on the implementation of has_overflow_serial, get_grad_norm_direct, # and has_overflow_partitioned_grads_serial # in https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage_1_and_2.py. - # Everytime if we want to update this version supporting list to a newer version, - # we need to check if the implementation of these functions are changed. - # An easy way to check is to check the history of this file, if there is no change during the update, + # The minimum version supported is 0.4.0, all versions in between [0.4.0, 0.9.1] + # are manually checked to make sure the implementation of these functions are "logically" not changed. + # The way we did the check is to check the history of this file, if there is no change during the update, # it's safe to update the version supporting list. Otherwise, or the file is moved or renamed, # we need to check the implementation of these functions in detail. + # + # Note 2: + # Since version 0.9.2, we added dynamic source code check, by comparing installed version of code with + # the source code in our code store. If the source code is changed, we will raise a warning to ask user + # to add the new version to the code store. Otherwise, we will override the functions. 
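+ # In short: versions below 0.4.0 are rejected, versions in [0.4.0, 0.9.1] are trusted based on the
+ # manual verification described in Note 1, and anything newer is only modified if the _dynamic_checks
+ # call below confirms that the installed source of the overridden functions still matches the stored
+ # 0.9.2 copy.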
+ ds_version = Version(deepspeed.__version__) - if ds_version > Version("0.9.1") or ds_version < Version("0.4.0"): + if ds_version < Version("0.4.0"): + warnings.warn( + f"Skip modifying optimizer because of unsupported DeepSpeed version {ds_version}, " + "minimum supported version: 0.4.0, current version", + UserWarning, + ) + return False + if ds_version > Version("0.9.1") and not _dynamic_checks(ds_version, self._optimizer): warnings.warn( - "Skip modifying optimizer because of unsupported DeepSpeed version {}, " - "supported version: 0.4.0 - 0.9.1.".format(deepspeed.__version__), + f"Skip modifying optimizer because of unsupported DeepSpeed version {ds_version}.", UserWarning, ) return False diff --git a/orttraining/orttraining/python/training/optim/_modifier_registry.py b/orttraining/orttraining/python/training/optim/_modifier_registry.py index 4a3a33ecc0513..a88740dac60b7 100644 --- a/orttraining/orttraining/python/training/optim/_modifier_registry.py +++ b/orttraining/orttraining/python/training/optim/_modifier_registry.py @@ -3,13 +3,59 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +from __future__ import annotations + +import warnings +from typing import ClassVar + from ._apex_amp_modifier import ApexAMPModifier from ._ds_modifier import DeepSpeedZeROModifier from ._megatron_modifier import LegacyMegatronLMModifier +from ._modifier import FP16OptimizerModifier + + +class _AccelerateDeepSpeedZeROModifier(DeepSpeedZeROModifier): + """ + Modifier for wrapper of DeepSpeed Optimizer in accelerator. + https://github.com/huggingface/accelerate/blob/7843286f2e1c50735d259fbc0084a7f1c85e00e3/src/accelerate/utils/deepspeed.py#L182C19-L182C19 + """ + + def __init__(self, accelerator_optimizer, **kwargs) -> None: + super().__init__(accelerator_optimizer.optimizer) + + +def get_full_qualified_type_name(o): + klass = o.__class__ + module = klass.__module__ + if module == "builtins": + return klass.__qualname__ + return module + "." 
+ klass.__qualname__ + + +class OptimizerModifierTypeRegistry: + _MAP: ClassVar[dict[str, FP16OptimizerModifier]] = { + "megatron.fp16.fp16.FP16_Optimizer": LegacyMegatronLMModifier, + "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer": DeepSpeedZeROModifier, + "deepspeed.runtime.zero.stage_1_and_2.DeepSpeedZeroOptimizer": DeepSpeedZeROModifier, + "apex.amp.optimizer.unique_name_as_id": ApexAMPModifier, + } + + @staticmethod + def create_modifier(optimizer_full_qualified_name: str, optimizer, **kwargs) -> FP16OptimizerModifier | None: + """Create modifier for optimizer.""" + if optimizer_full_qualified_name in OptimizerModifierTypeRegistry._MAP: + return OptimizerModifierTypeRegistry._MAP[optimizer_full_qualified_name](optimizer, **kwargs) + + if optimizer_full_qualified_name == "accelerate.utils.deepspeed.DeepSpeedOptimizerWrapper": + if ( + hasattr(optimizer, "optimizer") + and get_full_qualified_type_name(optimizer.optimizer) in OptimizerModifierTypeRegistry._MAP + ): + return _AccelerateDeepSpeedZeROModifier(optimizer, **kwargs) -OptimizerModifierTypeRegistry = { - "megatron.fp16.fp16.FP16_Optimizer": LegacyMegatronLMModifier, - "deepspeed.runtime.zero.stage2.FP16_DeepSpeedZeroOptimizer": DeepSpeedZeROModifier, - "deepspeed.runtime.zero.stage_1_and_2.DeepSpeedZeroOptimizer": DeepSpeedZeROModifier, - "apex.amp.optimizer.unique_name_as_id": ApexAMPModifier, -} + warnings.warn( + "Skip modifying optimizer because of optimizer name not found in the registry: " + f"{optimizer_full_qualified_name}", + UserWarning, + ) + return None diff --git a/orttraining/orttraining/python/training/optim/fp16_optimizer.py b/orttraining/orttraining/python/training/optim/fp16_optimizer.py index 2a5dfbc2189d3..fc93eadc32112 100644 --- a/orttraining/orttraining/python/training/optim/fp16_optimizer.py +++ b/orttraining/orttraining/python/training/optim/fp16_optimizer.py @@ -3,9 +3,8 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import warnings -from ._modifier_registry import OptimizerModifierTypeRegistry +from ._modifier_registry import OptimizerModifierTypeRegistry, get_full_qualified_type_name def FP16_Optimizer(optimizer, **kwargs): # noqa: N802 @@ -80,22 +79,13 @@ def FP16_Optimizer(optimizer, **kwargs): # noqa: N802 """ - def get_full_qualified_type_name(o): - if hasattr(optimizer, "_amp_stash"): - return "apex.amp.optimizer.unique_name_as_id" - - klass = o.__class__ - module = klass.__module__ - if module == "builtins": - return klass.__qualname__ - return module + "." 
+ klass.__qualname__ - - optimizer_full_qualified_name = get_full_qualified_type_name(optimizer) - if optimizer_full_qualified_name not in OptimizerModifierTypeRegistry: - warnings.warn("Skip modifying optimizer because of optimizer name not found in registry.", UserWarning) - return optimizer - - modifier = OptimizerModifierTypeRegistry[optimizer_full_qualified_name](optimizer, **kwargs) - modifier.apply() + optimizer_full_qualified_name = ( + "apex.amp.optimizer.unique_name_as_id" + if hasattr(optimizer, "_amp_stash") + else get_full_qualified_type_name(optimizer) + ) + modifier = OptimizerModifierTypeRegistry.create_modifier(optimizer_full_qualified_name, optimizer, **kwargs) + if modifier is not None: + modifier.apply() return optimizer diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py index b9318033a3d53..dd32e2aced561 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py @@ -245,6 +245,8 @@ def _process_inplace_outputs( if not copied: # Only need a copy once. + # Inplace copy only happens for non-leaf variables, so we have to set requires_grad to False. + raw_input_tensor.requires_grad = False raw_input_tensor.copy_(all_outputs_of_kernel_run[output_index]) _log_warning( f"{log_prefix}Copy output tensor {output_index} to raw input tensor {raw_tensor_input_index}. " @@ -449,7 +451,8 @@ def call_python_forward_function( try: func_name = func_name.decode("utf-8") if isinstance(func_name, bytes) else func_name # If this is the first time run, collect runtime tensor reuse mapping. - if kernel_invoke_id not in _GlobalOpKernelInfoMap: + is_first_time_run = kernel_invoke_id not in _GlobalOpKernelInfoMap + if is_first_time_run: kernel_info = CustomFuncOpKernelInfo(kernel_invoke_id) _GlobalOpKernelInfoMap[kernel_invoke_id] = kernel_info @@ -473,6 +476,11 @@ def call_python_forward_function( if tensor_input_index in inplace_map: raw_input_tensors_used_inplace[tensor_input_index] = wrapped_arg + # Only requires gradient when running under training mode + # and the associated tensor has grad_flag=True (i.e., + # "requires_grad=True" in the original PyTorch script). + wrapped_arg.requires_grad = is_training_mode and grad_flag + # Note1: # If it's first-time kernel invocation, tensor_input_indices_to_save_in_ctx is None, we do the # copy for all tensors. Otherwise, we only copy the tensors whose indices are in @@ -480,29 +488,30 @@ def call_python_forward_function( # Note2: # For inference mode, we don't need to do the copy because ctx will be None, # so nothing will be saved for ctx. - if is_training_mode and ( - tensor_input_indices_to_save_in_ctx is None - or tensor_input_index in tensor_input_indices_to_save_in_ctx - ): - wrapped_arg = wrapped_arg.detach().clone() - - # Only requires gradient when running under training mode - # and the associated tensor has grad_flag=True (i.e., - # "requires_grad=True" in the original PyTorch script). - wrapped_arg.requires_grad = is_training_mode and grad_flag - # Note3: - # If it's not first-time kernel invocation, tensor_input_indices_for_mark_dirty is None, we do the - # mul for all tensors. Otherwise, we only mul by one for the tensors whose indices are in - # tensor_input_indices_for_mark_dirty. 
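Editorial note (not part of the patch): the `_modifier_registry.py` / `fp16_optimizer.py` changes above replace the plain registry dict with `OptimizerModifierTypeRegistry.create_modifier`, which also unwraps accelerate's `DeepSpeedOptimizerWrapper`. A minimal, hedged sketch of how that path is exercised from user code; the import path and the pre-built DeepSpeed optimizer are assumptions, not taken from this patch:

```python
# Illustrative sketch only. Assumes `ds_optimizer` is a DeepSpeed ZeRO stage-1/2
# optimizer (e.g. returned by deepspeed.initialize), or an accelerate
# DeepSpeedOptimizerWrapper around one.
from onnxruntime.training.optim import FP16_Optimizer


def wrap_for_ortmodule(ds_optimizer):
    # FP16_Optimizer resolves the modifier by the optimizer's fully qualified
    # type name (e.g. "deepspeed.runtime.zero.stage_1_and_2.DeepSpeedZeroOptimizer");
    # accelerate wrappers are unwrapped via _AccelerateDeepSpeedZeROModifier.
    return FP16_Optimizer(ds_optimizer)
```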
- if is_training_mode and ( - tensor_input_indices_for_mark_dirty is None - or tensor_input_index in tensor_input_indices_for_mark_dirty - ): - # To fix this issue: - # "a leaf Variable that requires grad has been used in an in-place operation." - with torch.set_grad_enabled(True): - wrapped_arg = wrapped_arg.clone() + # To fix this issue: + # "a leaf Variable that requires grad has been used in an in-place operation." + # If it's first-time kernel invocation, tensor_input_indices_for_mark_dirty is None, we do the + # copy for all tensors to generate grad for it. Otherwise, we only clone (to generate grad) for + # the tensors whose indices are in tensor_input_indices_for_mark_dirty. + if is_training_mode: + if is_first_time_run: + with torch.set_grad_enabled(True): + wrapped_arg = wrapped_arg.clone() + else: + is_input_index_saved_in_ctx = ( + tensor_input_indices_to_save_in_ctx is None + or tensor_input_index in tensor_input_indices_to_save_in_ctx + ) + is_input_index_marked_dirty = ( + tensor_input_indices_for_mark_dirty is None + or tensor_input_index in tensor_input_indices_for_mark_dirty + ) + if is_input_index_saved_in_ctx or is_input_index_marked_dirty: + # when with grad, the leaf tensor after clone will not be leaf. + with torch.set_grad_enabled(is_input_index_marked_dirty): + wrapped_arg = wrapped_arg.clone() + wrapped_arg.requires_grad = is_training_mode and grad_flag wrapped_args.append(wrapped_arg) input_tensors_used_for_fw_run[tensor_input_index] = wrapped_arg diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py index 156c3e001d88f..77317242727b4 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py @@ -271,8 +271,3 @@ def upsample_nearest2d_gradient(): @register_gradient("org.pytorch.aten", "ATen", "upsample_nearest3d", "vec") def upsample_nearest3d_gradient(): return _upsample_gradient("upsample_nearest3d_backward", 3) - - -@register_gradient("org.pytorch.aten", "ATen", "upsample_bilinear2d", "vec") -def upsample_bilinear2d_gradient(): - return _upsample_gradient("upsample_bilinear2d_backward", 2) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index 64c7abe1c9386..6e694dcdf2e39 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -808,16 +808,3 @@ def upsample_nearest2d(g, input, output_size, scale_factors): @register_symbolic("upsample_nearest3d") def upsample_nearest3d(g, input, output_size, scale_factors): return _upsample_nearest(g, input, output_size, scale_factors, "upsample_nearest3d") - - -@register_symbolic("upsample_bilinear2d") -def upsample_bilinear2d(g, input, output_size, align_corners, scale_factors): - return g.op( - "org.pytorch.aten::ATen", - input, - output_size, - align_corners, - scale_factors, - operator_s="upsample_bilinear2d", - overload_name_s="vec", - ) diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 597801f4030c1..890a1bbccbc92 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ 
b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -3298,6 +3298,41 @@ TEST(GradientCheckerTest, ConvTransposeGrad) { execution_providers.push_back(DefaultCudaExecutionProvider()); ConvTransposeGradientCheckerTest(&execution_providers); } + +// TODO: Enable test for ROCM +TEST(GradientCheckerTest, ResizeGrad) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + const std::vector attributes = { + MakeAttribute("coordinate_transformation_mode", "half_pixel"), + MakeAttribute("cubic_coeff_a", -0.75f), + MakeAttribute("exclude_outside", static_cast(0)), + MakeAttribute("extrapolation_value", 0.0f), + MakeAttribute("mode", "linear"), + MakeAttribute("nearest_mode", "floor")}; + + float max_error; + GradientChecker gradient_checker; + OpDef op_def{"Resize", kOnnxDomain, 18}; + + TensorInfo x_info({1, 2, 4, 4}, true); + TensorInfo roi_info({4}, false, nullptr, DataTypeImpl::GetTensorType()); + TensorInfo scales_info({4}, false, nullptr, DataTypeImpl::GetTensorType()); + + TensorInfo y_info({1, 2, 8, 8}, true); + + std::vector> x_datas = {{0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f, + 0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f, + 0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f, + 0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f}, + {1.0f, 1.0f, 1.0f, 1.0f}, + {1.0f, 1.0f, 2.0f, 2.0f}}; + + ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, roi_info, scales_info}, + {y_info}, &max_error, x_datas, attributes, true, false, &execution_providers)); + EXPECT_IS_TINY(max_error); +} + #endif // USE_CUDA } // namespace test diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 643d47b0d043e..c8ec2e52f3078 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -1773,13 +1773,17 @@ def run_step(model, input): _test_helpers.assert_values_are_close(ort_input.grad, pt_input.grad) -def test_aten_upsample_bilinear(): +@pytest.mark.parametrize("interpolate_size_scale", ({"size": (8, 12)}, {"scale_factor": 4.7})) +@pytest.mark.parametrize("align_corners", (True, False)) +def test_resize_grad_correctness_bilinear_2d(interpolate_size_scale, align_corners): class _NeuralNetUpsampleBilinear(torch.nn.Module): def __init__(self): super().__init__() def forward(self, input): - return torch.nn.functional.interpolate(input, size=(8, 12), mode="bilinear") + return torch.nn.functional.interpolate( + input, align_corners=align_corners, mode="bilinear", **interpolate_size_scale + ) device = "cuda" pt_model = _NeuralNetUpsampleBilinear().to(device) diff --git a/orttraining/orttraining/test/training_ops/cuda/resize_grad_test.cc b/orttraining/orttraining/test/training_ops/cuda/resize_grad_test.cc new file mode 100644 index 0000000000000..8fc13af8816be --- /dev/null +++ b/orttraining/orttraining/test/training_ops/cuda/resize_grad_test.cc @@ -0,0 +1,227 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
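Editorial note (not part of the patch): the parametrized `test_resize_grad_correctness_bilinear_2d` test above compares ORTModule against stock PyTorch autograd for bilinear interpolation. A minimal sketch of that reference computation; the shapes and the `sum()` loss are illustrative:

```python
# PyTorch reference for the bilinear resize backward covered by the new tests.
import torch


def reference_bilinear_grad(size=(8, 12), align_corners=False):
    x = torch.randn(1, 2, 4, 4, requires_grad=True)
    y = torch.nn.functional.interpolate(x, size=size, mode="bilinear", align_corners=align_corners)
    y.sum().backward()  # dY of all ones, matching the OpTester cases in this patch
    return y.detach(), x.grad  # x.grad is what the ResizeGrad kernel must reproduce
```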
+ +#include "test/providers/compare_provider_test_utils.h" +#include "test/providers/provider_test_utils.h" +#include "test/util/include/default_providers.h" + +namespace onnxruntime::test { + +#if defined(USE_CUDA) || defined(USE_ROCM) + +namespace { + +void AddResizeGradAttributes(OpTester& test, const std::string& coordinate_transformation_mode) { + test.AddAttribute("mode", "linear"); + test.AddAttribute("coordinate_transformation_mode", coordinate_transformation_mode); +} + +} // namespace + +TEST(ResizeGradTest, ResizeGradWithSizes) { + std::vector> providers; +#ifdef USE_CUDA + providers.emplace_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + providers.emplace_back(DefaultRocmExecutionProvider()); +#endif + + OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain); + + AddResizeGradAttributes(test, "half_pixel"); + + std::vector dY(128, 1.0f); + std::vector dY_shape = {1, 2, 8, 8}; + + std::vector X(32, 1.0f); + std::vector X_shape = {1, 2, 4, 4}; + + std::vector dX(32, 4.0f); + std::vector dX_shape = X_shape; + + test.AddInput("dY", dY_shape, dY); + test.AddInput("X", X_shape, X); + + test.AddOutput("dX", dX_shape, dX); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers); +} + +TEST(ResizeGradTest, ResizeGradWithSizesHalf) { + std::vector> providers; +#ifdef USE_CUDA + providers.emplace_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + providers.emplace_back(DefaultRocmExecutionProvider()); +#endif + + OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain); + + AddResizeGradAttributes(test, "half_pixel"); + + std::vector dY(128, 1.0f); + std::vector dY_half(dY.size()); + ConvertFloatToMLFloat16(dY.data(), dY_half.data(), static_cast(dY.size())); + std::vector dY_shape = {1, 2, 8, 8}; + + std::vector X(32, 1.0f); + std::vector X_half(X.size()); + ConvertFloatToMLFloat16(X.data(), X_half.data(), static_cast(X.size())); + std::vector X_shape = {1, 2, 4, 4}; + + std::vector dX(32, 4.0f); + std::vector dX_half(dX.size()); + ConvertFloatToMLFloat16(dX.data(), dX_half.data(), static_cast(dX.size())); + std::vector dX_shape = X_shape; + + test.AddInput("dY", dY_shape, dY_half); + test.AddInput("X", X_shape, X_half); + + test.AddOutput("dX", dX_shape, dX_half); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers); +} + +TEST(ResizeGradTest, ResizeGradWithSizesAndAlignCorners) { + std::vector> providers; +#ifdef USE_CUDA + providers.emplace_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + providers.emplace_back(DefaultRocmExecutionProvider()); +#endif + + OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain); + + AddResizeGradAttributes(test, "align_corners"); + + std::vector dY(128, 1.0f); + std::vector dY_shape = {1, 2, 8, 8}; + + std::vector X(32, 1.0f); + std::vector X_shape = {1, 2, 4, 4}; + + std::vector dX({2.9388f, 3.9184f, 3.9184f, 2.9388f, 3.9184f, 5.2245f, 5.2245f, 3.9184f, + 3.9184f, 5.2245f, 5.2245f, 3.9184f, 2.9388f, 3.9184f, 3.9184f, 2.9388f, + 2.9388f, 3.9184f, 3.9184f, 2.9388f, 3.9184f, 5.2245f, 5.2245f, 3.9184f, + 3.9184f, 5.2245f, 5.2245f, 3.9184f, 2.9388f, 3.9184f, 3.9184f, 2.9388f}); + std::vector dX_shape = X_shape; + + test.AddInput("dY", dY_shape, dY); + test.AddInput("X", X_shape, X); + + test.AddOutput("dX", dX_shape, dX); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers); +} + +TEST(ResizeGradTest, ResizeGradWithScales) { + std::vector> providers; +#ifdef USE_CUDA + providers.emplace_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + 
providers.emplace_back(DefaultRocmExecutionProvider()); +#endif + + OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain); + + AddResizeGradAttributes(test, "half_pixel"); + + std::vector dY(72, 1.0f); + std::vector dY_shape = {1, 2, 6, 6}; + + std::vector X(32, 1.0f); + std::vector X_shape = {1, 2, 4, 4}; + + std::vector dX({2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f, + 2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f, + 2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f, + 2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f}); + std::vector dX_shape = X_shape; + + test.AddInput("dY", dY_shape, dY); + test.AddInput("X", X_shape, X); + test.AddInput("", {0}, {}); + test.AddInput("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f}); + + test.AddOutput("dX", dX_shape, dX); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers); +} + +TEST(ResizeGradTest, ResizeGradWithScalesHalf) { + std::vector> providers; +#ifdef USE_CUDA + providers.emplace_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + providers.emplace_back(DefaultRocmExecutionProvider()); +#endif + + OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain); + + AddResizeGradAttributes(test, "half_pixel"); + + std::vector dY(72, 1.0f); + std::vector dY_half(dY.size()); + ConvertFloatToMLFloat16(dY.data(), dY_half.data(), static_cast(dY.size())); + std::vector dY_shape = {1, 2, 6, 6}; + + std::vector X(32, 1.0f); + std::vector X_half(X.size()); + ConvertFloatToMLFloat16(X.data(), X_half.data(), static_cast(X.size())); + std::vector X_shape = {1, 2, 4, 4}; + + std::vector dX({2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f, + 2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f, + 2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f, + 2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f}); + std::vector dX_half(dX.size()); + ConvertFloatToMLFloat16(dX.data(), dX_half.data(), static_cast(dX.size())); + std::vector dX_shape = X_shape; + + test.AddInput("dY", dY_shape, dY_half); + test.AddInput("X", X_shape, X_half); + test.AddInput("", {0}, {}); + test.AddInput("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f}); + + test.AddOutput("dX", dX_shape, dX_half); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers); +} + +TEST(ResizeGradTest, ResizeGradWithScalesAndAlignCorners) { + std::vector> providers; +#ifdef USE_CUDA + providers.emplace_back(DefaultCudaExecutionProvider()); +#elif USE_ROCM + providers.emplace_back(DefaultRocmExecutionProvider()); +#endif + + OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain); + + AddResizeGradAttributes(test, "align_corners"); + + std::vector dY(72, 1.0f); + std::vector dY_shape = {1, 2, 6, 6}; + + std::vector X(32, 1.0f); + std::vector X_shape = {1, 2, 4, 4}; + + std::vector dX({1.9600f, 2.2400f, 2.2400f, 1.9600f, 2.2400f, 2.5600f, 2.5600f, 2.2400f, + 2.2400f, 2.5600f, 2.5600f, 2.2400f, 1.9600f, 2.2400f, 2.2400f, 1.9600f, + 1.9600f, 2.2400f, 2.2400f, 1.9600f, 2.2400f, 2.5600f, 2.5600f, 2.2400f, + 2.2400f, 2.5600f, 2.5600f, 2.2400f, 1.9600f, 2.2400f, 2.2400f, 1.9600f}); + std::vector dX_shape = X_shape; + + test.AddInput("dY", dY_shape, dY); + test.AddInput("X", X_shape, X); + test.AddInput("", {0}, {}); + test.AddInput("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f}); + + test.AddOutput("dX", dX_shape, dX); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers); +} + +#endif // 
defined(USE_CUDA) || defined(USE_ROCM) + +} // namespace onnxruntime::test diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index 8e61dbee506f2..ae4f48b6b49a2 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -207,6 +207,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BatchScale); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PadAndUnflatten); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, ScaledSum); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ResizeGrad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, ResizeGrad); // the kernels within the following ifdef are not included in a build with // --enable_training_ops but without --enable_training @@ -453,13 +456,14 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // the kernels within the following ifdef are not included in a build with // --enable_training_ops but without --enable_training #ifdef ENABLE_TRAINING diff --git a/orttraining/orttraining/training_ops/cuda/tensor/resize_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad.cc new file mode 100644 index 0000000000000..a5e8f7cd35d88 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad.cc @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include + +#include "orttraining/training_ops/cuda/tensor/resize_grad.h" +#include "orttraining/training_ops/cuda/tensor/resize_grad_impl.h" + +namespace onnxruntime::cuda { + +#define REGISTER_RESIZEGRAD_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + ResizeGrad, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .InputMemoryType(OrtMemTypeCPUInput, 2) /* Keep roi on CPU */ \ + .InputMemoryType(OrtMemTypeCPUInput, 3) /* Keep scales on CPU */ \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + ResizeGrad); + +REGISTER_RESIZEGRAD_KERNEL_TYPED(MLFloat16) +REGISTER_RESIZEGRAD_KERNEL_TYPED(float) +REGISTER_RESIZEGRAD_KERNEL_TYPED(double) + +template +Status ResizeGrad::ComputeInternal(OpKernelContext* context) const { + typedef typename ToCudaType::MappedType CudaT; + + const Tensor* dY = context->Input(0); + const Tensor* X = context->Input(1); + const Tensor* scales = context->Input(3); + + ORT_ENFORCE(X->Shape().NumDimensions() == 4, "Expected input tensor to have 4 dimensions. 
Actual: ", + X->Shape().NumDimensions()); + + const auto get_scales_from_input = [](const Tensor* scales) { + if (nullptr == scales) { + return std::make_pair(std::optional{}, std::optional{}); + } + + ORT_ENFORCE(scales->Shape().Size() == 4, "There must be a scale for each dimension."); + + const auto* scales_data = scales->Data(); + return std::make_pair(std::optional{scales_data[2]}, std::optional{scales_data[3]}); + }; + + std::pair, std::optional> scale_factors = get_scales_from_input(scales); + + Tensor* dX = context->Output(0, X->Shape()); + + const int64_t batch_size = X->Shape()[0]; + const int64_t num_channels = X->Shape()[1]; + const int64_t output_height = dY->Shape()[2]; + const int64_t output_width = dY->Shape()[3]; + const int64_t input_height = X->Shape()[2]; + const int64_t input_width = X->Shape()[3]; + + if (dX->Shape() == dY->Shape()) { + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dX->MutableDataRaw(), dY->DataRaw(), dY->SizeInBytes(), cudaMemcpyDeviceToDevice)); + return Status::OK(); + } + + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes(), Stream(context))); + + const bool align_corners = coordinate_transform_mode_ == ResizeCoordinateTransformationMode::ALIGN_CORNERS; + const CudaT* dy_data = reinterpret_cast(dY->Data()); + CudaT* dx_data = reinterpret_cast(dX->MutableData()); + + ResizeGradImpl(Stream(context), input_height, input_width, output_height, + output_width, batch_size, num_channels, align_corners, + scale_factors.first, scale_factors.second, + dy_data, dx_data); + + return Status::OK(); +} + +} // namespace onnxruntime::cuda diff --git a/orttraining/orttraining/training_ops/cuda/tensor/resize_grad.h b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad.h new file mode 100644 index 0000000000000..53f8d5f0d71f5 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad.h @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/cpu/tensor/upsamplebase.h" + +namespace onnxruntime::cuda { + +template +class ResizeGrad final : public UpsampleBase, public CudaKernel { + public: + ResizeGrad(const OpKernelInfo& info) : UpsampleBase(info), CudaKernel(info) { + ORT_ENFORCE(!antialias_, "Antialiasing is not supported in ResizeGrad yet."); + + ORT_ENFORCE(axes_.empty(), "ReizeGrad does not support the `axes` attribute yet."); + + std::string coordinate_transform_mode = + info.GetAttrOrDefault("coordinate_transformation_mode", "half_pixel"); + coordinate_transform_mode_ = StringToCoordinateTransformationMode(coordinate_transform_mode); + ORT_ENFORCE(coordinate_transform_mode_ == ResizeCoordinateTransformationMode::HALF_PIXEL || + coordinate_transform_mode_ == ResizeCoordinateTransformationMode::ALIGN_CORNERS, + "ReizeGrad only supports the `HALF_PIXEL` and `ALIGN_CORNERS` coordinate_transform_mode ", + coordinate_transform_mode, " is not supported yet."); + + ORT_ENFORCE(keep_aspect_ratio_policy_ == AspectRatioPolicy::STRETCH, + "ReizeGrad only supports the `STRETCH` policy."); + + std::string mode; + ORT_ENFORCE(info.GetAttr("mode", &mode).IsOK()); + ORT_ENFORCE((UpsampleMode::LINEAR == mode_), + "ReizeGrad only supports the `LINEAR` mode. 
", mode, " mode is not supported yet."); + } + + Status ComputeInternal(OpKernelContext* context) const override; +}; + +} // namespace onnxruntime::cuda diff --git a/orttraining/orttraining/training_ops/cuda/tensor/resize_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad_impl.cu new file mode 100644 index 0000000000000..0507cda62390b --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad_impl.cu @@ -0,0 +1,151 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Contents of this file are derived from the pytorch cuda implementation of +// the upsample_bilinear2d_backward implementation at: +// https://github.com/pytorch/pytorch/blob/ce50132748f652ed6079c3db8008a6817594dbae/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu + +#include "orttraining/training_ops/cuda/tensor/resize_grad_impl.h" +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/atomic/common.cuh" + +namespace onnxruntime::cuda { + +namespace { + +constexpr int NumThreadsPerBlock = GridDim::maxThreadsPerBlock; + +} // namespace + +__device__ __forceinline__ size_t +idx(const size_t nc, + const size_t height, + const size_t width, + const size_t h, + const size_t w) { + return (nc * height + h) * width + w; +} + +template +__device__ __forceinline__ static T AreaPixelComputeSourceIndex( + T scale, + int dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + T src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + return (!cubic && src_idx < static_cast(0)) + ? static_cast(0) + : src_idx; + } +} + +template +__global__ void UpsampleGrad(const int64_t nc, const int64_t input_height, + const int64_t input_width, const int64_t output_height, + const int64_t output_width, const AccT rheight, + const AccT rwidth, const bool align_corners, + const T* dY_data, T* dX_data) { + const size_t dy_numel = nc * output_width * output_height; + const size_t dx_numel = nc * input_width * input_height; + for (size_t index = blockDim.x * blockIdx.x + threadIdx.x; + index < dy_numel; + index += blockDim.x * gridDim.x) { + size_t index_temp = index; + const int w2 = index_temp % output_width; // 0:width2-1 + index_temp /= output_width; + const int h2 = index_temp % output_height; // 0:height2-1 + const size_t nc = index_temp / output_height; + + const AccT h1r = AreaPixelComputeSourceIndex( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < input_height - 1) ? 1 : 0; + const AccT h1lambda = h1r - h1; + const AccT h0lambda = static_cast(1) - h1lambda; + + const AccT w1r = AreaPixelComputeSourceIndex( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < input_width - 1) ? 
1 : 0; + const AccT w1lambda = w1r - w1; + const AccT w0lambda = static_cast(1) - w1lambda; + + const T d2val = dY_data[index]; + AtomicAdd( + dX_data, + idx(nc, input_height, input_width, h1, w1), + dx_numel, + static_cast(h0lambda * w0lambda) * d2val); + AtomicAdd( + dX_data, + idx(nc, input_height, input_width, h1, w1 + w1p), + dx_numel, + static_cast(h0lambda * w1lambda) * d2val); + AtomicAdd( + dX_data, + idx(nc, input_height, input_width, h1 + h1p, w1), + dx_numel, + static_cast(h1lambda * w0lambda) * d2val); + AtomicAdd( + dX_data, + idx(nc, input_height, input_width, h1 + h1p, w1 + w1p), + dx_numel, + static_cast(h1lambda * w1lambda) * d2val); + } +} + +template +T AreaPixelComputeScale(int64_t input_size, int64_t output_size, bool align_corners, + const std::optional& scale) { + if (align_corners) { + if (output_size <= 1) { + return T{0}; + } + return static_cast(input_size - 1) / static_cast(output_size - 1); + } else { + if (scale.has_value()) { + return static_cast(T{1.0} / *scale); + } else { + return static_cast(input_size) / static_cast(output_size); + } + } +} + +template +void ResizeGradImpl(cudaStream_t stream, int64_t input_height, + int64_t input_width, int64_t output_height, + int64_t output_width, int64_t batch_size, + int64_t channels, bool align_corners, + const std::optional& scale_height, + const std::optional& scale_width, + const T* dY_data, T* dX_data) { + float rheight = AreaPixelComputeScale(input_height, output_height, align_corners, scale_height); + float rwidth = AreaPixelComputeScale(input_width, output_width, align_corners, scale_width); + + const size_t output_numel = batch_size * channels * output_height * output_width; + int blocks_per_grid = (int)(ceil(static_cast(output_numel) / NumThreadsPerBlock)); + UpsampleGrad<<>>( + batch_size * channels, input_height, input_width, output_height, output_width, + rheight, rwidth, align_corners, dY_data, dX_data); +} + +#define SPECIALIZED_RESIZEGRAD_IMPL(T) \ + template void ResizeGradImpl(cudaStream_t stream, int64_t input_height, \ + int64_t input_width, int64_t output_height, \ + int64_t output_width, int64_t batch_size, \ + int64_t channels, bool align_corners, \ + const std::optional& scale_height, \ + const std::optional& scale_width, \ + const T* dY_data, T* dX_data); + +SPECIALIZED_RESIZEGRAD_IMPL(half) +SPECIALIZED_RESIZEGRAD_IMPL(float) +SPECIALIZED_RESIZEGRAD_IMPL(double) + +#undef SPECIALIZED_RESIZEGRAD_IMPL + +} // namespace onnxruntime::cuda diff --git a/orttraining/orttraining/training_ops/cuda/tensor/resize_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad_impl.h new file mode 100644 index 0000000000000..3e917f9071e30 --- /dev/null +++ b/orttraining/orttraining/training_ops/cuda/tensor/resize_grad_impl.h @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
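Editorial note (not part of the patch): the `UpsampleGrad` kernel in `resize_grad_impl.cu` above scatters every upstream gradient element into the four bilinear source pixels using the usual lerp weights, with the scale computed by `AreaPixelComputeScale`. A NumPy sketch of the same accumulation, for reference only (function and variable names are illustrative):

```python
# CPU/NumPy analogue of the bilinear backward scatter implemented by UpsampleGrad.
import numpy as np


def area_pixel_scale(in_size, out_size, align_corners, scale=None):
    if align_corners:
        return 0.0 if out_size <= 1 else (in_size - 1) / (out_size - 1)
    return (1.0 / scale) if scale else in_size / out_size


def upsample_bilinear2d_backward(dY, in_h, in_w, align_corners=False, scales=(None, None)):
    n, c, out_h, out_w = dY.shape
    dX = np.zeros((n, c, in_h, in_w), dtype=dY.dtype)
    rh = area_pixel_scale(in_h, out_h, align_corners, scales[0])
    rw = area_pixel_scale(in_w, out_w, align_corners, scales[1])
    for h2 in range(out_h):
        h1r = rh * h2 if align_corners else max(rh * (h2 + 0.5) - 0.5, 0.0)
        h1 = int(h1r)
        h1p = 1 if h1 < in_h - 1 else 0
        hl, hh = h1r - h1, 1.0 - (h1r - h1)
        for w2 in range(out_w):
            w1r = rw * w2 if align_corners else max(rw * (w2 + 0.5) - 0.5, 0.0)
            w1 = int(w1r)
            w1p = 1 if w1 < in_w - 1 else 0
            wl, wh = w1r - w1, 1.0 - (w1r - w1)
            g = dY[:, :, h2, w2]
            dX[:, :, h1, w1] += hh * wh * g
            dX[:, :, h1, w1 + w1p] += hh * wl * g
            dX[:, :, h1 + h1p, w1] += hl * wh * g
            dX[:, :, h1 + h1p, w1 + w1p] += hl * wl * g
    return dX
```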
+ +#pragma once + +#include +#include + +namespace onnxruntime::cuda { + +template +void ResizeGradImpl(cudaStream_t stream, int64_t input_height, + int64_t input_width, int64_t output_height, + int64_t output_width, int64_t batch_size, + int64_t channels, bool align_corners, + const std::optional& scale_height, + const std::optional& scale_width, + const T* dY_data, T* dX_data); + +} // namespace onnxruntime::cuda diff --git a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc index 2321aa23dd6eb..e0749c2fb4d0d 100644 --- a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc +++ b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc @@ -187,6 +187,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_BFloat16, ReduceAllL2); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16_BFloat16, ReduceAllL2); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PadAndUnflatten); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ResizeGrad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, ResizeGrad); #if defined(ORT_USE_NCCL) || defined(USE_MPI) // P2P communication operators. @@ -387,6 +390,9 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // P2P communication operators. 
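Editorial note (not part of the patch): the ResizeGrad kernels registered above implement a `com.microsoft` contrib op whose input and attribute layout can be read off the `OpTester` cases earlier in this patch. A hedged sketch of such a node:

```python
# Illustrative ResizeGrad node; layout mirrors the OpTester cases in this patch.
from onnx import helper

resize_grad = helper.make_node(
    "ResizeGrad",
    inputs=["dY", "X", "", "scales"],  # roi left empty; scales is optional
    outputs=["dX"],
    domain="com.microsoft",
    mode="linear",
    coordinate_transformation_mode="half_pixel",
)
```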
#if defined(ORT_USE_NCCL) || defined(USE_MPI) diff --git a/setup.py b/setup.py index 2eb8f212d730c..b71836e0ee6e4 100644 --- a/setup.py +++ b/setup.py @@ -192,11 +192,15 @@ def run(self): cuda_dependencies = [ "libcublas.so.11", + "libcublas.so.12", "libcublasLt.so.11", - "libcudnn.so.8", + "libcublasLt.so.12", "libcudart.so.11.0", - "libcurand.so.10", + "libcudart.so.12.0", + "libcudnn.so.8", "libcufft.so.10", + "libcufft.so.11", + "libcurand.so.10", ] rocm_dependencies = [ "librccl.so.1", diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index fdd8c09333737..129dbc833a0ab 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -193,7 +193,7 @@ stages: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} stage_name_suffix: gpu - EnvSetupScript: setup_env_cuda_11.bat + EnvSetupScript: setup_env_cuda.bat buildArch: x64 msbuildPlatform: x64 packageName: x64-cuda @@ -488,13 +488,13 @@ stages: Steps: - script: | tools/ci_build/get_docker_image.py \ - --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 \ + --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda \ --context tools/ci_build/github/linux/docker \ - --docker-build-args "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u ) --build-arg BUILD_UID=$( id -u )" \ + --docker-build-args "--network=host --build-arg BASEIMAGE=nvidia/cuda:11.8.0-devel-ubi8 --build-arg INSTALL_CUDNN=true --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 --build-arg BUILD_UID=$( id -u )" \ --container-registry onnxruntimebuildcache \ --multiple_repos \ --repository onnxruntimecuda118xtrt86build - displayName: "Get onnxruntimecuda118xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6" + displayName: "Get onnxruntimecuda118xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda" workingDirectory: $(Build.SourcesDirectory)/onnxruntime ContainerRegistry: onnxruntimebuildcache diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 71a580f348f6f..1d4681d064387 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -58,9 +58,15 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=nvidia/cuda:11.8.0-devel-ubi8 + --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg INSTALL_CUDNN=true + --build-arg BUILD_UID=$( id -u ) + " Repository: onnxruntimecuda11build - task: 
Cache@2 @@ -154,9 +160,15 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg INSTALL_CUDNN=true + --build-arg BUILD_UID=$( id -u ) + " Repository: onnxruntimecuda11build - task: CmdLine@2 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 9450395f3cf79..16d4457c45eb6 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -57,9 +57,15 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg INSTALL_CUDNN=true + --build-arg BUILD_UID=$( id -u ) + " Repository: onnxruntimetensorrt86gpubuild - template: templates/linux-build-step-with-cache.yml diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index d24b0e0539631..2a94499c7a268 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -67,7 +67,7 @@ stages: - template: templates/jobs/win-ci-vs-2022-job.yml parameters: BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda_11.bat + EnvSetupScript: setup_env_cuda.bat buildArch: x64 additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 2161a9205f22d..c8aac6e8b130d 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -30,7 +30,7 @@ stages: - template: templates/py-packaging-linux-test-cpu.yml parameters: arch: 'aarch64' - machine_pool: 'aiinfra-linux-ARM64-CPU-2019' + machine_pool: 'onnxruntime-linux-ARM64-CPU-2019' base_image: 'arm64v8/almalinux:8' devtoolset_rootpath: /opt/rh/gcc-toolset-12/root ld_library_path_arg: 
/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index cf73691a5eecc..9ca4a45ffcec4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.90 + version: 1.0.95 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.90 + version: 1.0.95 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index ca5a52fa61ed3..0c8fb91a24a31 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -1,19 +1,27 @@ parameters: -- name: EnvSetupScript - type: string - -- name: DownloadCUDA - type: boolean - default: false + - name: EnvSetupScript + type: string + - name: DownloadCUDA + type: boolean + default: false + - name: PrimaryCUDAVersion + type: string + default: '11.8' + - name: SecondaryCUDAVersion + type: string + default: '12.2' steps: -- ${{ if eq(parameters.DownloadCUDA, 'true') }}: - - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v11.8" $(Agent.TempDirectory) - -- task: BatchScript@1 - displayName: 'setup env' - inputs: - filename: '$(Build.SourcesDirectory)\tools\ci_build\github\windows\${{ parameters.EnvSetupScript }}' - modifyEnvironment: true - workingFolder: '$(Build.BinariesDirectory)' + - ${{ if eq(parameters.DownloadCUDA, 'true') }}: + - powershell: | + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ parameters.PrimaryCUDAVersion }}" $(Agent.TempDirectory) + displayName: 'Download Primary CUDA SDK v${{ parameters.PrimaryCUDAVersion }}' + - powershell: | + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ parameters.SecondaryCUDAVersion }}" $(Agent.TempDirectory) + displayName: 'Download Secondary CUDA SDK v${{ parameters.SecondaryCUDAVersion }}' + - task: BatchScript@1 + displayName: 'setup env' + inputs: + filename: '$(Build.SourcesDirectory)\tools\ci_build\github\windows\${{ parameters.EnvSetupScript }}' + modifyEnvironment: true + workingFolder: '$(Build.BinariesDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml index 51d3a9ebc2187..1cc5c48c5513c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml @@ -47,7 +47,7 @@ stages: OnnxruntimeCFlags: '-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -O3 -Wl,--strip-all' OnnxruntimeCXXFlags: '-Wp,-D_FORTIFY_SOURCE=2 
-Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -O3 -Wl,--strip-all' OnnxruntimeNodejsBindingArch: 'arm64' - PoolName: 'aiinfra-linux-ARM64-CPU-2019' + PoolName: 'onnxruntime-linux-ARM64-CPU-2019' ArtifactNamePrefix: ${{ parameters.ArtifactNamePrefix }} PackageJava: ${{ parameters.PackageJava }} PackageNodeJS: ${{ parameters.PackageNodeJS }} diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml index 445f739e81c45..0d58f6cee4003 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml @@ -44,9 +44,15 @@ stages: submodules: recursive - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=nvidia/cuda:11.8.0-devel-ubi8 + --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg INSTALL_CUDNN=true + --build-arg BUILD_UID=$( id -u ) + " Repository: onnxruntimecuda118xtrt86build - template: set-version-number-variables-step.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 3d5a71284fa6f..33c82b5e8965a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -36,9 +36,16 @@ jobs: - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u ) --build-arg PLATFORM=${{ parameters.arch }}" + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=nvidia/cuda:11.8.0-devel-ubi8 + --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg INSTALL_CUDNN=true + --build-arg BUILD_UID=$( id -u ) + --build-arg PLATFORM=${{ parameters.arch }} + " Repository: onnxruntimecuda118xtrt86build${{ parameters.arch }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 43ed0172825bc..a70e0c01e52f1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -81,9 +81,16 @@ jobs: - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 + Dockerfile: 
tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker - DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u ) --build-arg PLATFORM=${{ parameters.arch }}" + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=nvidia/cuda:11.8.0-devel-ubi8 + --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg INSTALL_CUDNN=true + --build-arg BUILD_UID=$( id -u ) + --build-arg PLATFORM=${{ parameters.arch }} + " Repository: onnxruntimecuda118xtrt86build${{ parameters.arch }} - task: Bash@3 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 1e28ad08a5bdc..1a67ace5e85fa 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -484,7 +484,7 @@ stages: - template: py-linux.yml parameters: arch: 'aarch64' - machine_pool: 'aiinfra-linux-ARM64-CPU-2019' + machine_pool: 'onnxruntime-linux-ARM64-CPU-2019' base_image: 'arm64v8/almalinux:8' devtoolset_rootpath: /opt/rh/gcc-toolset-12/root ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 diff --git a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml index 84c910ba58787..a5925d16564fe 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml @@ -20,7 +20,7 @@ parameters: default: false - name: TimeoutInMinutes - default: 180 + default: 240 - name: BuildJsep type: boolean diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index 8796917afa37d..ae2a4b4cead3d 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -40,7 +40,7 @@ stages: - template: templates/jobs/win-ci-vs-2022-job.yml parameters: BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda_11.bat + EnvSetupScript: setup_env_cuda.bat buildArch: x64 additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 @@ -57,7 +57,7 @@ stages: - template: templates/jobs/win-ci-vs-2022-job.yml parameters: BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda_11.bat + EnvSetupScript: setup_env_cuda.bat buildArch: x64 additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 msbuildPlatform: x64 @@ -70,30 +70,13 @@ stages: MachinePool: onnxruntime-Win2022-GPU-T4 isTraining: true -- stage: dml - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env.bat - buildArch: x64 - additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml - msbuildPlatform: x64 - isX86: 
false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: DML - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-dml-A10 - - stage: kernelDocumentation dependsOn: [] jobs: - template: templates/jobs/win-ci-vs-2022-job.yml parameters: BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda_11.bat + EnvSetupScript: setup_env_cuda.bat buildArch: x64 # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags additionalBuildFlags: --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml index b5db8a5201405..d0f9772da7adc 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-reduce-op-ci-pipeline.yml @@ -10,7 +10,7 @@ jobs: BuildConfig: 'MinSizeRel' variables: MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' - EnvSetupScript: setup_env_cuda_11.bat + EnvSetupScript: setup_env_cuda.bat buildArch: x64 TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] timeoutInMinutes: 120 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index 4d9c676674a09..7b2cada736488 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -5,11 +5,10 @@ ARG POLICY=manylinux_2_28 ARG PLATFORM=x86_64 ARG BASEIMAGE=nvidia/cuda:12.2.0-devel-ubi8 -ARG TRT_VERSION=8.6.1.6-1.cuda12.0 ARG DEVTOOLSET_ROOTPATH=/usr ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64 ARG PREPEND_PATH=/usr/local/cuda/binet - +ARG INSTALL_CUDNN=false #Build manylinux docker image begin FROM $BASEIMAGE AS runtime_base @@ -118,7 +117,7 @@ RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.10.5 FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.0b5 +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ @@ -155,23 +154,35 @@ CMD ["/bin/bash"] #Build manylinux docker image end -#Install TensorRT 8.6.1.6 -RUN CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p') &&\ - dnf -y install\ - libcudnn8-devel-*cuda${CUDA_VERSION}*\ - libcudnn8-*cuda${CUDA_VERSION}*\ - libnvinfer8-${TRT_VERSION}\ - libnvparsers8-${TRT_VERSION}\ - libnvonnxparsers8-${TRT_VERSION}\ - libnvinfer-plugin8-${TRT_VERSION}\ - libnvinfer-vc-plugin8-${TRT_VERSION}\ - libnvinfer-devel-${TRT_VERSION}\ - libnvparsers-devel-${TRT_VERSION}\ - libnvonnxparsers-devel-${TRT_VERSION}\ - libnvinfer-plugin-devel-${TRT_VERSION}\ - libnvinfer-vc-plugin-devel-${TRT_VERSION}\ - libnvinfer-headers-devel-${TRT_VERSION}\ - libnvinfer-headers-plugin-devel-${TRT_VERSION} + +#Install optinal Cudnn +RUN if [ "$INSTALL_CUDNN" = true ]; then \ + CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p') && \ + dnf -y install \ + libcudnn8-devel-*cuda${CUDA_VERSION}* \ + libcudnn8-*cuda${CUDA_VERSION}* ; 
\ +fi + +#Install TensorRT only if TRT_VERSION is not empty +RUN if [ -n "$TRT_VERSION" ]; then \ + echo "TRT_VERSION is $TRT_VERSION" && \ + dnf -y install \ + libnvinfer8-${TRT_VERSION} \ + libnvparsers8-${TRT_VERSION} \ + libnvonnxparsers8-${TRT_VERSION} \ + libnvinfer-plugin8-${TRT_VERSION} \ + libnvinfer-vc-plugin8-${TRT_VERSION} \ + libnvinfer-devel-${TRT_VERSION} \ + libnvparsers-devel-${TRT_VERSION} \ + libnvonnxparsers-devel-${TRT_VERSION} \ + libnvinfer-plugin-devel-${TRT_VERSION} \ + libnvinfer-vc-plugin-devel-${TRT_VERSION} \ + libnvinfer-headers-devel-${TRT_VERSION} \ + libnvinfer-headers-plugin-devel-${TRT_VERSION}; \ +else \ + echo "TRT_VERSION is none skipping Tensor RT Installation" ; \ +fi + ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 #Add our own dependencies ADD scripts /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 deleted file mode 100644 index 933b0211b0e6c..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 +++ /dev/null @@ -1,166 +0,0 @@ -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG POLICY=manylinux_2_28 -ARG PLATFORM=x86_64 -ARG DEVTOOLSET_ROOTPATH= -ARG LD_LIBRARY_PATH_ARG= -ARG PREPEND_PATH= - -#We need both CUDA and manylinux. But the CUDA Toolkit End User License Agreement says NVIDIA CUDA Driver Libraries(libcuda.so, libnvidia-ptxjitcompiler.so) are only distributable in applications that meet this criteria: -#1. The application was developed starting from a NVIDIA CUDA container obtained from Docker Hub or the NVIDIA GPU Cloud, and -#2. The resulting application is packaged as a Docker container and distributed to users on Docker Hub or the NVIDIA GPU Cloud only. -#So we use CUDA as the base image then add manylinux on top of it. 
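Editorial note (not part of the patch): the per-CUDA-version dockerfiles deleted here are superseded by the parameterized `Dockerfile.manylinux2_28_cuda`, which the updated pipelines build by passing `BASEIMAGE`, `TRT_VERSION`, and `INSTALL_CUDNN` explicitly. A sketch of that invocation; the argument values mirror the YAML changes above, while `BUILD_UID=1001` and the omission of `--container-registry`/`--multiple_repos` are simplifications:

```python
# How the updated pipelines build the consolidated CUDA image (simplified).
import subprocess

docker_build_args = " ".join(
    [
        "--network=host",
        "--build-arg BASEIMAGE=nvidia/cuda:11.8.0-devel-ubi8",
        "--build-arg TRT_VERSION=8.6.1.6-1.cuda11.8",
        "--build-arg INSTALL_CUDNN=true",
        "--build-arg BUILD_UID=1001",
    ]
)

subprocess.run(
    [
        "python",
        "tools/ci_build/get_docker_image.py",
        "--dockerfile", "tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda",
        "--context", "tools/ci_build/github/linux/docker",
        "--docker-build-args", docker_build_args,
        "--repository", "onnxruntimecuda118xtrt86build",
    ],
    check=True,
)
```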
- -#Build manylinux2014 docker image begin -FROM $BASEIMAGE AS runtime_base -ARG POLICY -ARG PLATFORM -ARG DEVTOOLSET_ROOTPATH -ARG LD_LIBRARY_PATH_ARG -ARG PREPEND_PATH -LABEL maintainer="The ManyLinux project" - -ENV AUDITWHEEL_POLICY=${POLICY} AUDITWHEEL_ARCH=${PLATFORM} AUDITWHEEL_PLAT=${POLICY}_${PLATFORM} -ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 LANGUAGE=en_US.UTF-8 -ENV DEVTOOLSET_ROOTPATH=${DEVTOOLSET_ROOTPATH} -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG} -ENV PATH=${PREPEND_PATH}${PATH} -ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig - -# first copy the fixup mirrors script, keep the script around -COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors - -# setup entrypoint, this will wrap commands with `linux32` with i686 images -COPY build_scripts/install-entrypoint.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ - -RUN /build_scripts/install-entrypoint.sh && rm -rf /build_scripts -COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint -ENTRYPOINT ["manylinux-entrypoint"] - -COPY build_scripts/install-runtime-packages.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ - -COPY build_scripts/build_utils.sh /build_scripts/ - -COPY build_scripts/install-autoconf.sh /build_scripts/ -RUN export AUTOCONF_ROOT=autoconf-2.71 && \ - export AUTOCONF_HASH=431075ad0bf529ef13cb41e9042c542381103e80015686222b8a9d4abef42a1c && \ - export AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf && \ - manylinux-entrypoint /build_scripts/install-autoconf.sh - -COPY build_scripts/install-automake.sh /build_scripts/ -RUN export AUTOMAKE_ROOT=automake-1.16.5 && \ - export AUTOMAKE_HASH=07bd24ad08a64bc17250ce09ec56e921d6343903943e99ccf63bbf0705e34605 && \ - export AUTOMAKE_DOWNLOAD_URL=http://ftp.gnu.org/gnu/automake && \ - manylinux-entrypoint /build_scripts/install-automake.sh - -COPY build_scripts/install-libtool.sh /build_scripts/ -RUN export LIBTOOL_ROOT=libtool-2.4.7 && \ - export LIBTOOL_HASH=04e96c2404ea70c590c546eba4202a4e12722c640016c12b9b2f1ce3d481e9a8 && \ - export LIBTOOL_DOWNLOAD_URL=http://ftp.gnu.org/gnu/libtool && \ - manylinux-entrypoint /build_scripts/install-libtool.sh - -COPY build_scripts/install-libxcrypt.sh /build_scripts/ -RUN export LIBXCRYPT_VERSION=4.4.28 && \ - export LIBXCRYPT_HASH=db7e37901969cb1d1e8020cb73a991ef81e48e31ea5b76a101862c806426b457 && \ - export LIBXCRYPT_DOWNLOAD_URL=https://github.com/besser82/libxcrypt/archive && \ - export PERL_ROOT=perl-5.34.0 && \ - export PERL_HASH=551efc818b968b05216024fb0b727ef2ad4c100f8cb6b43fab615fa78ae5be9a && \ - export PERL_DOWNLOAD_URL=https://www.cpan.org/src/5.0 && \ - manylinux-entrypoint /build_scripts/install-libxcrypt.sh - -FROM runtime_base AS build_base -COPY build_scripts/install-build-packages.sh /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-build-packages.sh - - -FROM build_base AS build_git -COPY build_scripts/build-git.sh /build_scripts/ -RUN export GIT_ROOT=git-2.36.2 && \ - export GIT_HASH=6dc2cdea5fb23d823ba4871cc23222c1db31dfbb6d6c6ff74c4128700df57c68 && \ - export GIT_DOWNLOAD_URL=https://www.kernel.org/pub/software/scm/git && \ - manylinux-entrypoint /build_scripts/build-git.sh - - -FROM build_base AS build_cpython -COPY build_scripts/build-sqlite3.sh /build_scripts/ -RUN export SQLITE_AUTOCONF_ROOT=sqlite-autoconf-3390200 && \ - export SQLITE_AUTOCONF_HASH=852be8a6183a17ba47cee0bbff7400b7aa5affd283bf3beefc34fcd088a239de && \ - export 
SQLITE_AUTOCONF_DOWNLOAD_URL=https://www.sqlite.org/2022 && \ - manylinux-entrypoint /build_scripts/build-sqlite3.sh - -COPY build_scripts/build-openssl.sh /build_scripts/ -RUN export OPENSSL_ROOT=openssl-1.1.1q && \ - export OPENSSL_HASH=d7939ce614029cdff0b6c20f0e2e5703158a489a72b2507b8bd51bf8c8fd10ca && \ - export OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source && \ - manylinux-entrypoint /build_scripts/build-openssl.sh - -COPY build_scripts/build-cpython.sh /build_scripts/ - - -FROM build_cpython AS build_cpython38 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.8.13 - - -FROM build_cpython AS build_cpython39 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.9.13 - - -FROM build_cpython AS build_cpython310 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.10.5 - -FROM build_cpython AS build_cpython311 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 - -FROM build_cpython AS all_python -COPY build_scripts/install-pypy.sh \ - build_scripts/pypy.sha256 \ - build_scripts/finalize-python.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.8 7.3.9 -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.9 7.3.9 -COPY --from=build_cpython38 /opt/_internal /opt/_internal/ -COPY --from=build_cpython39 /opt/_internal /opt/_internal/ -COPY --from=build_cpython310 /opt/_internal /opt/_internal/ -COPY --from=build_cpython311 /opt/_internal /opt/_internal/ -RUN manylinux-entrypoint /build_scripts/finalize-python.sh - - -FROM runtime_base -COPY --from=build_git /manylinux-rootfs / -COPY --from=build_cpython /manylinux-rootfs / -COPY --from=all_python /opt/_internal /opt/_internal/ -COPY build_scripts/finalize.sh \ - build_scripts/python-tag-abi-tag.py \ - build_scripts/requirements3.8.txt \ - build_scripts/requirements3.9.txt \ - build_scripts/requirements3.10.txt \ - build_scripts/requirements3.11.txt \ - build_scripts/requirements-base-tools.txt \ - /build_scripts/ -COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ -RUN manylinux-entrypoint /build_scripts/finalize.sh && rm -rf /build_scripts - -ENV SSL_CERT_FILE=/opt/_internal/certs.pem - -CMD ["/bin/bash"] - -#Build manylinux2014 docker image end -ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 -#Add our own dependencies -ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts - -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -WORKDIR /home/$BUILD_USER -USER $BUILD_USER -ENV PATH /usr/local/dotnet:$PATH diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 deleted file mode 100644 index 003bb2324c049..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 +++ /dev/null @@ -1,173 +0,0 @@ -ARG BASEIMAGE=nvidia/cuda:11.6.1-cudnn8-devel-centos7 -ARG POLICY=manylinux2014 -ARG PLATFORM=x86_64 -ARG DEVTOOLSET_ROOTPATH= -ARG LD_LIBRARY_PATH_ARG= -ARG PREPEND_PATH= - -#We need CUDA, TensorRT and manylinux. 
But the CUDA Toolkit End User License Agreement says NVIDIA CUDA Driver Libraries(libcuda.so, libnvidia-ptxjitcompiler.so) are only distributable in applications that meet this criteria: -#1. The application was developed starting from a NVIDIA CUDA container obtained from Docker Hub or the NVIDIA GPU Cloud, and -#2. The resulting application is packaged as a Docker container and distributed to users on Docker Hub or the NVIDIA GPU Cloud only. -#So we use CUDA as the base image then add manylinux and TensorRT on top of it. - -#Build manylinux2014 docker image begin -FROM $BASEIMAGE AS runtime_base -ARG POLICY -ARG PLATFORM -ARG DEVTOOLSET_ROOTPATH -ARG LD_LIBRARY_PATH_ARG -ARG PREPEND_PATH -LABEL maintainer="The ManyLinux project" - -ENV AUDITWHEEL_POLICY=${POLICY} AUDITWHEEL_ARCH=${PLATFORM} AUDITWHEEL_PLAT=${POLICY}_${PLATFORM} -ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 LANGUAGE=en_US.UTF-8 -ENV DEVTOOLSET_ROOTPATH=${DEVTOOLSET_ROOTPATH} -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG} -ENV PATH=${PREPEND_PATH}${PATH} -ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig - -# first copy the fixup mirrors script, keep the script around -COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors - -# setup entrypoint, this will wrap commands with `linux32` with i686 images -COPY build_scripts/install-entrypoint.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ - -RUN /build_scripts/install-entrypoint.sh && rm -rf /build_scripts -COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint -ENTRYPOINT ["manylinux-entrypoint"] - -COPY build_scripts/install-runtime-packages.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ - -COPY build_scripts/build_utils.sh /build_scripts/ - -COPY build_scripts/install-autoconf.sh /build_scripts/ -RUN export AUTOCONF_ROOT=autoconf-2.71 && \ - export AUTOCONF_HASH=431075ad0bf529ef13cb41e9042c542381103e80015686222b8a9d4abef42a1c && \ - export AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf && \ - manylinux-entrypoint /build_scripts/install-autoconf.sh - -COPY build_scripts/install-automake.sh /build_scripts/ -RUN export AUTOMAKE_ROOT=automake-1.16.5 && \ - export AUTOMAKE_HASH=07bd24ad08a64bc17250ce09ec56e921d6343903943e99ccf63bbf0705e34605 && \ - export AUTOMAKE_DOWNLOAD_URL=http://ftp.gnu.org/gnu/automake && \ - manylinux-entrypoint /build_scripts/install-automake.sh - -COPY build_scripts/install-libtool.sh /build_scripts/ -RUN export LIBTOOL_ROOT=libtool-2.4.7 && \ - export LIBTOOL_HASH=04e96c2404ea70c590c546eba4202a4e12722c640016c12b9b2f1ce3d481e9a8 && \ - export LIBTOOL_DOWNLOAD_URL=http://ftp.gnu.org/gnu/libtool && \ - manylinux-entrypoint /build_scripts/install-libtool.sh - -COPY build_scripts/install-libxcrypt.sh /build_scripts/ -RUN export LIBXCRYPT_VERSION=4.4.28 && \ - export LIBXCRYPT_HASH=db7e37901969cb1d1e8020cb73a991ef81e48e31ea5b76a101862c806426b457 && \ - export LIBXCRYPT_DOWNLOAD_URL=https://github.com/besser82/libxcrypt/archive && \ - export PERL_ROOT=perl-5.34.0 && \ - export PERL_HASH=551efc818b968b05216024fb0b727ef2ad4c100f8cb6b43fab615fa78ae5be9a && \ - export PERL_DOWNLOAD_URL=https://www.cpan.org/src/5.0 && \ - manylinux-entrypoint /build_scripts/install-libxcrypt.sh - -FROM runtime_base AS build_base -COPY build_scripts/install-build-packages.sh /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-build-packages.sh - - -FROM build_base AS build_git -COPY build_scripts/build-git.sh /build_scripts/ -RUN export 
GIT_ROOT=git-2.36.2 && \ - export GIT_HASH=6dc2cdea5fb23d823ba4871cc23222c1db31dfbb6d6c6ff74c4128700df57c68 && \ - export GIT_DOWNLOAD_URL=https://www.kernel.org/pub/software/scm/git && \ - manylinux-entrypoint /build_scripts/build-git.sh - - -FROM build_base AS build_cpython -COPY build_scripts/build-sqlite3.sh /build_scripts/ -RUN export SQLITE_AUTOCONF_ROOT=sqlite-autoconf-3390200 && \ - export SQLITE_AUTOCONF_HASH=852be8a6183a17ba47cee0bbff7400b7aa5affd283bf3beefc34fcd088a239de && \ - export SQLITE_AUTOCONF_DOWNLOAD_URL=https://www.sqlite.org/2022 && \ - manylinux-entrypoint /build_scripts/build-sqlite3.sh - -COPY build_scripts/build-openssl.sh /build_scripts/ -RUN export OPENSSL_ROOT=openssl-1.1.1q && \ - export OPENSSL_HASH=d7939ce614029cdff0b6c20f0e2e5703158a489a72b2507b8bd51bf8c8fd10ca && \ - export OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source && \ - manylinux-entrypoint /build_scripts/build-openssl.sh - -COPY build_scripts/build-cpython.sh /build_scripts/ - - -FROM build_cpython AS build_cpython38 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.8.13 - - -FROM build_cpython AS build_cpython39 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.9.13 - - -FROM build_cpython AS build_cpython310 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.10.5 - -FROM build_cpython AS build_cpython311 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 - -FROM build_cpython AS all_python -COPY build_scripts/install-pypy.sh \ - build_scripts/pypy.sha256 \ - build_scripts/finalize-python.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.8 7.3.9 -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.9 7.3.9 -COPY --from=build_cpython38 /opt/_internal /opt/_internal/ -COPY --from=build_cpython39 /opt/_internal /opt/_internal/ -COPY --from=build_cpython310 /opt/_internal /opt/_internal/ -COPY --from=build_cpython311 /opt/_internal /opt/_internal/ -RUN manylinux-entrypoint /build_scripts/finalize-python.sh - - -FROM runtime_base -COPY --from=build_git /manylinux-rootfs / -COPY --from=build_cpython /manylinux-rootfs / -COPY --from=all_python /opt/_internal /opt/_internal/ -COPY build_scripts/finalize.sh \ - build_scripts/python-tag-abi-tag.py \ - build_scripts/requirements3.8.txt \ - build_scripts/requirements3.9.txt \ - build_scripts/requirements3.10.txt \ - build_scripts/requirements3.11.txt \ - build_scripts/requirements-base-tools.txt \ - /build_scripts/ -COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ -RUN manylinux-entrypoint /build_scripts/finalize.sh && rm -rf /build_scripts - -ENV SSL_CERT_FILE=/opt/_internal/certs.pem - -CMD ["/bin/bash"] - -#Build manylinux2014 docker image end - -#Install TensorRT 8.4.1.5 -#RUN yum install -y wget -RUN v="8.4.1-1.cuda11.6" &&\ - yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo &&\ - yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} \ - libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} -ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 -#Add our own dependencies -ADD scripts /tmp/scripts -RUN 
cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts - -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -WORKDIR /home/$BUILD_USER -USER $BUILD_USER -ENV PATH /usr/local/dotnet:$PATH diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 deleted file mode 100644 index 0337ffc5e00a0..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 +++ /dev/null @@ -1,173 +0,0 @@ -ARG BASEIMAGE=nvidia/cuda:11.6.1-cudnn8-devel-centos7 -ARG POLICY=manylinux2014 -ARG PLATFORM=x86_64 -ARG DEVTOOLSET_ROOTPATH= -ARG LD_LIBRARY_PATH_ARG= -ARG PREPEND_PATH= - -#We need CUDA, TensorRT and manylinux. But the CUDA Toolkit End User License Agreement says NVIDIA CUDA Driver Libraries(libcuda.so, libnvidia-ptxjitcompiler.so) are only distributable in applications that meet this criteria: -#1. The application was developed starting from a NVIDIA CUDA container obtained from Docker Hub or the NVIDIA GPU Cloud, and -#2. The resulting application is packaged as a Docker container and distributed to users on Docker Hub or the NVIDIA GPU Cloud only. -#So we use CUDA as the base image then add manylinux and TensorRT on top of it. - -#Build manylinux2014 docker image begin -FROM $BASEIMAGE AS runtime_base -ARG POLICY -ARG PLATFORM -ARG DEVTOOLSET_ROOTPATH -ARG LD_LIBRARY_PATH_ARG -ARG PREPEND_PATH -LABEL maintainer="The ManyLinux project" - -ENV AUDITWHEEL_POLICY=${POLICY} AUDITWHEEL_ARCH=${PLATFORM} AUDITWHEEL_PLAT=${POLICY}_${PLATFORM} -ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 LANGUAGE=en_US.UTF-8 -ENV DEVTOOLSET_ROOTPATH=${DEVTOOLSET_ROOTPATH} -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG} -ENV PATH=${PREPEND_PATH}${PATH} -ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig - -# first copy the fixup mirrors script, keep the script around -COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors - -# setup entrypoint, this will wrap commands with `linux32` with i686 images -COPY build_scripts/install-entrypoint.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ - -RUN /build_scripts/install-entrypoint.sh && rm -rf /build_scripts -COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint -ENTRYPOINT ["manylinux-entrypoint"] - -COPY build_scripts/install-runtime-packages.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ - -COPY build_scripts/build_utils.sh /build_scripts/ - -COPY build_scripts/install-autoconf.sh /build_scripts/ -RUN export AUTOCONF_ROOT=autoconf-2.71 && \ - export AUTOCONF_HASH=431075ad0bf529ef13cb41e9042c542381103e80015686222b8a9d4abef42a1c && \ - export AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf && \ - manylinux-entrypoint /build_scripts/install-autoconf.sh - -COPY build_scripts/install-automake.sh /build_scripts/ -RUN export AUTOMAKE_ROOT=automake-1.16.5 && \ - export AUTOMAKE_HASH=07bd24ad08a64bc17250ce09ec56e921d6343903943e99ccf63bbf0705e34605 && \ - export AUTOMAKE_DOWNLOAD_URL=http://ftp.gnu.org/gnu/automake && \ - manylinux-entrypoint /build_scripts/install-automake.sh - -COPY build_scripts/install-libtool.sh /build_scripts/ -RUN export LIBTOOL_ROOT=libtool-2.4.7 && \ - export LIBTOOL_HASH=04e96c2404ea70c590c546eba4202a4e12722c640016c12b9b2f1ce3d481e9a8 && \ - export 
LIBTOOL_DOWNLOAD_URL=http://ftp.gnu.org/gnu/libtool && \ - manylinux-entrypoint /build_scripts/install-libtool.sh - -COPY build_scripts/install-libxcrypt.sh /build_scripts/ -RUN export LIBXCRYPT_VERSION=4.4.28 && \ - export LIBXCRYPT_HASH=db7e37901969cb1d1e8020cb73a991ef81e48e31ea5b76a101862c806426b457 && \ - export LIBXCRYPT_DOWNLOAD_URL=https://github.com/besser82/libxcrypt/archive && \ - export PERL_ROOT=perl-5.34.0 && \ - export PERL_HASH=551efc818b968b05216024fb0b727ef2ad4c100f8cb6b43fab615fa78ae5be9a && \ - export PERL_DOWNLOAD_URL=https://www.cpan.org/src/5.0 && \ - manylinux-entrypoint /build_scripts/install-libxcrypt.sh - -FROM runtime_base AS build_base -COPY build_scripts/install-build-packages.sh /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-build-packages.sh - - -FROM build_base AS build_git -COPY build_scripts/build-git.sh /build_scripts/ -RUN export GIT_ROOT=git-2.36.2 && \ - export GIT_HASH=6dc2cdea5fb23d823ba4871cc23222c1db31dfbb6d6c6ff74c4128700df57c68 && \ - export GIT_DOWNLOAD_URL=https://www.kernel.org/pub/software/scm/git && \ - manylinux-entrypoint /build_scripts/build-git.sh - - -FROM build_base AS build_cpython -COPY build_scripts/build-sqlite3.sh /build_scripts/ -RUN export SQLITE_AUTOCONF_ROOT=sqlite-autoconf-3390200 && \ - export SQLITE_AUTOCONF_HASH=852be8a6183a17ba47cee0bbff7400b7aa5affd283bf3beefc34fcd088a239de && \ - export SQLITE_AUTOCONF_DOWNLOAD_URL=https://www.sqlite.org/2022 && \ - manylinux-entrypoint /build_scripts/build-sqlite3.sh - -COPY build_scripts/build-openssl.sh /build_scripts/ -RUN export OPENSSL_ROOT=openssl-1.1.1q && \ - export OPENSSL_HASH=d7939ce614029cdff0b6c20f0e2e5703158a489a72b2507b8bd51bf8c8fd10ca && \ - export OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source && \ - manylinux-entrypoint /build_scripts/build-openssl.sh - -COPY build_scripts/build-cpython.sh /build_scripts/ - - -FROM build_cpython AS build_cpython38 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.8.13 - - -FROM build_cpython AS build_cpython39 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.9.13 - - -FROM build_cpython AS build_cpython310 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.10.5 - -FROM build_cpython AS build_cpython311 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 - -FROM build_cpython AS all_python -COPY build_scripts/install-pypy.sh \ - build_scripts/pypy.sha256 \ - build_scripts/finalize-python.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.8 7.3.9 -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.9 7.3.9 -COPY --from=build_cpython38 /opt/_internal /opt/_internal/ -COPY --from=build_cpython39 /opt/_internal /opt/_internal/ -COPY --from=build_cpython310 /opt/_internal /opt/_internal/ -COPY --from=build_cpython311 /opt/_internal /opt/_internal/ -RUN manylinux-entrypoint /build_scripts/finalize-python.sh - - -FROM runtime_base -COPY --from=build_git /manylinux-rootfs / -COPY --from=build_cpython /manylinux-rootfs / -COPY --from=all_python /opt/_internal /opt/_internal/ -COPY build_scripts/finalize.sh \ - build_scripts/python-tag-abi-tag.py \ - build_scripts/requirements3.8.txt \ - build_scripts/requirements3.9.txt \ - 
build_scripts/requirements3.10.txt \ - build_scripts/requirements3.11.txt \ - build_scripts/requirements-base-tools.txt \ - /build_scripts/ -COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ -RUN manylinux-entrypoint /build_scripts/finalize.sh && rm -rf /build_scripts - -ENV SSL_CERT_FILE=/opt/_internal/certs.pem - -CMD ["/bin/bash"] - -#Build manylinux2014 docker image end - -#Install TensorRT 8.5.1.7 -#RUN yum install -y wget -RUN v="8.5.1-1.cuda11.8" &&\ - yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo &&\ - yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} \ - libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} -ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 -#Add our own dependencies -ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts - -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -WORKDIR /home/$BUILD_USER -USER $BUILD_USER -ENV PATH /usr/local/dotnet:$PATH diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 deleted file mode 100644 index 70765c667ab8e..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 +++ /dev/null @@ -1,181 +0,0 @@ -# This file is deprecated and will be replaced by tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda -ARG BASEIMAGE=nvidia/cuda:11.8.0-devel-ubi8 -ARG POLICY=manylinux_2_28 -ARG PLATFORM=x86_64 -ARG DEVTOOLSET_ROOTPATH= -ARG LD_LIBRARY_PATH_ARG= -ARG PREPEND_PATH= - -#We need CUDA, TensorRT and manylinux. But the CUDA Toolkit End User License Agreement says NVIDIA CUDA Driver Libraries(libcuda.so, libnvidia-ptxjitcompiler.so) are only distributable in applications that meet this criteria: -#1. The application was developed starting from a NVIDIA CUDA container obtained from Docker Hub or the NVIDIA GPU Cloud, and -#2. The resulting application is packaged as a Docker container and distributed to users on Docker Hub or the NVIDIA GPU Cloud only. -#So we use CUDA as the base image then add manylinux and TensorRT on top of it. 
- -#Build manylinux2014 docker image begin -FROM $BASEIMAGE AS runtime_base -ARG POLICY -ARG PLATFORM -ARG DEVTOOLSET_ROOTPATH -ARG LD_LIBRARY_PATH_ARG -ARG PREPEND_PATH -LABEL maintainer="The ManyLinux project" - -ENV AUDITWHEEL_POLICY=${POLICY} AUDITWHEEL_ARCH=${PLATFORM} AUDITWHEEL_PLAT=${POLICY}_${PLATFORM} -ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 LANGUAGE=en_US.UTF-8 -ENV DEVTOOLSET_ROOTPATH=${DEVTOOLSET_ROOTPATH} -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG} -ENV PATH=${PREPEND_PATH}${PATH} -ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig - -# first copy the fixup mirrors script, keep the script around -COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors - -# setup entrypoint, this will wrap commands with `linux32` with i686 images -COPY build_scripts/install-entrypoint.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ - -RUN /build_scripts/install-entrypoint.sh && rm -rf /build_scripts -COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint -ENTRYPOINT ["manylinux-entrypoint"] - -COPY build_scripts/install-runtime-packages.sh \ - build_scripts/build_utils.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ - -COPY build_scripts/build_utils.sh /build_scripts/ - -COPY build_scripts/install-autoconf.sh /build_scripts/ -RUN export AUTOCONF_ROOT=autoconf-2.71 && \ - export AUTOCONF_HASH=431075ad0bf529ef13cb41e9042c542381103e80015686222b8a9d4abef42a1c && \ - export AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf && \ - manylinux-entrypoint /build_scripts/install-autoconf.sh - -COPY build_scripts/install-automake.sh /build_scripts/ -RUN export AUTOMAKE_ROOT=automake-1.16.5 && \ - export AUTOMAKE_HASH=07bd24ad08a64bc17250ce09ec56e921d6343903943e99ccf63bbf0705e34605 && \ - export AUTOMAKE_DOWNLOAD_URL=http://ftp.gnu.org/gnu/automake && \ - manylinux-entrypoint /build_scripts/install-automake.sh - -COPY build_scripts/install-libtool.sh /build_scripts/ -RUN export LIBTOOL_ROOT=libtool-2.4.7 && \ - export LIBTOOL_HASH=04e96c2404ea70c590c546eba4202a4e12722c640016c12b9b2f1ce3d481e9a8 && \ - export LIBTOOL_DOWNLOAD_URL=http://ftp.gnu.org/gnu/libtool && \ - manylinux-entrypoint /build_scripts/install-libtool.sh - -COPY build_scripts/install-libxcrypt.sh /build_scripts/ -RUN export LIBXCRYPT_VERSION=4.4.28 && \ - export LIBXCRYPT_HASH=db7e37901969cb1d1e8020cb73a991ef81e48e31ea5b76a101862c806426b457 && \ - export LIBXCRYPT_DOWNLOAD_URL=https://github.com/besser82/libxcrypt/archive && \ - export PERL_ROOT=perl-5.34.0 && \ - export PERL_HASH=551efc818b968b05216024fb0b727ef2ad4c100f8cb6b43fab615fa78ae5be9a && \ - export PERL_DOWNLOAD_URL=https://www.cpan.org/src/5.0 && \ - manylinux-entrypoint /build_scripts/install-libxcrypt.sh - -FROM runtime_base AS build_base -COPY build_scripts/install-build-packages.sh /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-build-packages.sh - - -FROM build_base AS build_git -COPY build_scripts/build-git.sh /build_scripts/ -RUN export GIT_ROOT=git-2.36.2 && \ - export GIT_HASH=6dc2cdea5fb23d823ba4871cc23222c1db31dfbb6d6c6ff74c4128700df57c68 && \ - export GIT_DOWNLOAD_URL=https://www.kernel.org/pub/software/scm/git && \ - manylinux-entrypoint /build_scripts/build-git.sh - - -FROM build_base AS build_cpython -COPY build_scripts/build-sqlite3.sh /build_scripts/ -RUN export SQLITE_AUTOCONF_ROOT=sqlite-autoconf-3390200 && \ - export SQLITE_AUTOCONF_HASH=852be8a6183a17ba47cee0bbff7400b7aa5affd283bf3beefc34fcd088a239de && \ - export 
SQLITE_AUTOCONF_DOWNLOAD_URL=https://www.sqlite.org/2022 && \ - manylinux-entrypoint /build_scripts/build-sqlite3.sh - -COPY build_scripts/build-openssl.sh /build_scripts/ -RUN export OPENSSL_ROOT=openssl-1.1.1q && \ - export OPENSSL_HASH=d7939ce614029cdff0b6c20f0e2e5703158a489a72b2507b8bd51bf8c8fd10ca && \ - export OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source && \ - manylinux-entrypoint /build_scripts/build-openssl.sh - -COPY build_scripts/build-cpython.sh /build_scripts/ - - -FROM build_cpython AS build_cpython37 -COPY build_scripts/cpython-pubkeys.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.7.13 - - -FROM build_cpython AS build_cpython38 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.8.13 - - -FROM build_cpython AS build_cpython39 -COPY build_scripts/ambv-pubkey.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.9.13 - - -FROM build_cpython AS build_cpython310 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.10.5 - -FROM build_cpython AS build_cpython311 -COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt -RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.0b5 - -FROM build_cpython AS all_python -COPY build_scripts/install-pypy.sh \ - build_scripts/pypy.sha256 \ - build_scripts/finalize-python.sh \ - /build_scripts/ -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.7 7.3.9 -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.8 7.3.9 -RUN manylinux-entrypoint /build_scripts/install-pypy.sh 3.9 7.3.9 -COPY --from=build_cpython37 /opt/_internal /opt/_internal/ -COPY --from=build_cpython38 /opt/_internal /opt/_internal/ -COPY --from=build_cpython39 /opt/_internal /opt/_internal/ -COPY --from=build_cpython310 /opt/_internal /opt/_internal/ -COPY --from=build_cpython311 /opt/_internal /opt/_internal/ -RUN manylinux-entrypoint /build_scripts/finalize-python.sh - - -FROM runtime_base -COPY --from=build_git /manylinux-rootfs / -COPY --from=build_cpython /manylinux-rootfs / -COPY --from=all_python /opt/_internal /opt/_internal/ -COPY build_scripts/finalize.sh \ - build_scripts/python-tag-abi-tag.py \ - build_scripts/requirements3.7.txt \ - build_scripts/requirements3.8.txt \ - build_scripts/requirements3.9.txt \ - build_scripts/requirements3.10.txt \ - build_scripts/requirements3.11.txt \ - build_scripts/requirements-base-tools.txt \ - /build_scripts/ -COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ -RUN manylinux-entrypoint /build_scripts/finalize.sh && rm -rf /build_scripts - -ENV SSL_CERT_FILE=/opt/_internal/certs.pem - -CMD ["/bin/bash"] - -#Build manylinux2014 docker image end - -#Install TensorRT 8.6.1.6 -RUN v="8.6.1.6-1.cuda11.8" && CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p') \ - && dnf -y install libcudnn8-devel-*cuda$CUDA_VERSION* libcudnn8-*cuda$CUDA_VERSION* libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-vc-plugin8-${v}\ - libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} libnvinfer-vc-plugin-devel-${v} libnvinfer-headers-devel-${v} libnvinfer-headers-plugin-devel-${v} -ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 -#Add our own dependencies -ADD scripts /tmp/scripts -RUN cd /tmp/scripts && 
/tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts - -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -WORKDIR /home/$BUILD_USER -USER $BUILD_USER -ENV PATH /usr/local/dotnet:$PATH -ENV CUDA_MODULE_LOADING "LAZY" diff --git a/tools/ci_build/github/windows/setup_env_cuda_11.bat b/tools/ci_build/github/windows/setup_env_cuda.bat similarity index 53% rename from tools/ci_build/github/windows/setup_env_cuda_11.bat rename to tools/ci_build/github/windows/setup_env_cuda.bat index 1308e43a4f6db..96569cbe0f648 100644 --- a/tools/ci_build/github/windows/setup_env_cuda_11.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -6,4 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ { } else { set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH% } +@REM The default version is still CUDA v11.8, because the CUDA v12.2 directories are appended to PATH after it +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ { + set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 +} else { + set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 +} set GRADLE_OPTS=-Dorg.gradle.daemon=false
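For reference, a minimal sketch of how the TRT_VERSION build argument used in the conditional TensorRT install step above is exercised at image build time. It assumes TRT_VERSION is declared as an ARG earlier in the Dockerfile (that declaration is not shown in this hunk); the Dockerfile path, image tags, and version string below are illustrative placeholders rather than values taken from this diff.

# Build with TensorRT packages; the version string must match an available package version (8.6.1.6-1.cuda11.8 is an assumed example)
docker build --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 -t ort-manylinux-trt -f tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda .
# Build without TensorRT; an empty TRT_VERSION makes the RUN step take the else branch and skip the dnf install
docker build --build-arg TRT_VERSION= -t ort-manylinux -f tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda .

Similarly, because setup_env_cuda.bat appends the CUDA v12.2 directories to the end of PATH, the CUDA v11.8 entries added earlier in the script take precedence on agents that have both toolkits installed.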